Commit b5dba8a by jameshuntercarter
Parent(s): ed39a03

Upload 24 files
Files changed:
- .gitattributes +1 -0
- LICENSE +21 -0
- args.py +10 -0
- bark_hubert_quantizer/__init__.py +0 -0
- bark_hubert_quantizer/customtokenizer.py +200 -0
- bark_hubert_quantizer/hubert_manager.py +33 -0
- bark_hubert_quantizer/pre_kmeans_hubert.py +106 -0
- cog.yaml +20 -0
- colab_notebook.ipynb +202 -0
- data/.DS_Store +0 -0
- data/models/.DS_Store +0 -0
- data/models/hubert/hubert_base_ls960.pt +3 -0
- data/models/hubert/quantifier_V1_hubert_base_ls960_23.pth +3 -0
- data/models/hubert/quantifier_hubert_base_ls960_14.pth +3 -0
- examples/biden_example.mov +0 -0
- install_hubert.py +28 -0
- notebook.ipynb +180 -0
- predict.py +87 -0
- prepare.py +88 -0
- process.py +22 -0
- readme.md +108 -0
- requirements.txt +8 -0
- sample-speaker.wav +3 -0
- setup.py +17 -0
- test_hubert.py +23 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+sample-speaker.wav filter=lfs diff=lfs merge=lfs -text
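
The added line is what `git lfs track "sample-speaker.wav"` appends: Git then stores the wav through Git LFS, which is why the binary files later in this commit appear as three-line LFS pointers (version, oid, size) rather than raw contents.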
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Mylo
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
args.py
ADDED
@@ -0,0 +1,10 @@
+from argparse import ArgumentParser
+
+parser = ArgumentParser()
+
+parser.add_argument('--path', required=True, help='The path containing your semantic tokens and wavs')
+parser.add_argument('--mode', required=True, help='The mode to use', choices=['prepare', 'prepare2', 'train', 'test'])
+parser.add_argument('--hubert-model', default='model/hubert/hubert_base_ls960.pt', help='The hubert model to use for preparing the data and later creation of semantic tokens.')
+parser.add_argument('--train-save-epochs', default=1, type=int, help='The amount of epochs to train before saving')
+
+args = parser.parse_args()
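
For reference, `process.py` (added later in this commit) drives these flags. A hypothetical invocation and how the parsed values surface:

```python
# Hypothetical invocation: python process.py --path Literature --mode prepare
from args import args  # parsing happens at import time

print(args.path)               # 'Literature'
print(args.mode)               # 'prepare'
print(args.hubert_model)       # defaults to 'model/hubert/hubert_base_ls960.pt'
print(args.train_save_epochs)  # defaults to 1
```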
bark_hubert_quantizer/__init__.py
ADDED
File without changes
bark_hubert_quantizer/customtokenizer.py
ADDED
@@ -0,0 +1,200 @@
+"""
+Custom tokenizer model.
+Author: https://www.github.com/gitmylo/
+License: MIT
+"""
+
+import json
+import os.path
+from zipfile import ZipFile
+
+import numpy
+import torch
+from torch import nn, optim
+from torch.serialization import MAP_LOCATION
+
+
+class CustomTokenizer(nn.Module):
+    def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
+        super(CustomTokenizer, self).__init__()
+        next_size = input_size
+        if version == 0:
+            self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+            next_size = hidden_size
+        if version == 1:
+            self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+            self.intermediate = nn.Linear(hidden_size, 4096)
+            next_size = 4096
+
+        self.fc = nn.Linear(next_size, output_size)
+        self.softmax = nn.LogSoftmax(dim=1)
+        self.optimizer: optim.Optimizer = None
+        self.lossfunc = nn.CrossEntropyLoss()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.output_size = output_size
+        self.version = version
+
+    def forward(self, x):
+        x, _ = self.lstm(x)
+        if self.version == 1:
+            x = self.intermediate(x)
+        x = self.fc(x)
+        x = self.softmax(x)
+        return x
+
+    @torch.no_grad()
+    def get_token(self, x):
+        """
+        Used to get the semantic tokens for the given input features.
+        :param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
+        :return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
+        """
+        return torch.argmax(self(x), dim=1)
+
+    def prepare_training(self):
+        self.optimizer = optim.Adam(self.parameters(), 0.001)
+
+    def train_step(self, x_train, y_train, log_loss=False):
+        # y_train = y_train[:-1]
+        # y_train = y_train[1:]
+
+        optimizer = self.optimizer
+        lossfunc = self.lossfunc
+        # Zero the gradients
+        self.zero_grad()
+
+        # Forward pass
+        y_pred = self(x_train)
+
+        y_train_len = len(y_train)
+        y_pred_len = y_pred.shape[0]
+
+        if y_train_len > y_pred_len:
+            diff = y_train_len - y_pred_len
+            y_train = y_train[diff:]
+        elif y_train_len < y_pred_len:
+            diff = y_pred_len - y_train_len
+            y_pred = y_pred[:-diff, :]
+
+        y_train_hot = torch.zeros(len(y_train), self.output_size)
+        y_train_hot[range(len(y_train)), y_train] = 1
+        y_train_hot = y_train_hot.to('cuda')
+
+        # Calculate the loss
+        loss = lossfunc(y_pred, y_train_hot)
+
+        # Print loss
+        if log_loss:
+            print('Loss', loss.item())
+
+        # Backward pass
+        loss.backward()
+
+        # Update the weights
+        optimizer.step()
+
+    def save(self, path):
+        info_path = '.'.join(os.path.basename(path).split('.')[:-1]) + '/.info'
+        torch.save(self.state_dict(), path)
+        data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
+        with ZipFile(path, 'a') as model_zip:
+            model_zip.writestr(info_path, data_from_model.save())
+            model_zip.close()
+
+    @staticmethod
+    def load_from_checkpoint(path, map_location: MAP_LOCATION = None):
+        old = True
+        with ZipFile(path) as model_zip:
+            filesMatch = [file for file in model_zip.namelist() if file.endswith('/.info')]
+            file = filesMatch[0] if filesMatch else None
+            if file:
+                old = False
+                data_from_model = Data.load(model_zip.read(file).decode('utf-8'))
+            model_zip.close()
+        if old:
+            model = CustomTokenizer()
+        else:
+            model = CustomTokenizer(data_from_model.hidden_size, data_from_model.input_size, data_from_model.output_size, data_from_model.version)
+        model.load_state_dict(torch.load(path, map_location=map_location))
+        if map_location:
+            model = model.to(map_location)
+        return model
+
+
+class Data:
+    input_size: int
+    hidden_size: int
+    output_size: int
+    version: int
+
+    def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.output_size = output_size
+        self.version = version
+
+    @staticmethod
+    def load(string):
+        data = json.loads(string)
+        return Data(data['input_size'], data['hidden_size'], data['output_size'], data['version'])
+
+    def save(self):
+        data = {
+            'input_size': self.input_size,
+            'hidden_size': self.hidden_size,
+            'output_size': self.output_size,
+            'version': self.version,
+        }
+        return json.dumps(data)
+
+
+def auto_train(data_path, save_path='model.pth', load_model: str | None = None, save_epochs=1):
+    data_x, data_y = {}, {}
+
+    if load_model and os.path.isfile(load_model):
+        print('Loading model from', load_model)
+        model_training = CustomTokenizer.load_from_checkpoint(load_model, 'cuda')
+    else:
+        print('Creating new model.')
+        model_training = CustomTokenizer(version=1).to('cuda')
+    save_path = os.path.join(data_path, save_path)
+    base_save_path = '.'.join(save_path.split('.')[:-1])
+
+    sem_string = '_semantic.npy'
+    feat_string = '_semantic_features.npy'
+
+    ready = os.path.join(data_path, 'ready')
+    for input_file in os.listdir(ready):
+        full_path = os.path.join(ready, input_file)
+        try:
+            prefix = input_file.split("_")[0]
+            number = int(prefix)
+        except ValueError as e:
+            raise e
+        if input_file.endswith(sem_string):
+            data_y[number] = numpy.load(full_path)
+        elif input_file.endswith(feat_string):
+            data_x[number] = numpy.load(full_path)
+
+    model_training.prepare_training()
+    epoch = 1
+
+    while 1:
+        for i in range(save_epochs):
+            j = 0
+            for i in range(max(len(data_x), len(data_y))):
+                x = data_x.get(i)
+                y = data_y.get(i)
+                if x is None or y is None:
+                    print(f'The training data does not match. key={i}')
+                    continue
+                model_training.train_step(torch.tensor(x).to('cuda'), torch.tensor(y).to('cuda'), j % 50 == 0)  # Print loss every 50 steps
+                j += 1
+        save_p = save_path
+        save_p_2 = f'{base_save_path}_epoch_{epoch}.pth'
+        model_training.save(save_p)
+        model_training.save(save_p_2)
+        print(f'Epoch {epoch} completed')
+        epoch += 1
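
A minimal sketch of running the tokenizer on CPU (assumes a checkpoint produced by `save()`, e.g. the `tokenizer.pth` that `HuBERTManager` downloads; the random tensor stands in for real HuBERT features):

```python
import torch
from bark_hubert_quantizer.customtokenizer import CustomTokenizer

tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth', 'cpu')
features = torch.randn(100, 768)        # (N, input_size): one 768-dim HuBERT vector per frame
tokens = tokenizer.get_token(features)  # (N,): semantic token ids in 0..output_size-1
print(tokens.shape)
```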
bark_hubert_quantizer/hubert_manager.py
ADDED
@@ -0,0 +1,33 @@
+import os.path
+import shutil
+import urllib.request
+
+import huggingface_hub
+
+
+class HuBERTManager:
+    @staticmethod
+    def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'):
+        install_dir = os.path.join('data', 'models', 'hubert')
+        if not os.path.isdir(install_dir):
+            os.makedirs(install_dir, exist_ok=True)
+        install_file = os.path.join(install_dir, file_name)
+        if not os.path.isfile(install_file):
+            print('Downloading HuBERT base model')
+            urllib.request.urlretrieve(download_url, install_file)
+            print('Downloaded HuBERT')
+        return install_file
+
+    @staticmethod
+    def make_sure_tokenizer_installed(model: str = 'quantifier_hubert_base_ls960_14.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
+        install_dir = os.path.join('data', 'models', 'hubert')
+        if not os.path.isdir(install_dir):
+            os.makedirs(install_dir, exist_ok=True)
+        install_file = os.path.join(install_dir, local_file)
+        if not os.path.isfile(install_file):
+            print('Downloading HuBERT custom tokenizer')
+            huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
+            shutil.move(os.path.join(install_dir, model), install_file)
+            print('Downloaded tokenizer')
+        return install_file
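
Both helpers are idempotent: they download only when the target file is missing and always return the local path, so a sketch like this is safe to run repeatedly:

```python
from bark_hubert_quantizer.hubert_manager import HuBERTManager

hubert_path = HuBERTManager.make_sure_hubert_installed()        # data/models/hubert/hubert.pt
tokenizer_path = HuBERTManager.make_sure_tokenizer_installed()  # data/models/hubert/tokenizer.pth
print(hubert_path, tokenizer_path)
```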
bark_hubert_quantizer/pre_kmeans_hubert.py
ADDED
@@ -0,0 +1,106 @@
+"""
+Modified HuBERT model without kmeans.
+Original author: https://github.com/lucidrains/
+Modified by: https://www.github.com/gitmylo/
+License: MIT
+"""
+
+# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
+
+from pathlib import Path
+
+import torch
+from torch import nn
+from einops import pack, unpack
+
+import fairseq
+
+from torchaudio.functional import resample
+
+from audiolm_pytorch.utils import curtail_to_multiple
+
+import logging
+logging.root.setLevel(logging.ERROR)
+
+
+def exists(val):
+    return val is not None
+
+
+def default(val, d):
+    return val if exists(val) else d
+
+
+class CustomHubert(nn.Module):
+    """
+    checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
+    or you can train your own
+    """
+
+    def __init__(
+            self,
+            checkpoint_path,
+            target_sample_hz=16000,
+            seq_len_multiple_of=None,
+            output_layer=9,
+            device=None
+    ):
+        super().__init__()
+        self.target_sample_hz = target_sample_hz
+        self.seq_len_multiple_of = seq_len_multiple_of
+        self.output_layer = output_layer
+
+        if device is not None:
+            self.to(device)
+
+        model_path = Path(checkpoint_path)
+
+        assert model_path.exists(), f'path {checkpoint_path} does not exist'
+
+        checkpoint = torch.load(checkpoint_path, map_location=device)
+        load_model_input = {checkpoint_path: checkpoint}
+        model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
+
+        if device is not None:
+            model[0].to(device)
+
+        self.model = model[0]
+        self.model.eval()
+
+    @property
+    def groups(self):
+        return 1
+
+    @torch.no_grad()
+    def forward(
+            self,
+            wav_input,
+            flatten=True,
+            input_sample_hz=None
+    ):
+        device = wav_input.device
+
+        if exists(input_sample_hz):
+            wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
+
+        if exists(self.seq_len_multiple_of):
+            wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
+
+        embed = self.model(
+            wav_input,
+            features_only=True,
+            mask=False,  # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
+            output_layer=self.output_layer
+        )
+
+        embed, packed_shape = pack([embed['x']], '* d')
+
+        # codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
+
+        codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)  # .long()
+
+        if flatten:
+            return codebook_indices
+
+        codebook_indices, = unpack(codebook_indices, packed_shape, '*')
+        return codebook_indices
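
For orientation, a rough shape check (assumes the default checkpoint downloaded by `HuBERTManager`; HuBERT base emits one 768-dimensional vector per ~20 ms of 16 kHz audio):

```python
import torch
from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert

hubert = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt')
wav = torch.zeros(1, 16000)  # one second of silence, already at the 16 kHz target rate
features = hubert.forward(wav, input_sample_hz=16000)
print(features.shape)        # roughly (49, 768) with the default flatten=True
```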
cog.yaml
ADDED
@@ -0,0 +1,20 @@
+build:
+  gpu: true
+  cuda: "11.8"
+  python_version: "3.10"
+  python_packages:
+    - "audiolm-pytorch==1.1.4"
+    - "fairseq"
+    - "huggingface-hub"
+    - "sentencepiece"
+    - "transformers"
+    - "encodec"
+    - 'soundfile; platform_system == "Windows"'
+    - 'sox; platform_system != "Windows"'
+    - "tensorboardX"
+    - "torch"
+    - "torchvision"
+    - "torchaudio"
+    - "light-the-torch"
+
+predict: "predict.py:Predictor"
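
With this config, Cog builds a CUDA 11.8 container around `predict.py:Predictor`; locally, something like `cog predict -i speaker=@sample-speaker.wav` should run a prediction (a hypothetical invocation, assuming the [Cog CLI](https://github.com/replicate/cog) is installed).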
colab_notebook.ipynb
ADDED
@@ -0,0 +1,202 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "# Bark text-to-speech voice cloning.\n",
+    "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark).\n",
+    "(This version of the notebook is made to work on Google Colab, make sure your runtime hardware accelerator is set to GPU)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "# Google Colab: Clone the repository"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/\n",
+    "%cd bark-voice-cloning-HuBERT-quantizer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "## Install packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%pip install -r requirements.txt\n",
+    "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "## Load models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading HuBERT...\n",
+      "Loading Quantizer...\n",
+      "Loading Encodec...\n",
+      "Downloaded and loaded models!\n"
+     ]
+    }
+   ],
+   "source": [
+    "large_quant_model = False  # Use the larger pretrained model\n",
+    "device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
+    "\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import torchaudio\n",
+    "from encodec import EncodecModel\n",
+    "from encodec.utils import convert_audio\n",
+    "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
+    "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
+    "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
+    "\n",
+    "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
+    "\n",
+    "print('Loading HuBERT...')\n",
+    "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
+    "print('Loading Quantizer...')\n",
+    "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
+    "print('Loading Encodec...')\n",
+    "encodec_model = EncodecModel.encodec_model_24khz()\n",
+    "encodec_model.set_target_bandwidth(6.0)\n",
+    "encodec_model.to(device)\n",
+    "\n",
+    "print('Downloaded and loaded models!')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "## Load wav and create speaker history prompt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Extracting semantics...\n",
+      "Tokenizing semantics...\n",
+      "Creating coarse and fine prompts...\n",
+      "Done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "wav_file = 'speaker.wav'  # Put the path of the speaker you want to use here.\n",
+    "out_file = 'speaker.npz'  # Put the path to save the cloned speaker to here.\n",
+    "\n",
+    "wav, sr = torchaudio.load(wav_file)\n",
+    "\n",
+    "wav_hubert = wav.to(device)\n",
+    "\n",
+    "if wav_hubert.shape[0] == 2:  # Stereo to mono if needed\n",
+    "    wav_hubert = wav_hubert.mean(0, keepdim=True)\n",
+    "\n",
+    "print('Extracting semantics...')\n",
+    "semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
+    "print('Tokenizing semantics...')\n",
+    "semantic_tokens = quant_model.get_token(semantic_vectors)\n",
+    "print('Creating coarse and fine prompts...')\n",
+    "wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
+    "\n",
+    "wav = wav.to(device)\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    encoded_frames = encodec_model.encode(wav)\n",
+    "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()\n",
+    "\n",
+    "codes = codes.cpu()\n",
+    "semantic_tokens = semantic_tokens.cpu()\n",
+    "\n",
+    "np.savez(out_file,\n",
+    "         semantic_prompt=semantic_tokens,\n",
+    "         fine_prompt=codes,\n",
+    "         coarse_prompt=codes[:2, :]\n",
+    "         )\n",
+    "\n",
+    "print('Done!')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
data/.DS_Store
ADDED
Binary file (6.15 kB)
data/models/.DS_Store
ADDED
Binary file (6.15 kB)
data/models/hubert/hubert_base_ls960.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1703cf8d2cdc76f8c046f5f6a9bcd224e0e6caf4744cad1a1f4199c32cac8c8d
+size 1136468879
data/models/hubert/quantifier_V1_hubert_base_ls960_23.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d94c5dd646bcfe1a8bb470372f0004c189acf65d913831f3a6ed6414c9ba86f
+size 243656111
data/models/hubert/quantifier_hubert_base_ls960_14.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cf7eeab58835c5fc1cfbd3fd19c457fbd07859a5f036a6bfea4b6840716c1e7
+size 103981977
examples/biden_example.mov
ADDED
Binary file (73.7 kB)
install_hubert.py
ADDED
@@ -0,0 +1,28 @@
+# SETUP
+large_quant_model = False  # Use the larger pretrained model
+device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')
+
+import numpy as np
+import torch
+import torchaudio
+from encodec import EncodecModel
+from encodec.utils import convert_audio
+from bark_hubert_quantizer.hubert_manager import HuBERTManager
+from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+
+model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else (
+    'quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
+
+print('Loading HuBERT...')
+hubert_model = CustomHubert(
+    HuBERTManager.make_sure_hubert_installed(), device=device)
+print('Loading Quantizer...')
+quant_model = CustomTokenizer.load_from_checkpoint(
+    HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
+print('Loading Encodec...')
+encodec_model = EncodecModel.encodec_model_24khz()
+encodec_model.set_target_bandwidth(6.0)
+encodec_model.to(device)
+
+print('Downloaded and loaded models!')
notebook.ipynb
ADDED
@@ -0,0 +1,180 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "# Bark text-to-speech voice cloning.\n",
+    "Clone voices to create speaker history prompt files (.npz) for [bark text-to-speech](https://github.com/suno-ai/bark)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "## Install packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%pip install -r requirements.txt\n",
+    "%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "## Load models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-05-26 21:27:49 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading HuBERT...\n",
+      "Loading Quantizer...\n",
+      "Loading Encodec...\n",
+      "Downloaded and loaded models!\n"
+     ]
+    }
+   ],
+   "source": [
+    "large_quant_model = False  # Use the larger pretrained model\n",
+    "device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')\n",
+    "\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import torchaudio\n",
+    "from encodec import EncodecModel\n",
+    "from encodec.utils import convert_audio\n",
+    "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
+    "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
+    "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
+    "\n",
+    "model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')\n",
+    "\n",
+    "print('Loading HuBERT...')\n",
+    "hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)\n",
+    "print('Loading Quantizer...')\n",
+    "quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)\n",
+    "print('Loading Encodec...')\n",
+    "encodec_model = EncodecModel.encodec_model_24khz()\n",
+    "encodec_model.set_target_bandwidth(6.0)\n",
+    "encodec_model.to(device)\n",
+    "\n",
+    "print('Downloaded and loaded models!')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "## Load wav and create speaker history prompt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Extracting semantics...\n",
+      "Tokenizing semantics...\n",
+      "Creating coarse and fine prompts...\n",
+      "Done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "wav_file = 'speaker.wav'  # Put the path of the speaker you want to use here.\n",
+    "out_file = 'speaker.npz'  # Put the path to save the cloned speaker to here.\n",
+    "\n",
+    "wav, sr = torchaudio.load(wav_file)\n",
+    "\n",
+    "wav_hubert = wav.to(device)\n",
+    "\n",
+    "if wav_hubert.shape[0] == 2:  # Stereo to mono if needed\n",
+    "    wav_hubert = wav_hubert.mean(0, keepdim=True)\n",
+    "\n",
+    "print('Extracting semantics...')\n",
+    "semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)\n",
+    "print('Tokenizing semantics...')\n",
+    "semantic_tokens = quant_model.get_token(semantic_vectors)\n",
+    "print('Creating coarse and fine prompts...')\n",
+    "wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)\n",
+    "\n",
+    "wav = wav.to(device)\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    encoded_frames = encodec_model.encode(wav)\n",
+    "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()\n",
+    "\n",
+    "codes = codes.cpu()\n",
+    "semantic_tokens = semantic_tokens.cpu()\n",
+    "\n",
+    "np.savez(out_file,\n",
+    "         semantic_prompt=semantic_tokens,\n",
+    "         fine_prompt=codes,\n",
+    "         coarse_prompt=codes[:2, :]\n",
+    "         )\n",
+    "\n",
+    "print('Done!')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
predict.py
ADDED
@@ -0,0 +1,87 @@
+from typing import Optional
+from cog import BasePredictor, Input, Path, BaseModel
+
+
+class ModelOutput(BaseModel):
+    prompt_npz: Optional[Path]
+    audio_out: Path
+
+
+class Predictor(BasePredictor):
+
+    def setup(self):
+        """Load the model into memory to make running multiple predictions efficient"""
+
+    def predict(
+        self,
+        speaker: Path = Input(
+            description="Reference audio.", default=None),
+    ) -> ModelOutput:
+        """Run a single prediction on the model"""
+        # SETUP
+        large_quant_model = False  # Use the larger pretrained model
+        device = 'cuda'  # 'cuda', 'cpu', 'cuda:0', 0, -1, torch.device('cuda')
+
+        import numpy as np
+        import torch
+        import torchaudio
+        from encodec import EncodecModel
+        from encodec.utils import convert_audio
+        from bark_hubert_quantizer.hubert_manager import HuBERTManager
+        from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+        from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+
+        model = ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') if large_quant_model else (
+            'quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
+
+        print('Loading HuBERT...')
+        hubert_model = CustomHubert(
+            HuBERTManager.make_sure_hubert_installed(), device=device)
+        print('Loading Quantizer...')
+        quant_model = CustomTokenizer.load_from_checkpoint(
+            HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
+        print('Loading Encodec...')
+        encodec_model = EncodecModel.encodec_model_24khz()
+        encodec_model.set_target_bandwidth(6.0)
+        encodec_model.to(device)
+
+        print('Downloaded and loaded models!')
+        # PREDICT
+        # Put the path of the speaker you want to use here.
+        wav_file = speaker
+        # Put the path to save the cloned speaker to here.
+        out_file = 'speaker.npz'
+
+        wav, sr = torchaudio.load(wav_file)
+
+        wav_hubert = wav.to(device)
+
+        if wav_hubert.shape[0] == 2:  # Stereo to mono if needed
+            wav_hubert = wav_hubert.mean(0, keepdim=True)
+
+        print('Extracting semantics...')
+        semantic_vectors = hubert_model.forward(wav_hubert, input_sample_hz=sr)
+        print('Tokenizing semantics...')
+        semantic_tokens = quant_model.get_token(semantic_vectors)
+        print('Creating coarse and fine prompts...')
+        wav = convert_audio(wav, sr, encodec_model.sample_rate, 1).unsqueeze(0)
+
+        wav = wav.to(device)
+
+        with torch.no_grad():
+            encoded_frames = encodec_model.encode(wav)
+        codes = torch.cat([encoded[0]
+                           for encoded in encoded_frames], dim=-1).squeeze()
+
+        codes = codes.cpu()
+        semantic_tokens = semantic_tokens.cpu()
+
+        np.savez(out_file,
+                 semantic_prompt=semantic_tokens,
+                 fine_prompt=codes,
+                 coarse_prompt=codes[:2, :]
+                 )
+
+        print('Done!')
+
+        return ModelOutput(audio_out=Path('speaker.npz'))
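
The resulting `.npz` holds the three arrays Bark expects in a speaker prompt; a quick way to inspect it (the codebook counts are what Encodec's 24 kHz model produces at the 6.0 kbps target set above):

```python
import numpy as np

prompt = np.load('speaker.npz')
print(prompt['semantic_prompt'].shape)  # (N,)  semantic tokens from the quantizer
print(prompt['coarse_prompt'].shape)    # (2, T) first two Encodec codebooks
print(prompt['fine_prompt'].shape)      # (8, T) all Encodec codebooks at 6.0 kbps
```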
prepare.py
ADDED
@@ -0,0 +1,88 @@
+import os
+import shutil
+import zipfile
+
+import numpy
+import torchaudio
+
+from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+
+import torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def prepare(path):
+    """
+    Put all the training data in one folder
+    :param path: The path to the training data, with 2 subdirectories with zips, "semantic" and "wav", with equal pairs in both directories
+    """
+    path = os.path.abspath(path)
+    raw_data_paths = {
+        'semantic': os.path.join(path, 'semantic'),
+        'wav': os.path.join(path, 'wav')
+    }
+    prepared_path = os.path.join(path, 'prepared')
+
+    if not os.path.isdir(prepared_path):
+        os.mkdir(prepared_path)
+
+    offset = 0
+
+    for zip_file in os.listdir(raw_data_paths['semantic']):
+        print(f'Extracting {os.path.basename(zip_file)}')
+        offset = extract_files({
+            'semantic': os.path.join(raw_data_paths['semantic'], zip_file),
+            'wav': os.path.join(raw_data_paths['wav'], zip_file)
+        }, prepared_path, offset)
+
+
+def extract_files(zip_files: dict[str, str], out: str, start_offset: int = 0) -> int:
+    new_offset = start_offset
+    with zipfile.ZipFile(zip_files['semantic'], 'r') as semantic_zip:
+        with zipfile.ZipFile(zip_files['wav'], 'r') as wav_zip:
+            for file in semantic_zip.infolist():
+                for file2 in wav_zip.infolist():
+                    # Pair entries whose names (minus extension) match, case-insensitively
+                    if ''.join(file.filename.split('.')[:-1]).lower() == ''.join(file2.filename.split('.')[:-1]).lower():
+                        semantic_zip.extract(file, out)
+                        shutil.move(os.path.join(out, file.filename), os.path.join(out, f'{new_offset}_semantic.npy'))
+                        wav_zip.extract(file2, out)
+                        shutil.move(os.path.join(out, file2.filename), os.path.join(out, f'{new_offset}_wav.wav'))
+                        new_offset += 1
+            wav_zip.close()
+        semantic_zip.close()
+
+    return new_offset
+
+
+def prepare2(path, model):
+    prepared = os.path.join(path, 'prepared')
+    ready = os.path.join(path, 'ready')
+    hubert_model = CustomHubert(checkpoint_path=model, device=device)
+    if not os.path.isdir(ready):
+        os.mkdir(ready)
+
+    wav_string = '_wav.wav'
+    sem_string = '_semantic.npy'
+
+    for input_file in os.listdir(prepared):
+        input_path = os.path.join(prepared, input_file)
+        if input_file.endswith(wav_string):
+            file_num = int(input_file[:-len(wav_string)])
+            fname = f'{file_num}_semantic_features.npy'
+            print('Processing', input_file)
+            if os.path.isfile(os.path.join(ready, fname)):  # Skip files that were already processed
+                continue
+            wav, sr = torchaudio.load(input_path)
+            wav = wav.to(device)
+
+            if wav.shape[0] == 2:  # Stereo to mono if needed
+                wav = wav.mean(0, keepdim=True)
+
+            output = hubert_model.forward(wav, input_sample_hz=sr)
+            out_array = output.cpu().numpy()
+            numpy.save(os.path.join(ready, fname), out_array)
+        elif input_file.endswith(sem_string):
+            fname = os.path.join(ready, input_file)
+            if os.path.isfile(fname):
+                continue
+            shutil.copy(input_path, fname)
+    print('All set! We\'re ready to train!')
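
To make the two passes concrete, the directory layout they assume and produce (names illustrative, derived from the code above):

```
Literature/
├── semantic/   book1.zip ...                              # zips of *.npy semantic token arrays
├── wav/        book1.zip ...                              # zips of the matching *.wav files (same zip names)
├── prepared/   0_semantic.npy, 0_wav.wav, ...             # written by prepare()
└── ready/      0_semantic.npy, 0_semantic_features.npy, ...  # written by prepare2()
```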
process.py
ADDED
@@ -0,0 +1,22 @@
+import os.path
+
+from args import args
+from prepare import prepare, prepare2
+from test_hubert import test_hubert
+from bark_hubert_quantizer.customtokenizer import auto_train
+
+path = args.path
+mode = args.mode
+model = args.hubert_model
+
+if mode == 'prepare':
+    prepare(path)
+
+elif mode == 'prepare2':
+    prepare2(path, model)
+
+elif mode == 'train':
+    auto_train(path, load_model=os.path.join(path, 'model.pth'), save_epochs=args.train_save_epochs)
+
+elif mode == 'test':
+    test_hubert(path, model)
readme.md
ADDED
@@ -0,0 +1,108 @@
+# Bark voice cloning
+
+## Please read
+This code works on Python 3.10; I have not tested it on other versions, and some older versions will have issues.
+
+## Voice cloning with bark in high quality?
+It's possible now.
+
+https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/assets/36931363/516375e2-d699-44fe-a928-cd0411982049
+
+## How do I clone a voice?
+For developers:
+* [code examples on huggingface model page](https://huggingface.co/GitMylo/bark-voice-cloning)
+
+For everyone:
+* [audio-webui with bark and voice cloning](https://github.com/gitmylo/audio-webui)
+* [online huggingface voice cloning space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning)
+* [interactive python notebook](notebook.ipynb)
+
+## Voices cloned aren't very convincing, why are other people's cloned voices better than mine?
+Make sure these things are **NOT** in your voice input (in no particular order):
+* Noise (you can run a noise remover first)
+* Music (there are music-removal tools too, unless you want music in the background)
+* A cut-off at the end (this will cause the generation to try to continue the audio)
+* Under 1 second of training data (I personally suggest around 10 seconds for good potential, but I've had great results with 5 seconds as well)
+
+What makes for good prompt audio (in no particular order):
+* Clearly spoken
+* No weird background noises
+* Only one speaker
+* Audio which ends after a sentence ends
+* A regular/common voice (these usually have more success; the model can still clone complex voices, just not as well)
+* Around 10 seconds of data
+
+## Pretrained models
+### Official
+
+| Name | HuBERT Model | Quantizer Version | Epoch | Language | Dataset |
+|------|--------------|-------------------|-------|----------|---------|
+| [quantifier_hubert_base_ls960.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_hubert_base_ls960.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 0 | 3 | ENG | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
+| [quantifier_hubert_base_ls960_14.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_hubert_base_ls960_14.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 0 | 14 | ENG | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
+| [quantifier_V1_hubert_base_ls960_23.pth](https://huggingface.co/GitMylo/bark-voice-cloning/blob/main/quantifier_V1_hubert_base_ls960_23.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1 | 23 | ENG | [GitMylo/bark-semantic-training](https://huggingface.co/datasets/GitMylo/bark-semantic-training) |
+
+### Community
+
+| Author | Name | HuBERT Model | Quantizer Version | Epoch | Language | Dataset |
+|--------|------|--------------|-------------------|-------|----------|---------|
+| [HobisPL](https://github.com/HobisPL) | [polish-HuBERT-quantizer_8_epoch.pth](https://huggingface.co/Hobis/bark-voice-cloning-polish-HuBERT-quantizer/blob/main/polish-HuBERT-quantizer_8_epoch.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1 | 8 | POL | [Hobis/bark-polish-semantic-wav-training](https://huggingface.co/datasets/Hobis/bark-polish-semantic-wav-training) |
+| [C0untFloyd](https://github.com/C0untFloyd) | [german-HuBERT-quantizer_14_epoch.pth](https://huggingface.co/CountFloyd/bark-voice-cloning-german-HuBERT-quantizer/blob/main/german-HuBERT-quantizer_14_epoch.pth) | [HuBERT Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | 1 | 14 | GER | [CountFloyd/bark-german-semantic-wav-training](https://huggingface.co/datasets/CountFloyd/bark-german-semantic-wav-training) |
+
+## For developers: Implementing voice cloning in your bark projects
+* Simply copy the files from [this directory](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/tree/master/bark_hubert_quantizer) into your project.
+* The [hubert manager](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/blob/master/hubert/hubert_manager.py) contains methods to download HuBERT and the custom quantizer model.
+* Loading the [CustomHuBERT](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer/blob/master/hubert/pre_kmeans_hubert.py) should be pretty straightforward.
+* The [notebook](notebook.ipynb) contains code to run on CUDA or CPU, instead of just CPU.
+```python
+from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+import torchaudio
+
+# Load the HuBERT model;
+# checkpoint_path should work fine with data/models/hubert/hubert.pt for the default config
+hubert_model = CustomHubert(checkpoint_path='path/to/checkpoint')
+
+# Run the model to extract semantic features from an audio file, where wav is your audio file
+wav, sr = torchaudio.load('path/to/wav')  # This is where you load your wav, with soundfile or torchaudio for example
+
+if wav.shape[0] == 2:  # Stereo to mono if needed
+    wav = wav.mean(0, keepdim=True)
+
+semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
+```
+* Loading and running the [custom kmeans](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer)
+```python
+import torch
+from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+
+# Load the CustomTokenizer model from a checkpoint.
+# With the default config, you can use the pretrained model from huggingface.
+# With the default setup from HuBERTManager, this will be in data/models/hubert/tokenizer.pth
+tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth')  # Automatically uses the right layers
+
+# Process the semantic vectors from the previous HuBERT run (this works in batches, so you can send the entire HuBERT output)
+semantic_tokens = tokenizer.get_token(semantic_vectors)
+
+# Congratulations! You now have semantic tokens which can be used inside a speaker prompt file.
+```
+
+## How do I train it myself?
+Simply run the training commands.
+
+A simple way to create semantic data and wavs for training is with my script: [bark-data-gen](https://github.com/gitmylo/bark-data-gen). Keep in mind that creating the wavs will take about as long as, if not longer than, creating the semantics, so generating a dataset can take a while.
+
+For example, say you have a dataset of zips containing audio files, one zip for the semantics and one for the wav files, inside a folder called "Literature":
+
+* Run `process.py --path Literature --mode prepare` to extract all the data into one directory.
+* Run `process.py --path Literature --mode prepare2` to create HuBERT semantic vectors, ready for training.
+* Run `process.py --path Literature --mode train` to train.
+
+And when your model has trained enough, run `process.py --path Literature --mode test` to test the latest model.
+
+## Disclaimer
+I am not responsible for audio generated using semantics created by this model. Just don't use it for illegal purposes.
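
As a follow-up to the developer examples in the readme, a sketch of actually speaking with the cloned voice (hedged: this assumes the `bark` package is installed and that your Bark version accepts a path to an `.npz` file as `history_prompt`):

```python
from bark import SAMPLE_RATE, generate_audio
from scipy.io.wavfile import write as write_wav

# 'speaker.npz' is the prompt file created by notebook.ipynb or predict.py
audio_array = generate_audio('Hello, this is my cloned voice!', history_prompt='speaker.npz')
write_wav('cloned_output.wav', SAMPLE_RATE, audio_array)
```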
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+audiolm-pytorch==1.1.4
+fairseq
+huggingface-hub
+sentencepiece
+transformers
+encodec
+soundfile; platform_system == "Windows"
+sox; platform_system != "Windows"
sample-speaker.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba7c59faa843a892cb35733b5bdad5a6bd3eebadf70494d48694a06c2fefbad6
+size 1324090
setup.py
ADDED
@@ -0,0 +1,17 @@
+from setuptools import setup
+
+setup(
+    name='bark_hubert_quantizer',
+    version='0.0.4',
+    packages=['bark_hubert_quantizer'],
+    install_requires=[
+        'audiolm-pytorch==1.1.4',
+        'fairseq',
+        'huggingface-hub',
+        'sentencepiece',
+        'transformers',
+        'encodec',
+        'soundfile; platform_system == "Windows"',
+        'sox; platform_system != "Windows"'
+    ],
+)
test_hubert.py
ADDED
@@ -0,0 +1,23 @@
+import os
+
+import numpy
+import torch
+import torchaudio
+
+from bark_hubert_quantizer.customtokenizer import CustomTokenizer
+from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert
+
+
+def test_hubert(path: str, model: str = 'model/hubert/hubert_base_ls960.pt', tokenizer: str = 'model.pth'):
+    hubert_model = CustomHubert(checkpoint_path=model)
+    customtokenizer = CustomTokenizer.load_from_checkpoint(os.path.join(path, tokenizer))
+
+    wav, sr = torchaudio.load(os.path.join(path, 'test', 'wav.wav'))
+    original = numpy.load(os.path.join(path, 'test', 'semantic.npy'))
+
+    out = hubert_model.forward(wav, input_sample_hz=sr)
+    out_tokenized = customtokenizer.get_token(out)
+
+    # print(out.shape, out_tokenized.shape)
+    print(original[:-1], out_tokenized)
+    numpy.save(os.path.join(path, 'test', 'gen_semantic.npy'), out_tokenized)