build added
- build/lib/deep_voice_cloning/__init__.py +0 -0
- build/lib/deep_voice_cloning/cloning/__init__.py +0 -0
- build/lib/deep_voice_cloning/cloning/config.json +7 -0
- build/lib/deep_voice_cloning/cloning/model.py +57 -0
- build/lib/deep_voice_cloning/data/__init__.py +0 -0
- build/lib/deep_voice_cloning/data/collator.py +45 -0
- build/lib/deep_voice_cloning/data/dataset.py +63 -0
- build/lib/deep_voice_cloning/transcriber/__init__.py +0 -0
- build/lib/deep_voice_cloning/transcriber/config.json +7 -0
- build/lib/deep_voice_cloning/transcriber/model.py +22 -0
- scripts/output/audio.wav +0 -0
build/lib/deep_voice_cloning/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/cloning/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/cloning/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "en": {
+        "model_path": "microsoft/speecht5_tts",
+        "vocoder_name": "microsoft/speecht5_hifigan",
+        "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
+    }
+}
build/lib/deep_voice_cloning/cloning/model.py
ADDED
@@ -0,0 +1,57 @@
+import os
+import json
+from typing import Dict
+from pathlib import Path
+
+import numpy as np
+import torch
+from speechbrain.pretrained import EncoderClassifier
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+
+class CloningModel:
+    def __init__(self, config: Dict[str, Dict[str, str]] = None, lang: str = 'en'):
+        super(CloningModel, self).__init__()
+        if config is None:
+            self.speaker_embedding = None
+            with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+                self.config = json.load(f)[lang]
+        else:
+            self.config = config
+            self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
+        self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
+        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
+        self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
+        self.to(self.device)
+
+
+
+    def to(self, device: torch.device):
+        self.model = self.model.to(device)
+        self.vocoder = self.vocoder.to(device)
+
+    def save_pretrained(self, save_directory: str):
+        self.model.save_pretrained(save_directory)
+        self.processor.save_pretrained(save_directory)
+        torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
+
+    def forward(self, text: str) -> np.array:
+        # tokenize text
+        inputs = self.processor(text=text, return_tensors="pt")
+        # generate spectrogram using backbone model
+        spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
+                                                 self.speaker_embedding.to(self.device))
+        # decode spectrogram into waveform using vocoder
+        with torch.no_grad():
+            waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
+        return waveform_array
+
+    def create_speaker_embedding(self, waveform: torch.tensor) -> torch.tensor:
+        with torch.no_grad():
+            speaker_embeddings = self.speaker_model.encode_batch(waveform)
+            speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+            self.speaker_embedding = speaker_embeddings
+            speaker_embeddings = speaker_embeddings.squeeze()
+        return speaker_embeddings
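
For reference, a minimal usage sketch of the CloningModel class added above (not part of this commit). It assumes a fine-tuned model directory previously written by save_pretrained(); the file paths are illustrative.

# Usage sketch: "models/speecht5_cloned" and "output/audio.wav" are hypothetical paths,
# not values taken from this repository.
import soundfile as sf

from deep_voice_cloning.cloning.model import CloningModel

config = {
    "model_path": "models/speecht5_cloned",               # hypothetical directory written by save_pretrained()
    "vocoder_name": "microsoft/speecht5_hifigan",
    "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
}
model = CloningModel(config=config)                       # also loads speaker_embedding.pt from model_path
waveform = model.forward("Hello from the cloned voice.")
sf.write("output/audio.wav", waveform, samplerate=16000)  # SpeechT5's HiFi-GAN vocoder produces 16 kHz audio
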
build/lib/deep_voice_cloning/data/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/data/collator.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+from typing import Any, Dict, List, Union
+
+
+class TTSDataCollatorWithPadding:
+
+    def __init__(self, model, processor):
+        self.model = model
+        self.processor = processor
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+        label_features = [{"input_values": feature["labels"]} for feature in features]
+        speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+        # collate the inputs and targets into a batch
+        batch = self.processor.pad(
+            input_ids=input_ids,
+            labels=label_features,
+            return_tensors="pt",
+        )
+
+        # replace padding with -100 to ignore loss correctly
+        batch["labels"] = batch["labels"].masked_fill(
+            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
+        )
+
+        # not used during fine-tuning
+        del batch["decoder_attention_mask"]
+
+        # round down target lengths to multiple of reduction factor
+        if self.model.config.reduction_factor > 1:
+            target_lengths = torch.tensor([
+                len(feature["input_values"]) for feature in label_features
+            ])
+            target_lengths = target_lengths.new([
+                length - length % self.model.config.reduction_factor for length in target_lengths
+            ])
+            max_length = max(target_lengths)
+            batch["labels"] = batch["labels"][:, :max_length]
+
+        # add the speaker embeddings
+        batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+        return batch
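
A minimal sketch (not part of this commit) of what the collator expects and returns, using two synthetic examples shaped like the output of prepare_dataset() in dataset.py below: token ids, (frames, 80) mel-spectrogram labels, and a 512-dimensional speaker x-vector. The shapes and values are illustrative assumptions.

# Collator sketch with dummy features; only the field names and shapes matter here.
import numpy as np

from deep_voice_cloning.cloning.model import CloningModel
from deep_voice_cloning.data.collator import TTSDataCollatorWithPadding

cloning_model = CloningModel(lang="en")
collator = TTSDataCollatorWithPadding(model=cloning_model.model, processor=cloning_model.processor)

features = [
    {"input_ids": [4, 15, 32, 7],
     "labels": np.zeros((24, 80), dtype=np.float32),
     "speaker_embeddings": np.zeros(512, dtype=np.float32)},
    {"input_ids": [4, 9],
     "labels": np.zeros((12, 80), dtype=np.float32),
     "speaker_embeddings": np.zeros(512, dtype=np.float32)},
]
batch = collator(features)
# inputs padded to a common length; padded label frames are masked to -100
print(batch["input_ids"].shape, batch["labels"].shape, batch["speaker_embeddings"].shape)
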
build/lib/deep_voice_cloning/data/dataset.py
ADDED
@@ -0,0 +1,63 @@
+from typing import Dict, Any
+
+import torch
+import librosa
+import numpy as np
+from datasets import Dataset
+
+from ..cloning.model import CloningModel
+from ..transcriber.model import TranscriberModel
+
+
+def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
+    """
+    Prepare a single example for training
+    """
+    # feature extraction and tokenization
+    processed_example = model.processor(
+        text=example["normalized_text"],
+        audio_target=example["audio"]["array"],
+        sampling_rate=16000,
+        return_attention_mask=False,
+    )
+
+    # strip off the batch dimension
+    if len(torch.tensor(processed_example['input_ids']).shape) > 1:
+        processed_example['input_ids'] = processed_example['input_ids'][0]
+
+    processed_example["labels"] = processed_example["labels"][0]
+
+    # use SpeechBrain to obtain x-vector
+    processed_example["speaker_embeddings"] = model.create_speaker_embedding(
+        torch.tensor(example["audio"]["array"])
+    ).numpy()
+
+    return processed_example
+
+
+def get_cloning_dataset(input_audio_path: str,
+                        transcriber_model: TranscriberModel,
+                        cloning_model: CloningModel,
+                        sampling_rate: int = 16000,
+                        window_size_secs: int = 5) -> Dataset:
+    """
+    Create dataset by transcribing an audio file using a pretrained Wav2Vec2 model.
+    """
+    speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)
+
+    # split a waveform into splits of 5 secs each
+    speech_arrays = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
+    texts = [transcriber_model.forward(speech_array, sampling_rate=sampling_rate)
+             for speech_array in speech_arrays]
+
+    dataset = Dataset.from_list([
+        {'audio': {'array': speech_arrays[i]}, 'normalized_text': texts[i]}
+        for i in range(len(speech_arrays))]
+    )
+
+    dataset = dataset.map(
+        prepare_dataset, fn_kwargs={'model': cloning_model},
+        remove_columns=dataset.column_names,
+    )
+
+    return dataset
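
The training script itself is not part of this diff; the following is a sketch of how get_cloning_dataset, the collator, and the models could be wired together with a Hugging Face Seq2SeqTrainer. The audio path, output directory, and hyperparameters are illustrative assumptions, not values from the repository.

# End-to-end fine-tuning sketch; "speaker.wav" and "models/speecht5_cloned" are hypothetical.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

from deep_voice_cloning.cloning.model import CloningModel
from deep_voice_cloning.data.collator import TTSDataCollatorWithPadding
from deep_voice_cloning.data.dataset import get_cloning_dataset
from deep_voice_cloning.transcriber.model import TranscriberModel

transcriber = TranscriberModel(lang="en")
cloning_model = CloningModel(lang="en")

# transcribe the target speaker's recording into 5-second training examples
dataset = get_cloning_dataset("speaker.wav", transcriber, cloning_model)

args = Seq2SeqTrainingArguments(
    output_dir="models/speecht5_cloned",   # assumed output location
    per_device_train_batch_size=4,         # illustrative hyperparameters
    learning_rate=1e-5,
    max_steps=500,
    label_names=["labels"],
    remove_unused_columns=False,           # keep speaker_embeddings for the collator
)
trainer = Seq2SeqTrainer(
    model=cloning_model.model,
    args=args,
    train_dataset=dataset,
    data_collator=TTSDataCollatorWithPadding(model=cloning_model.model,
                                             processor=cloning_model.processor),
)
trainer.train()
cloning_model.save_pretrained("models/speecht5_cloned")
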
build/lib/deep_voice_cloning/transcriber/__init__.py
ADDED
File without changes
build/lib/deep_voice_cloning/transcriber/config.json
ADDED
@@ -0,0 +1,7 @@
+{
+    "language_model_names": {
+        "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+        "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
+        "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+    }
+}
build/lib/deep_voice_cloning/transcriber/model.py
ADDED
@@ -0,0 +1,22 @@
+import os
+import json
+
+import numpy as np
+import torch
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+
+class TranscriberModel:
+    def __init__(self, lang: str = 'en'):
+        with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+            config = json.load(f)
+        self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
+        self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang])
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    def forward(self, speech_array: np.array, sampling_rate: int = 16000) -> str:
+        model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+        with torch.no_grad():
+            logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        return self.processor.batch_decode(predicted_ids)
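
A minimal usage sketch (not part of this commit) for the TranscriberModel above; "clip.wav" is a hypothetical 16 kHz recording.

# Transcription sketch; the audio path is illustrative.
import librosa

from deep_voice_cloning.transcriber.model import TranscriberModel

transcriber = TranscriberModel(lang="en")
speech, _ = librosa.load("clip.wav", sr=16000)
print(transcriber.forward(speech))  # batch_decode returns a list with one transcript
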
scripts/output/audio.wav
DELETED
Binary file (34.9 kB)