konverner committed on
Commit 7865f10
1 Parent(s): 7f1be45

build added

build/lib/deep_voice_cloning/__init__.py ADDED
File without changes
build/lib/deep_voice_cloning/cloning/__init__.py ADDED
File without changes
build/lib/deep_voice_cloning/cloning/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "en": {
+         "model_path": "microsoft/speecht5_tts",
+         "vocoder_name": "microsoft/speecht5_hifigan",
+         "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
+     }
+ }
build/lib/deep_voice_cloning/cloning/model.py ADDED
@@ -0,0 +1,57 @@
+ import os
+ import json
+ from typing import Dict, Optional
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ from speechbrain.pretrained import EncoderClassifier
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+
+
+ class CloningModel:
+     def __init__(self, config: Optional[Dict[str, str]] = None, lang: str = 'en'):
+         super(CloningModel, self).__init__()
+         if config is None:
+             # default config: load a pretrained backbone; the speaker embedding is created later
+             self.speaker_embedding = None
+             with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+                 self.config = json.load(f)[lang]
+         else:
+             # custom config: load a fine-tuned checkpoint and the speaker embedding saved with it
+             self.config = config
+             self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
+         self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
+         self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
+         self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
+         self.to(self.device)
+
+     def to(self, device: torch.device):
+         self.model = self.model.to(device)
+         self.vocoder = self.vocoder.to(device)
+
+     def save_pretrained(self, save_directory: str):
+         self.model.save_pretrained(save_directory)
+         self.processor.save_pretrained(save_directory)
+         torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
+
+     def forward(self, text: str) -> np.ndarray:
+         # tokenize text
+         inputs = self.processor(text=text, return_tensors="pt")
+         # generate spectrogram using backbone model
+         spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
+                                                  self.speaker_embedding.to(self.device))
+         # decode spectrogram into waveform using vocoder
+         with torch.no_grad():
+             waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
+         return waveform_array
+
+     def create_speaker_embedding(self, waveform: torch.Tensor) -> torch.Tensor:
+         with torch.no_grad():
+             speaker_embeddings = self.speaker_model.encode_batch(waveform)
+             speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+             self.speaker_embedding = speaker_embeddings
+             speaker_embeddings = speaker_embeddings.squeeze()
+         return speaker_embeddings
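For reference, a minimal synthesis sketch using the class above. It assumes a fine-tuned checkpoint was previously written with CloningModel.save_pretrained() (the "models/cloned_voice" path is hypothetical) and that the soundfile package is installed; neither is part of this commit.

    import soundfile as sf
    from deep_voice_cloning.cloning.model import CloningModel

    # hypothetical local checkpoint produced by CloningModel.save_pretrained()
    config = {
        "model_path": "models/cloned_voice",
        "vocoder_name": "microsoft/speecht5_hifigan",
        "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
    }
    model = CloningModel(config=config)
    waveform = model.forward("Hello from the cloned voice.")
    sf.write("output.wav", waveform.squeeze(), samplerate=16000)  # SpeechT5 generates 16 kHz audio

Passing a config dict takes the else-branch of __init__, which also restores the speaker embedding saved next to the checkpoint.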
build/lib/deep_voice_cloning/data/__init__.py ADDED
File without changes
build/lib/deep_voice_cloning/data/collator.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ from typing import Any, Dict, List, Union
+
+
+ class TTSDataCollatorWithPadding:
+
+     def __init__(self, model, processor):
+         self.model = model
+         self.processor = processor
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+         label_features = [{"input_values": feature["labels"]} for feature in features]
+         speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+         # collate the inputs and targets into a batch
+         batch = self.processor.pad(
+             input_ids=input_ids,
+             labels=label_features,
+             return_tensors="pt",
+         )
+
+         # replace padding with -100 to ignore loss correctly
+         batch["labels"] = batch["labels"].masked_fill(
+             batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
+         )
+
+         # not used during fine-tuning
+         del batch["decoder_attention_mask"]
+
+         # round down target lengths to multiple of reduction factor
+         if self.model.config.reduction_factor > 1:
+             target_lengths = torch.tensor([
+                 len(feature["input_values"]) for feature in label_features
+             ])
+             target_lengths = target_lengths.new([
+                 length - length % self.model.config.reduction_factor for length in target_lengths
+             ])
+             max_length = max(target_lengths)
+             batch["labels"] = batch["labels"][:, :max_length]
+
+         # add the speaker embeddings
+         batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+         return batch
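As a sketch of how this collator might be wired into training, the snippet below uses Hugging Face's Seq2SeqTrainer. The training arguments and output directory are illustrative assumptions, and cloning_model / dataset are expected to come from the cloning and data modules in this package.

    from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

    data_collator = TTSDataCollatorWithPadding(model=cloning_model.model,
                                               processor=cloning_model.processor)

    training_args = Seq2SeqTrainingArguments(
        output_dir="models/cloned_voice",   # hypothetical save location
        per_device_train_batch_size=4,      # illustrative hyperparameters
        learning_rate=1e-5,
        max_steps=500,
        label_names=["labels"],
    )
    trainer = Seq2SeqTrainer(
        args=training_args,
        model=cloning_model.model,
        train_dataset=dataset,
        data_collator=data_collator,
    )
    trainer.train()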
build/lib/deep_voice_cloning/data/dataset.py ADDED
@@ -0,0 +1,63 @@
+ from typing import Dict, Any
+
+ import torch
+ import librosa
+ import numpy as np
+ from datasets import Dataset
+
+ from ..cloning.model import CloningModel
+ from ..transcriber.model import TranscriberModel
+
+
+ def prepare_dataset(example: Dict[str, Any], model: CloningModel) -> Dict[str, Any]:
+     """
+     Prepare a single example for training.
+     """
+     # feature extraction and tokenization
+     processed_example = model.processor(
+         text=example["normalized_text"],
+         audio_target=example["audio"]["array"],
+         sampling_rate=16000,
+         return_attention_mask=False,
+     )
+
+     # strip off the batch dimension
+     if len(torch.tensor(processed_example['input_ids']).shape) > 1:
+         processed_example['input_ids'] = processed_example['input_ids'][0]
+
+     processed_example["labels"] = processed_example["labels"][0]
+
+     # use SpeechBrain to obtain x-vector
+     processed_example["speaker_embeddings"] = model.create_speaker_embedding(
+         torch.tensor(example["audio"]["array"])
+     ).numpy()
+
+     return processed_example
+
+
+ def get_cloning_dataset(input_audio_path: str,
+                         transcriber_model: TranscriberModel,
+                         cloning_model: CloningModel,
+                         sampling_rate: int = 16000,
+                         window_size_secs: int = 5) -> Dataset:
+     """
+     Create a dataset by transcribing an audio file with a pretrained Wav2Vec2 model.
+     """
+     speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)
+
+     # split the waveform into windows of window_size_secs seconds each
+     speech_arrays = np.split(speech_array, range(0, len(speech_array), window_size_secs * sampling_rate))[1:]
+     texts = [transcriber_model.forward(speech_array, sampling_rate=sampling_rate)
+              for speech_array in speech_arrays]
+
+     dataset = Dataset.from_list([
+         {'audio': {'array': speech_arrays[i]}, 'normalized_text': texts[i]}
+         for i in range(len(speech_arrays))
+     ])
+
+     dataset = dataset.map(
+         prepare_dataset, fn_kwargs={'model': cloning_model},
+         remove_columns=dataset.column_names,
+     )
+
+     return dataset
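Putting the two models together, a short sketch of building the fine-tuning dataset from a single reference recording ("reference.wav" is a hypothetical path):

    from deep_voice_cloning.cloning.model import CloningModel
    from deep_voice_cloning.data.dataset import get_cloning_dataset
    from deep_voice_cloning.transcriber.model import TranscriberModel

    transcriber_model = TranscriberModel(lang='en')
    cloning_model = CloningModel(lang='en')

    # transcribes 5-second windows and attaches x-vector speaker embeddings
    dataset = get_cloning_dataset("reference.wav", transcriber_model, cloning_model)
    print(dataset.column_names)  # includes input_ids, labels, speaker_embeddings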
build/lib/deep_voice_cloning/transcriber/__init__.py ADDED
File without changes
build/lib/deep_voice_cloning/transcriber/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "language_model_names": {
+         "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+         "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
+         "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
+     }
+ }
build/lib/deep_voice_cloning/transcriber/model.py ADDED
@@ -0,0 +1,22 @@
+ import os
+ import json
+
+ import numpy as np
+ import torch
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+
+
+ class TranscriberModel:
+     def __init__(self, lang: str = 'en'):
+         with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
+             config = json.load(f)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
+         self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang]).to(self.device)
+
+     def forward(self, speech_array: np.ndarray, sampling_rate: int = 16000) -> str:
+         model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True).to(self.device)
+         with torch.no_grad():
+             logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
+         predicted_ids = torch.argmax(logits, dim=-1)
+         return self.processor.batch_decode(predicted_ids)[0]
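A quick transcription sketch for the model above ("sample.wav" is a hypothetical input path):

    import librosa
    from deep_voice_cloning.transcriber.model import TranscriberModel

    model = TranscriberModel(lang='en')
    speech_array, _ = librosa.load("sample.wav", sr=16000)  # Wav2Vec2 expects 16 kHz input
    print(model.forward(speech_array))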
scripts/output/audio.wav DELETED
Binary file (34.9 kB)