Spaces:

drewThomasson
/

ebook2audiobook_v2.0_Beta

Running

App Files Files Community

drewThomasson committed on Dec 15, 2024

Commit

f341d4d

verified ·

1 Parent(s): 017ec2f

Upload 310 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

TTS/__init__.py +33 -0
TTS/api.py +458 -0
TTS/bin/__init__.py +0 -0
TTS/bin/collect_env_info.py +49 -0
TTS/bin/compute_attention_masks.py +169 -0
TTS/bin/compute_embeddings.py +201 -0
TTS/bin/compute_statistics.py +100 -0
TTS/bin/eval_encoder.py +92 -0
TTS/bin/extract_tts_spectrograms.py +290 -0
TTS/bin/find_unique_chars.py +40 -0
TTS/bin/find_unique_phonemes.py +79 -0
TTS/bin/remove_silence_using_vad.py +128 -0
TTS/bin/resample.py +90 -0
TTS/bin/synthesize.py +486 -0
TTS/bin/train_encoder.py +340 -0
TTS/bin/train_tts.py +75 -0
TTS/bin/train_vocoder.py +81 -0
TTS/bin/tune_wavegrad.py +107 -0
TTS/config/__init__.py +138 -0
TTS/config/shared_configs.py +268 -0
TTS/demos/xtts_ft_demo/requirements.txt +2 -0
TTS/demos/xtts_ft_demo/utils/formatter.py +161 -0
TTS/demos/xtts_ft_demo/utils/gpt_train.py +171 -0
TTS/demos/xtts_ft_demo/xtts_demo.py +433 -0
TTS/encoder/README.md +18 -0
TTS/encoder/__init__.py +0 -0
TTS/encoder/configs/base_encoder_config.py +61 -0
TTS/encoder/configs/emotion_encoder_config.py +12 -0
TTS/encoder/configs/speaker_encoder_config.py +11 -0
TTS/encoder/dataset.py +146 -0
TTS/encoder/losses.py +230 -0
TTS/encoder/models/base_encoder.py +165 -0
TTS/encoder/models/lstm.py +99 -0
TTS/encoder/models/resnet.py +198 -0
TTS/encoder/requirements.txt +2 -0
TTS/encoder/utils/__init__.py +0 -0
TTS/encoder/utils/generic_utils.py +141 -0
TTS/encoder/utils/prepare_voxceleb.py +226 -0
TTS/encoder/utils/training.py +99 -0
TTS/encoder/utils/visual.py +53 -0
TTS/model.py +66 -0
TTS/server/README.md +21 -0
TTS/server/__init__.py +0 -0
TTS/server/conf.json +12 -0
TTS/server/server.py +262 -0
TTS/server/static/coqui-log-green-TTS.png +0 -0
TTS/server/templates/details.html +131 -0
TTS/server/templates/index.html +154 -0
TTS/tts/__init__.py +0 -0
TTS/tts/configs/__init__.py +17 -0

TTS/__init__.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import importlib.metadata
+from TTS.utils.generic_utils import is_pytorch_at_least_2_4
+__version__ = importlib.metadata.version("coqui-tts")
+# On PyTorch >= 2.4, register the classes below through
+# `torch.serialization.add_safe_globals` at import time. Presumably this is what lets
+# checkpoints that pickle these objects load under the restricted (weights-only)
+# unpickler — see the PyTorch serialization docs to confirm.
+if is_pytorch_at_least_2_4():
+    import _codecs
+    from collections import defaultdict
+    import numpy as np
+    import torch
+    from TTS.config.shared_configs import BaseDatasetConfig
+    from TTS.tts.configs.xtts_config import XttsConfig
+    from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
+    from TTS.utils.radam import RAdam
+    # Generic containers and the RAdam optimizer class.
+    torch.serialization.add_safe_globals([dict, defaultdict, RAdam])
+    # Bark
+    # NOTE(review): `np.core` is a private NumPy namespace; NumPy 2.x is understood to
+    # rename it to `np._core` — confirm this still resolves on the pinned NumPy version.
+    torch.serialization.add_safe_globals(
+        [
+            np.core.multiarray.scalar,
+            np.dtype,
+            np.dtypes.Float64DType,
+            _codecs.encode,  # TODO: safe by default from Pytorch 2.5
+        ]
+    )
+    # XTTS
+    torch.serialization.add_safe_globals([BaseDatasetConfig, XttsConfig, XttsAudioConfig, XttsArgs])

TTS/api.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import logging
+import tempfile
+import warnings
+from pathlib import Path
+from torch import nn
+from TTS.config import load_config
+from TTS.utils.audio.numpy_transforms import save_wav
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+logger = logging.getLogger(__name__)
+class TTS(nn.Module):
+    """TODO: Add voice conversion and Capacitron support."""
+    def __init__(
+        self,
+        model_name: str = "",
+        model_path: str = None,
+        config_path: str = None,
+        vocoder_path: str = None,
+        vocoder_config_path: str = None,
+        progress_bar: bool = True,
+        gpu=False,
+    ):
+        """🐸TTS python interface that allows to load and use the released models.
+        Example with a multi-speaker model:
+            >>> from TTS.api import TTS
+            >>> tts = TTS(TTS.list_models()[0])
+            >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+            >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+        Example with a single-speaker model:
+            >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
+            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
+        Example loading a model from a path:
+            >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
+            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
+        Example voice cloning with YourTTS in English, French and Portuguese:
+            >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
+            >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
+            >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
+            >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
+        Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
+            >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
+            >>> tts.tts_to_file("This is a test.", file_path="output.wav")
+        Args:
+            model_name (str, optional): Model name to load. You can list models by ```tts.models```.
+                Names containing "voice_conversion_models" are loaded as voice conversion models; every
+                other non-empty name is loaded as a TTS model. Defaults to "" (no model loaded by name).
+            model_path (str, optional): Path to the model checkpoint. When given, the synthesizer is
+                (re)created from this path after any by-name loading. Defaults to None.
+            config_path (str, optional): Path to the model config. Defaults to None.
+            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
+            vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
+            progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
+            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Passing a truthy
+                value emits a warning recommending `tts.to(device)` instead. Defaults to False.
+        """
+        super().__init__()
+        self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar)
+        # The config is only populated here when an explicit `config_path` is supplied.
+        self.config = load_config(config_path) if config_path else None
+        self.synthesizer = None
+        self.voice_converter = None
+        self.model_name = ""
+        if gpu:
+            warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
+        # Dispatch on substrings of the model name to pick the matching loader.
+        if model_name is not None and len(model_name) > 0:
+            if "tts_models" in model_name:
+                self.load_tts_model_by_name(model_name, gpu)
+            elif "voice_conversion_models" in model_name:
+                self.load_vc_model_by_name(model_name, gpu)
+            else:
+                self.load_model_by_name(model_name, gpu)
+        # Runs in addition to the by-name branch above, so it overwrites `self.synthesizer`.
+        if model_path:
+            self.load_tts_model_by_path(
+                model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
+            )
+    @property
+    def models(self):
+        """Return the result of the model manager's `list_tts_models()`."""
+        return self.manager.list_tts_models()
+    @property
+    def is_multi_speaker(self):
+        if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
+            return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
+        return False
+    @property
+    def is_multi_lingual(self):
+        # Not sure what sets this to None, but applied a fix to prevent crashing.
+        if (
+            isinstance(self.model_name, str)
+            and "xtts" in self.model_name
+            or self.config
+            and ("xtts" in self.config.model or "languages" in self.config and len(self.config.languages) > 1)
+        ):
+            return True
+        if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
+            return self.synthesizer.tts_model.language_manager.num_languages > 1
+        return False
+    @property
+    def speakers(self):
+        if not self.is_multi_speaker:
+            return None
+        return self.synthesizer.tts_model.speaker_manager.speaker_names
+    @property
+    def languages(self):
+        if not self.is_multi_lingual:
+            return None
+        return self.synthesizer.tts_model.language_manager.language_names
+    @staticmethod
+    def get_models_file_path():
+        return Path(__file__).parent / ".models.json"
+    @staticmethod
+    def list_models():
+        """Return `list_models()` of a fresh `ModelManager` built on the bundled models file.
+        A new manager (with the progress bar disabled) is created on every call, so this
+        works without a `TTS` instance.
+        """
+        return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models()
+    def download_model_by_name(self, model_name: str):
+        model_path, config_path, model_item = self.manager.download_model(model_name)
+        if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
+            # return model directory if there are multiple files
+            # we assume that the model knows how to load itself
+            return None, None, None, None, model_path
+        if model_item.get("default_vocoder") is None:
+            return model_path, config_path, None, None, None
+        vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
+        return model_path, config_path, vocoder_path, vocoder_config_path, None
+    def load_model_by_name(self, model_name: str, gpu: bool = False):
+        """Load one of the 🐸TTS models by name.
+        This is a thin wrapper that delegates to `load_tts_model_by_name`.
+        Args:
+            model_name (str): Model name to load. You can list models by ```tts.models```.
+            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+        """
+        self.load_tts_model_by_name(model_name, gpu)
+    def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
+        """Load one of the voice conversion models by name.
+        Args:
+            model_name (str): Model name to load. You can list models by ```tts.models```.
+            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+        """
+        self.model_name = model_name
+        model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
+        self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
+    def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
+        """Load one of 🐸TTS models by name.
+        Args:
+            model_name (str): Model name to load. You can list models by ```tts.models```.
+            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+        TODO: Add tests
+        """
+        self.synthesizer = None
+        self.model_name = model_name
+        model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(model_name)
+        # init synthesizer
+        # None values are fetch from the model
+        self.synthesizer = Synthesizer(
+            tts_checkpoint=model_path,
+            tts_config_path=config_path,
+            tts_speakers_file=None,
+            tts_languages_file=None,
+            vocoder_checkpoint=vocoder_path,
+            vocoder_config=vocoder_config_path,
+            encoder_checkpoint=None,
+            encoder_config=None,
+            model_dir=model_dir,
+            use_cuda=gpu,
+        )
+    def load_tts_model_by_path(
+        self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
+    ):
+        """Load a model from a path.
+        Args:
+            model_path (str): Path to the model checkpoint.
+            config_path (str): Path to the model config.
+            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
+            vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
+            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+        """
+        self.synthesizer = Synthesizer(
+            tts_checkpoint=model_path,
+            tts_config_path=config_path,
+            tts_speakers_file=None,
+            tts_languages_file=None,
+            vocoder_checkpoint=vocoder_path,
+            vocoder_config=vocoder_config,
+            encoder_checkpoint=None,
+            encoder_config=None,
+            use_cuda=gpu,
+        )
+    def _check_arguments(
+        self,
+        speaker: str = None,
+        language: str = None,
+        speaker_wav: str = None,
+        emotion: str = None,
+        speed: float = None,
+        **kwargs,
+    ) -> None:
+        """Check if the arguments are valid for the model."""
+        # check for the coqui tts models
+        if self.is_multi_speaker and (speaker is None and speaker_wav is None):
+            raise ValueError("Model is multi-speaker but no `speaker` is provided.")
+        if self.is_multi_lingual and language is None:
+            raise ValueError("Model is multi-lingual but no `language` is provided.")
+        if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
+            raise ValueError("Model is not multi-speaker but `speaker` is provided.")
+        if not self.is_multi_lingual and language is not None:
+            raise ValueError("Model is not multi-lingual but `language` is provided.")
+        if emotion is not None and speed is not None:
+            raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
+    def tts(
+        self,
+        text: str,
+        speaker: str = None,
+        language: str = None,
+        speaker_wav: str = None,
+        emotion: str = None,
+        speed: float = None,
+        split_sentences: bool = True,
+        **kwargs,
+    ):
+        """Convert text to speech.
+        Args:
+            text (str):
+                Input text to synthesize.
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
+                supported by `XTTS` model.
+            speaker_wav (str, optional):
+                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
+                Defaults to None.
+            emotion (str, optional):
+                Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
+            speed (float, optional):
+                Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
+                Defaults to None.
+            split_sentences (bool, optional):
+                Split text into sentences, synthesize them separately and concatenate the file audio.
+                Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
+                applicable to the 🐸TTS models. Defaults to True.
+            kwargs (dict, optional):
+                Additional arguments for the model.
+        """
+        self._check_arguments(
+            speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
+        )
+        wav = self.synthesizer.tts(
+            text=text,
+            speaker_name=speaker,
+            language_name=language,
+            speaker_wav=speaker_wav,
+            reference_wav=None,
+            style_wav=None,
+            style_text=None,
+            reference_speaker_name=None,
+            split_sentences=split_sentences,
+            **kwargs,
+        )
+        return wav
+    def tts_to_file(
+        self,
+        text: str,
+        speaker: str = None,
+        language: str = None,
+        speaker_wav: str = None,
+        emotion: str = None,
+        speed: float = 1.0,
+        pipe_out=None,
+        file_path: str = "output.wav",
+        split_sentences: bool = True,
+        **kwargs,
+    ):
+        """Convert text to speech and save the result to a file.
+        Args:
+            text (str):
+                Input text to synthesize.
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            language (str, optional):
+                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
+                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+            speaker_wav (str, optional):
+                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
+                Defaults to None.
+            emotion (str, optional):
+                Emotion for 🐸Coqui Studio models. Accepted but not forwarded to `tts()`.
+                Defaults to None.
+            speed (float, optional):
+                Speed factor for 🐸Coqui Studio models. Accepted but not forwarded to `tts()`.
+                Defaults to 1.0.
+            pipe_out (BytesIO, optional):
+                Passed through to the synthesizer's `save_wav` as `pipe_out` (intended for piping the
+                generated wav to stdout). Defaults to None.
+            file_path (str, optional):
+                Output file path. Defaults to "output.wav".
+            split_sentences (bool, optional):
+                Split text into sentences, synthesize them separately and concatenate the file audio.
+                Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
+                applicable to the 🐸TTS models. Defaults to True.
+            kwargs (dict, optional):
+                Additional arguments for the model.
+        Returns:
+            str: The `file_path` that was written.
+        """
+        # `tts()` runs the same validation again; `emotion` and `speed` are not passed on.
+        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
+        wav = self.tts(
+            text=text,
+            speaker=speaker,
+            language=language,
+            speaker_wav=speaker_wav,
+            split_sentences=split_sentences,
+            **kwargs,
+        )
+        self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
+        return file_path
+    def voice_conversion(
+        self,
+        source_wav: str,
+        target_wav: str,
+    ):
+        """Voice conversion with FreeVC. Convert source wav to target speaker.
+        Args:``
+            source_wav (str):
+                Path to the source wav file.
+            target_wav (str):`
+                Path to the target wav file.
+        """
+        wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+        return wav
+    def voice_conversion_to_file(
+        self,
+        source_wav: str,
+        target_wav: str,
+        file_path: str = "output.wav",
+    ):
+        """Voice conversion with FreeVC. Convert source wav to target speaker.
+        Args:
+            source_wav (str):
+                Path to the source wav file.
+            target_wav (str):
+                Path to the target wav file.
+            file_path (str, optional):
+                Output file path. Defaults to "output.wav".
+        """
+        wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
+        return file_path
+    def tts_with_vc(
+        self,
+        text: str,
+        language: str = None,
+        speaker_wav: str = None,
+        speaker: str = None,
+        split_sentences: bool = True,
+    ):
+        """Convert text to speech with voice conversion.
+        It combines tts with voice conversion to fake voice cloning.
+        - Convert text to speech with tts.
+        - Convert the output wav to target speaker with voice conversion.
+        Args:
+            text (str):
+                Input text to synthesize.
+            language (str, optional):
+                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
+                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+            speaker_wav (str, optional):
+                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
+                Defaults to None.
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            split_sentences (bool, optional):
+                Split text into sentences, synthesize them separately and concatenate the file audio.
+                Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
+                applicable to the 🐸TTS models. Defaults to True.
+        """
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+            # Lazy code... save it to a temp file to resample it while reading it for VC
+            self.tts_to_file(
+                text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
+            )
+        if self.voice_converter is None:
+            self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
+        wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
+        return wav
+    def tts_with_vc_to_file(
+        self,
+        text: str,
+        language: str = None,
+        speaker_wav: str = None,
+        file_path: str = "output.wav",
+        speaker: str = None,
+        split_sentences: bool = True,
+    ):
+        """Convert text to speech with voice conversion and save to file.
+        Check `tts_with_vc` for more details.
+        Args:
+            text (str):
+                Input text to synthesize.
+            language (str, optional):
+                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
+                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+            speaker_wav (str, optional):
+                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
+                Defaults to None.
+            file_path (str, optional):
+                Output file path. Defaults to "output.wav".
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            split_sentences (bool, optional):
+                Split text into sentences, synthesize them separately and concatenate the file audio.
+                Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
+                applicable to the 🐸TTS models. Defaults to True.
+        """
+        wav = self.tts_with_vc(
+            text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
+        )
+        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)

TTS/bin/__init__.py ADDED Viewed

File without changes

TTS/bin/collect_env_info.py ADDED Viewed

	@@ -0,0 +1,49 @@

+"""Get detailed info about the working environment."""
+import json
+import os
+import platform
+import sys
+import numpy
+import torch
+import TTS
+sys.path += [os.path.abspath(".."), os.path.abspath(".")]
+def system_info():
+    return {
+        "OS": platform.system(),
+        "architecture": platform.architecture(),
+        "version": platform.version(),
+        "processor": platform.processor(),
+        "python": platform.python_version(),
+    }
+def cuda_info():
+    return {
+        "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
+        "available": torch.cuda.is_available(),
+        "version": torch.version.cuda,
+    }
+def package_info():
+    return {
+        "numpy": numpy.__version__,
+        "PyTorch_version": torch.__version__,
+        "PyTorch_debug": torch.version.debug,
+        "TTS": TTS.__version__,
+    }
+def main():
+    details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
+    print(json.dumps(details, indent=4, sort_keys=True))
+if __name__ == "__main__":
+    main()

TTS/bin/compute_attention_masks.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import argparse
+import importlib
+import logging
+import os
+from argparse import RawTextHelpFormatter
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from trainer.io import load_checkpoint
+from TTS.config import load_config
+from TTS.tts.datasets.TTSDataset import TTSDataset
+from TTS.tts.models import setup_model
+from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+# Script entry point: runs a trained model over a dataset, saves one attention alignment
+# per utterance next to its wav file, and writes a metafile listing every output.
+if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    # pylint: disable=bad-option-value
+    # NOTE(review): the example in the description below sets `CUDA_VISIBLE_DEVICE`; the
+    # variable is presumably meant to be `CUDA_VISIBLE_DEVICES` — confirm and fix the text.
+    # NOTE(review): the description promises a ".npy" extension, while the code further
+    # down writes "<wav stem>_attn.npy".
+    parser = argparse.ArgumentParser(
+        description="""Extract attention masks from trained Tacotron/Tacotron2 models.
+These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
+        """Each attention mask is written to the same path as the input wav file with ".npy" file extension.
+(e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
+        """
+Example run:
+    CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
+        --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
+        --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
+        --dataset_metafile metadata.csv
+        --data_path /root/LJSpeech-1.1/
+        --batch_size 32
+        --dataset ljspeech
+        --use_cuda
+""",
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        required=True,
+        help="Path to Tacotron/Tacotron2 config file.",
+    )
+    # `default` has no effect on the next two options because they are `required=True`.
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="",
+        required=True,
+        help="Target dataset processor name from TTS.tts.dataset.preprocess.",
+    )
+    parser.add_argument(
+        "--dataset_metafile",
+        type=str,
+        default="",
+        required=True,
+        help="Dataset metafile inclusing file paths with transcripts.",
+    )
+    parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
+    parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.")
+    parser.add_argument(
+        "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
+    )
+    args = parser.parse_args()
+    C = load_config(args.config_path)
+    ap = AudioProcessor(**C.audio)
+    # if the vocabulary was passed, replace the default
+    if "characters" in C.keys():
+        symbols, phonemes = make_symbols(**C.characters)  # noqa: F811
+    # load the model
+    # NOTE(review): `num_chars` is not referenced again in this script.
+    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
+    # TODO: handle multi-speaker
+    model = setup_model(C)
+    model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
+    # data loader
+    # Resolve the dataset formatter function by name from `TTS.tts.datasets.formatters`.
+    preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
+    preprocessor = getattr(preprocessor, args.dataset)
+    meta_data = preprocessor(args.data_path, args.dataset_metafile)
+    dataset = TTSDataset(
+        model.decoder.r,
+        C.text_cleaner,
+        compute_linear_spec=False,
+        ap=ap,
+        meta_data=meta_data,
+        characters=C.characters if "characters" in C.keys() else None,
+        add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
+        use_phonemes=C.use_phonemes,
+        phoneme_cache_path=C.phoneme_cache_path,
+        phoneme_language=C.phoneme_language,
+        enable_eos_bos=C.enable_eos_bos_chars,
+    )
+    dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
+    loader = DataLoader(
+        dataset,
+        batch_size=args.batch_size,
+        num_workers=4,
+        collate_fn=dataset.collate_fn,
+        shuffle=False,
+        drop_last=False,
+    )
+    # compute attentions
+    # Collects [absolute wav path, absolute alignment path] pairs for the metafile.
+    file_paths = []
+    with torch.no_grad():
+        for data in tqdm(loader):
+            # setup input data
+            # The batch is indexed positionally; `linear_input` and `stop_targets` are
+            # unpacked but not used below.
+            text_input = data[0]
+            text_lengths = data[1]
+            linear_input = data[3]
+            mel_input = data[4]
+            mel_lengths = data[5]
+            stop_targets = data[6]
+            item_idxs = data[7]
+            # dispatch data to GPU
+            if args.use_cuda:
+                text_input = text_input.cuda()
+                text_lengths = text_lengths.cuda()
+                mel_input = mel_input.cuda()
+                mel_lengths = mel_lengths.cuda()
+            model_outputs = model.forward(text_input, text_lengths, mel_input)
+            alignments = model_outputs["alignments"].detach()
+            for idx, alignment in enumerate(alignments):
+                # `item_idx` is used as the wav file path of the sample below.
+                item_idx = item_idxs[idx]
+                # interpolate if r > 1
+                alignment = (
+                    torch.nn.functional.interpolate(
+                        alignment.transpose(0, 1).unsqueeze(0),
+                        size=None,
+                        scale_factor=model.decoder.r,
+                        mode="nearest",
+                        align_corners=None,
+                        recompute_scale_factor=None,
+                    )
+                    .squeeze(0)
+                    .transpose(0, 1)
+                )
+                # remove paddings
+                alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
+                # set file paths
+                wav_file_name = os.path.basename(item_idx)
+                align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
+                # NOTE(review): `str.replace` swaps every occurrence of the wav file name in
+                # the path, not just the final component.
+                file_path = item_idx.replace(wav_file_name, align_file_name)
+                # save output
+                wav_file_abs_path = os.path.abspath(item_idx)
+                file_abs_path = os.path.abspath(file_path)
+                file_paths.append([wav_file_abs_path, file_abs_path])
+                np.save(file_path, alignment)
+        # output metafile: one "<wav path>|<alignment path>" line per processed sample
+        metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
+        with open(metafile, "w", encoding="utf-8") as f:
+            for p in file_paths:
+                f.write(f"{p[0]}|{p[1]}\n")
+        print(f" >> Metafile created: {metafile}")

TTS/bin/compute_embeddings.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import argparse
+import logging
+import os
+from argparse import RawTextHelpFormatter
+import torch
+from tqdm import tqdm
+from TTS.config import load_config
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.managers import save_file
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+def compute_embeddings(
+    model_path,
+    config_path,
+    output_path,
+    old_speakers_file=None,
+    old_append=False,
+    config_dataset_path=None,
+    formatter_name=None,
+    dataset_name=None,
+    dataset_path=None,
+    meta_file_train=None,
+    meta_file_val=None,
+    disable_cuda=False,
+    no_eval=False,
+):
+    use_cuda = torch.cuda.is_available() and not disable_cuda
+    if config_dataset_path is not None:
+        c_dataset = load_config(config_dataset_path)
+        meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
+    else:
+        c_dataset = BaseDatasetConfig()
+        c_dataset.formatter = formatter_name
+        c_dataset.dataset_name = dataset_name
+        c_dataset.path = dataset_path
+        if meta_file_train is not None:
+            c_dataset.meta_file_train = meta_file_train
+        if meta_file_val is not None:
+            c_dataset.meta_file_val = meta_file_val
+        meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
+    if meta_data_eval is None:
+        samples = meta_data_train
+    else:
+        samples = meta_data_train + meta_data_eval
+    encoder_manager = SpeakerManager(
+        encoder_model_path=model_path,
+        encoder_config_path=config_path,
+        d_vectors_file_path=old_speakers_file,
+        use_cuda=use_cuda,
+    )
+    class_name_key = encoder_manager.encoder_config.class_name_key
+    # compute speaker embeddings
+    if old_speakers_file is not None and old_append:
+        speaker_mapping = encoder_manager.embeddings
+    else:
+        speaker_mapping = {}
+    for fields in tqdm(samples):
+        class_name = fields[class_name_key]
+        audio_file = fields["audio_file"]
+        embedding_key = fields["audio_unique_name"]
+        # Only update the speaker name when the embedding is already in the old file.
+        if embedding_key in speaker_mapping:
+            speaker_mapping[embedding_key]["name"] = class_name
+            continue
+        if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
+            # get the embedding from the old file
+            embedd = encoder_manager.get_embedding_by_clip(embedding_key)
+        else:
+            # extract the embedding
+            embedd = encoder_manager.compute_embedding_from_clip(audio_file)
+        # create speaker_mapping if target dataset is defined
+        speaker_mapping[embedding_key] = {}
+        speaker_mapping[embedding_key]["name"] = class_name
+        speaker_mapping[embedding_key]["embedding"] = embedd
+    if speaker_mapping:
+        # save speaker_mapping if target dataset is defined
+        if os.path.isdir(output_path):
+            mapping_file_path = os.path.join(output_path, "speakers.pth")
+        else:
+            mapping_file_path = output_path
+        if os.path.dirname(mapping_file_path) != "":
+            os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
+        save_file(speaker_mapping, mapping_file_path)
+        print("Speaker embeddings saved at:", mapping_file_path)
+if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    parser = argparse.ArgumentParser(
+        description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
+        """
+        Example runs:
+        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --config_dataset_path dataset_config.json
+        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
+        """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        help="Path to model checkpoint file. It defaults to the released speaker encoder.",
+        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+    )
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        help="Path to model config file. It defaults to the released speaker encoder config.",
+        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+    )
+    parser.add_argument(
+        "--config_dataset_path",
+        type=str,
+        help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
+        default=None,
+    )
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        help="Path for output `pth` or `json` file.",
+        default="speakers.pth",
+    )
+    parser.add_argument(
+        "--old_file",
+        type=str,
+        help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
+        default=None,
+    )
+    parser.add_argument(
+        "--old_append",
+        help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
+        default=False,
+        action="store_true",
+    )
+    parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False)
+    parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
+    parser.add_argument(
+        "--formatter_name",
+        type=str,
+        help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    parser.add_argument(
+        "--dataset_path",
+        type=str,
+        help="Path to the dataset. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    parser.add_argument(
+        "--meta_file_train",
+        type=str,
+        help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    parser.add_argument(
+        "--meta_file_val",
+        type=str,
+        help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    args = parser.parse_args()
+    compute_embeddings(
+        args.model_path,
+        args.config_path,
+        args.output_path,
+        old_speakers_file=args.old_file,
+        old_append=args.old_append,
+        config_dataset_path=args.config_dataset_path,
+        formatter_name=args.formatter_name,
+        dataset_name=args.dataset_name,
+        dataset_path=args.dataset_path,
+        meta_file_train=args.meta_file_train,
+        meta_file_val=args.meta_file_val,
+        disable_cuda=args.disable_cuda,
+        no_eval=args.no_eval,
+    )

TTS/bin/compute_statistics.py ADDED Viewed

	@@ -0,0 +1,100 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import argparse
+import glob
+import logging
+import os
+import numpy as np
+from tqdm import tqdm
+# from TTS.utils.io import load_config
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+def main():
+    """Run preprocessing process."""
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
+    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
+    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
+    parser.add_argument(
+        "--data_path",
+        type=str,
+        required=False,
+        help="folder including the target set of wavs overriding dataset config.",
+    )
+    args, overrides = parser.parse_known_args()
+    CONFIG = load_config(args.config_path)
+    CONFIG.parse_known_args(overrides, relaxed_parser=True)
+    # load config
+    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
+    CONFIG.audio.stats_path = None  # discard pre-defined stats
+    # load audio processor
+    ap = AudioProcessor(**CONFIG.audio.to_dict())
+    # load the meta data of target dataset
+    if args.data_path:
+        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
+    else:
+        dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
+    print(f" > There are {len(dataset_items)} files.")
+    mel_sum = 0
+    mel_square_sum = 0
+    linear_sum = 0
+    linear_square_sum = 0
+    N = 0
+    for item in tqdm(dataset_items):
+        # compute features
+        wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
+        linear = ap.spectrogram(wav)
+        mel = ap.melspectrogram(wav)
+        # compute stats
+        N += mel.shape[1]
+        mel_sum += mel.sum(1)
+        linear_sum += linear.sum(1)
+        mel_square_sum += (mel**2).sum(axis=1)
+        linear_square_sum += (linear**2).sum(axis=1)
+    mel_mean = mel_sum / N
+    mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
+    linear_mean = linear_sum / N
+    linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
+    output_file_path = args.out_path
+    stats = {}
+    stats["mel_mean"] = mel_mean
+    stats["mel_std"] = mel_scale
+    stats["linear_mean"] = linear_mean
+    stats["linear_std"] = linear_scale
+    print(f" > Avg mel spec mean: {mel_mean.mean()}")
+    print(f" > Avg mel spec scale: {mel_scale.mean()}")
+    print(f" > Avg linear spec mean: {linear_mean.mean()}")
+    print(f" > Avg linear spec scale: {linear_scale.mean()}")
+    # set default config values for mean-var scaling
+    CONFIG.audio.stats_path = output_file_path
+    CONFIG.audio.signal_norm = True
+    # remove redundant values
+    del CONFIG.audio.max_norm
+    del CONFIG.audio.min_level_db
+    del CONFIG.audio.symmetric_norm
+    del CONFIG.audio.clip_norm
+    stats["audio_config"] = CONFIG.audio.to_dict()
+    np.save(output_file_path, stats, allow_pickle=True)
+    print(f" > stats saved to {output_file_path}")
+if __name__ == "__main__":
+    main()

TTS/bin/eval_encoder.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import argparse
+import logging
+from argparse import RawTextHelpFormatter
+import torch
+from tqdm import tqdm
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+def compute_encoder_accuracy(dataset_items, encoder_manager):
+    class_name_key = encoder_manager.encoder_config.class_name_key
+    map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
+    class_acc_dict = {}
+    # compute embeddings for all wav_files
+    for item in tqdm(dataset_items):
+        class_name = item[class_name_key]
+        wav_file = item["audio_file"]
+        # extract the embedding
+        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
+        if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
+            embedding = torch.FloatTensor(embedd).unsqueeze(0)
+            if encoder_manager.use_cuda:
+                embedding = embedding.cuda()
+            class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
+            predicted_label = map_classid_to_classname[str(class_id)]
+        else:
+            predicted_label = None
+        if class_name is not None and predicted_label is not None:
+            is_equal = int(class_name == predicted_label)
+            if class_name not in class_acc_dict:
+                class_acc_dict[class_name] = [is_equal]
+            else:
+                class_acc_dict[class_name].append(is_equal)
+        else:
+            raise RuntimeError("Error: class_name or/and predicted_label are None")
+    acc_avg = 0
+    for key, values in class_acc_dict.items():
+        acc = sum(values) / len(values)
+        print("Class", key, "Accuracy:", acc)
+        acc_avg += acc
+    print("Average Accuracy:", acc_avg / len(class_acc_dict))
+if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    parser = argparse.ArgumentParser(
+        description="""Compute the accuracy of the encoder.\n\n"""
+        """
+        Example runs:
+        python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json  dataset_config.json
+        """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
+    parser.add_argument(
+        "config_path",
+        type=str,
+        help="Path to model config file.",
+    )
+    parser.add_argument(
+        "config_dataset_path",
+        type=str,
+        help="Path to dataset config file.",
+    )
+    parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True)
+    parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
+    args = parser.parse_args()
+    c_dataset = load_config(args.config_dataset_path)
+    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
+    items = meta_data_train + meta_data_eval
+    enc_manager = SpeakerManager(
+        encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
+    )
+    compute_encoder_accuracy(items, enc_manager)

TTS/bin/extract_tts_spectrograms.py ADDED Viewed

	@@ -0,0 +1,290 @@

+#!/usr/bin/env python3
+"""Extract Mel spectrograms with teacher forcing."""
+import argparse
+import logging
+import os
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from trainer.generic_utils import count_parameters
+from TTS.config import load_config
+from TTS.tts.datasets import TTSDataset, load_tts_samples
+from TTS.tts.models import setup_model
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.numpy_transforms import quantize
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+# Module-level flag read by format_data() and main(): True when a CUDA device is available.
+use_cuda = torch.cuda.is_available()
+def setup_loader(ap, r):
+    tokenizer, _ = TTSTokenizer.init_from_config(c)
+    dataset = TTSDataset(
+        outputs_per_step=r,
+        compute_linear_spec=False,
+        samples=meta_data,
+        tokenizer=tokenizer,
+        ap=ap,
+        batch_group_size=0,
+        min_text_len=c.min_text_len,
+        max_text_len=c.max_text_len,
+        min_audio_len=c.min_audio_len,
+        max_audio_len=c.max_audio_len,
+        phoneme_cache_path=c.phoneme_cache_path,
+        precompute_num_workers=0,
+        use_noise_augment=False,
+        speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
+        d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
+    )
+    if c.use_phonemes and c.compute_input_seq_cache:
+        # precompute phonemes to have a better estimate of sequence lengths.
+        dataset.compute_input_seq(c.num_loader_workers)
+    dataset.preprocess_samples()
+    loader = DataLoader(
+        dataset,
+        batch_size=c.batch_size,
+        shuffle=False,
+        collate_fn=dataset.collate_fn,
+        drop_last=False,
+        sampler=None,
+        num_workers=c.num_loader_workers,
+        pin_memory=False,
+    )
+    return loader
+def set_filename(wav_path, out_path):
+    wav_file = os.path.basename(wav_path)
+    file_name = wav_file.split(".")[0]
+    os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
+    os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
+    os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
+    os.makedirs(os.path.join(out_path, "wav"), exist_ok=True)
+    wavq_path = os.path.join(out_path, "quant", file_name)
+    mel_path = os.path.join(out_path, "mel", file_name)
+    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
+    wav_path = os.path.join(out_path, "wav", file_name + ".wav")
+    return file_name, wavq_path, mel_path, wav_gl_path, wav_path
+def format_data(data):
+    # setup input data
+    text_input = data["token_id"]
+    text_lengths = data["token_id_lengths"]
+    mel_input = data["mel"]
+    mel_lengths = data["mel_lengths"]
+    item_idx = data["item_idxs"]
+    d_vectors = data["d_vectors"]
+    speaker_ids = data["speaker_ids"]
+    attn_mask = data["attns"]
+    avg_text_length = torch.mean(text_lengths.float())
+    avg_spec_length = torch.mean(mel_lengths.float())
+    # dispatch data to GPU
+    if use_cuda:
+        text_input = text_input.cuda(non_blocking=True)
+        text_lengths = text_lengths.cuda(non_blocking=True)
+        mel_input = mel_input.cuda(non_blocking=True)
+        mel_lengths = mel_lengths.cuda(non_blocking=True)
+        if speaker_ids is not None:
+            speaker_ids = speaker_ids.cuda(non_blocking=True)
+        if d_vectors is not None:
+            d_vectors = d_vectors.cuda(non_blocking=True)
+        if attn_mask is not None:
+            attn_mask = attn_mask.cuda(non_blocking=True)
+    return (
+        text_input,
+        text_lengths,
+        mel_input,
+        mel_lengths,
+        speaker_ids,
+        d_vectors,
+        avg_text_length,
+        avg_spec_length,
+        attn_mask,
+        item_idx,
+    )
+@torch.no_grad()
+def inference(
+    model_name,
+    model,
+    ap,
+    text_input,
+    text_lengths,
+    mel_input,
+    mel_lengths,
+    speaker_ids=None,
+    d_vectors=None,
+):
+    if model_name == "glow_tts":
+        speaker_c = None
+        if speaker_ids is not None:
+            speaker_c = speaker_ids
+        elif d_vectors is not None:
+            speaker_c = d_vectors
+        outputs = model.inference_with_MAS(
+            text_input,
+            text_lengths,
+            mel_input,
+            mel_lengths,
+            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
+        )
+        model_output = outputs["model_outputs"]
+        model_output = model_output.detach().cpu().numpy()
+    elif "tacotron" in model_name:
+        aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
+        outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
+        postnet_outputs = outputs["model_outputs"]
+        # normalize tacotron output
+        if model_name == "tacotron":
+            mel_specs = []
+            postnet_outputs = postnet_outputs.data.cpu().numpy()
+            for b in range(postnet_outputs.shape[0]):
+                postnet_output = postnet_outputs[b]
+                mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
+            model_output = torch.stack(mel_specs).cpu().numpy()
+        elif model_name == "tacotron2":
+            model_output = postnet_outputs.detach().cpu().numpy()
+    return model_output
+def extract_spectrograms(
+    data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
+):
+    model.eval()
+    export_metadata = []
+    for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
+        # format data
+        (
+            text_input,
+            text_lengths,
+            mel_input,
+            mel_lengths,
+            speaker_ids,
+            d_vectors,
+            _,
+            _,
+            _,
+            item_idx,
+        ) = format_data(data)
+        model_output = inference(
+            c.model.lower(),
+            model,
+            ap,
+            text_input,
+            text_lengths,
+            mel_input,
+            mel_lengths,
+            speaker_ids,
+            d_vectors,
+        )
+        for idx in range(text_input.shape[0]):
+            wav_file_path = item_idx[idx]
+            wav = ap.load_wav(wav_file_path)
+            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
+            # quantize and save wav
+            if quantize_bits > 0:
+                wavq = quantize(wav, quantize_bits)
+                np.save(wavq_path, wavq)
+            # save TTS mel
+            mel = model_output[idx]
+            mel_length = mel_lengths[idx]
+            mel = mel[:mel_length, :].T
+            np.save(mel_path, mel)
+            export_metadata.append([wav_file_path, mel_path])
+            if save_audio:
+                ap.save_wav(wav, wav_path)
+            if debug:
+                print("Audio for debug saved at:", wav_gl_path)
+                wav = ap.inv_melspectrogram(mel)
+                ap.save_wav(wav, wav_gl_path)
+    with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
+        for data in export_metadata:
+            f.write(f"{data[0]}|{data[1]+'.npy'}\n")
+def main(args):  # pylint: disable=redefined-outer-name
+    # pylint: disable=global-variable-undefined
+    global meta_data, speaker_manager
+    # Audio processor
+    ap = AudioProcessor(**c.audio)
+    # load data instances
+    meta_data_train, meta_data_eval = load_tts_samples(
+        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+    )
+    # use eval and training partitions
+    meta_data = meta_data_train + meta_data_eval
+    # init speaker manager
+    if c.use_speaker_embedding:
+        speaker_manager = SpeakerManager(data_items=meta_data)
+    elif c.use_d_vector_file:
+        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
+    else:
+        speaker_manager = None
+    # setup model
+    model = setup_model(c)
+    # restore model
+    model.load_checkpoint(c, args.checkpoint_path, eval=True)
+    if use_cuda:
+        model.cuda()
+    num_params = count_parameters(model)
+    print("\n > Model has {} parameters".format(num_params), flush=True)
+    # set r
+    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
+    own_loader = setup_loader(ap, r)
+    extract_spectrograms(
+        own_loader,
+        model,
+        ap,
+        args.output_path,
+        quantize_bits=args.quantize_bits,
+        save_audio=args.save_audio,
+        debug=args.debug,
+        metada_name="metada.txt",
+    )
+if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
+    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
+    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
+    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
+    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
+    parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
+    parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
+    args = parser.parse_args()
+    c = load_config(args.config_path)
+    c.audio.trim_silence = False
+    main(args)

TTS/bin/find_unique_chars.py ADDED Viewed

	@@ -0,0 +1,40 @@

+"""Find all the unique characters in a dataset"""
+import argparse
+import logging
+from argparse import RawTextHelpFormatter
+from TTS.config import load_config
+from TTS.tts.datasets import find_unique_chars, load_tts_samples
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+def main():
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    # pylint: disable=bad-option-value
+    parser = argparse.ArgumentParser(
+        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
+        """
+    Example runs:
+    python TTS/bin/find_unique_chars.py --config_path config.json
+    """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
+    args = parser.parse_args()
+    c = load_config(args.config_path)
+    # load all datasets
+    train_items, eval_items = load_tts_samples(
+        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+    )
+    items = train_items + eval_items
+    find_unique_chars(items)
+if __name__ == "__main__":
+    main()

TTS/bin/find_unique_phonemes.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""Find all the unique phonemes in a dataset"""
+import argparse
+import logging
+import multiprocessing
+from argparse import RawTextHelpFormatter
+from tqdm.contrib.concurrent import process_map
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.text.phonemizers import Gruut
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+def compute_phonemes(item):
+    text = item["text"]
+    ph = phonemizer.phonemize(text).replace("|", "")
+    return set(ph)
+def main():
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    # pylint: disable=W0601
+    global c, phonemizer
+    # pylint: disable=bad-option-value
+    parser = argparse.ArgumentParser(
+        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
+        """
+    Example runs:
+    python TTS/bin/find_unique_phonemes.py --config_path config.json
+    """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
+    args = parser.parse_args()
+    c = load_config(args.config_path)
+    # load all datasets
+    train_items, eval_items = load_tts_samples(
+        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+    )
+    items = train_items + eval_items
+    print("Num items:", len(items))
+    language_list = [item["language"] for item in items]
+    is_lang_def = all(language_list)
+    if not c.phoneme_language or not is_lang_def:
+        raise ValueError("Phoneme language must be defined in config.")
+    if not language_list.count(language_list[0]) == len(language_list):
+        raise ValueError(
+            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
+        )
+    phonemizer = Gruut(language=language_list[0], keep_puncs=True)
+    phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
+    phones = []
+    for ph in phonemes:
+        phones.extend(ph)
+    phones = set(phones)
+    lower_phones = filter(lambda c: c.islower(), phones)
+    phones_force_lower = [c.lower() for c in phones]
+    phones_force_lower = set(phones_force_lower)
+    print(f" > Number of unique phonemes: {len(phones)}")
+    print(f" > Unique phonemes: {''.join(sorted(phones))}")
+    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
+    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
+if __name__ == "__main__":
+    main()

TTS/bin/remove_silence_using_vad.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import argparse
+import glob
+import logging
+import multiprocessing
+import os
+import pathlib
+import torch
+from tqdm import tqdm
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+from TTS.utils.vad import get_vad_model_and_utils, remove_silence
+torch.set_num_threads(1)
+def adjust_path_and_remove_silence(audio_path):
+    output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
+    # ignore if the file exists
+    if os.path.exists(output_path) and not args.force:
+        return output_path, False
+    # create all directory structure
+    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+    # remove the silence and save the audio
+    output_path, is_speech = remove_silence(
+        model_and_utils,
+        audio_path,
+        output_path,
+        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
+        use_cuda=args.use_cuda,
+    )
+    return output_path, is_speech
+def preprocess_audios():
+    files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
+    print("> Number of files: ", len(files))
+    if not args.force:
+        print("> Ignoring files that already exist in the output idrectory.")
+    if args.trim_just_beginning_and_end:
+        print("> Trimming just the beginning and the end with nonspeech parts.")
+    else:
+        print("> Trimming all nonspeech parts.")
+    filtered_files = []
+    if files:
+        # create threads
+        # num_threads = multiprocessing.cpu_count()
+        # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
+        if args.num_processes > 1:
+            with multiprocessing.Pool(processes=args.num_processes) as pool:
+                results = list(
+                    tqdm(
+                        pool.imap_unordered(adjust_path_and_remove_silence, files),
+                        total=len(files),
+                        desc="Processing audio files",
+                    )
+                )
+            for output_path, is_speech in results:
+                if not is_speech:
+                    filtered_files.append(output_path)
+        else:
+            for f in tqdm(files):
+                output_path, is_speech = adjust_path_and_remove_silence(f)
+                if not is_speech:
+                    filtered_files.append(output_path)
+        # write files that do not have speech
+        with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
+            for file in filtered_files:
+                f.write(str(file) + "\n")
+    else:
+        print("> No files Found !")
+if __name__ == "__main__":
+    # Script entry point: configure logging, parse the command line, load the VAD
+    # model and run the silence removal over the selected files.
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    parser = argparse.ArgumentParser(
+        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end"
+    )
+    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
+    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
+    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
+    parser.add_argument(
+        "-g",
+        "--glob",
+        type=str,
+        default="**/*.wav",
+        help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
+    )
+    # BooleanOptionalAction also generates the matching `--no-<option>` switch,
+    # which is how the `True` default below can be turned off.
+    parser.add_argument(
+        "-t",
+        "--trim_just_beginning_and_end",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed.",
+    )
+    parser.add_argument(
+        "-c",
+        "--use_cuda",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="If True use cuda",
+    )
+    parser.add_argument(
+        "--use_onnx",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="If True use onnx",
+    )
+    parser.add_argument(
+        "--num_processes",
+        type=int,
+        default=1,
+        help="Number of processes to use",
+    )
+    args = parser.parse_args()
+    # When no output dir is given, fall back to the input dir.
+    if args.output_dir == "":
+        args.output_dir = args.input_dir
+    # load the model and utils
+    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
+    # `preprocess_audios` takes no arguments: it reads the module-level `args`
+    # (and presumably `model_and_utils` through its helpers - TODO confirm).
+    preprocess_audios()

TTS/bin/resample.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import argparse
+import glob
+import os
+from argparse import RawTextHelpFormatter
+from multiprocessing import Pool
+from shutil import copytree
+import librosa
+import soundfile as sf
+from tqdm import tqdm
+def resample_file(func_args):
+    filename, output_sr = func_args
+    y, sr = librosa.load(filename, sr=output_sr)
+    sf.write(filename, y, sr)
+def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
+    if output_dir:
+        print("Recursively copying the input folder...")
+        copytree(input_dir, output_dir)
+        input_dir = output_dir
+    print("Resampling the audio files...")
+    audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
+    print(f"Found {len(audio_files)} files...")
+    audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
+    with Pool(processes=n_jobs) as p:
+        with tqdm(total=len(audio_files)) as pbar:
+            for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
+                pbar.update()
+    print("Done !")
+if __name__ == "__main__":
+    # Script entry point: parse the command line and forward everything to
+    # `resample_files`.
+    parser = argparse.ArgumentParser(
+        description="""Resample a folder recusively with librosa
+                       Can be used in place or create a copy of the folder as an output.\n\n
+                       Example run:
+                            python TTS/bin/resample.py
+                                --input_dir /root/LJSpeech-1.1/
+                                --output_sr 22050
+                                --output_dir /root/resampled_LJSpeech-1.1/
+                                --file_ext wav
+                                --n_jobs 24
+                    """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="Path of the folder containing the audio files to resample",
+    )
+    parser.add_argument(
+        "--output_sr",
+        type=int,
+        default=22050,
+        required=False,
+        help="Samlple rate to which the audio files should be resampled",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="Path of the destination folder. If not defined, the operation is done in place",
+    )
+    parser.add_argument(
+        "--file_ext",
+        type=str,
+        default="wav",
+        required=False,
+        help="Extension of the audio files to resample",
+    )
+    # NOTE: the value is forwarded to `multiprocessing.Pool(processes=...)`, so it
+    # counts worker processes; the `None` default lets the pool pick the count.
+    parser.add_argument(
+        "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
+    )
+    args = parser.parse_args()
+    resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)

TTS/bin/synthesize.py ADDED Viewed

	@@ -0,0 +1,486 @@

+#!/usr/bin/env python3
+"""Command line interface."""
+import argparse
+import contextlib
+import logging
+import sys
+from argparse import RawTextHelpFormatter
+# pylint: disable=redefined-outer-name, unused-argument
+from pathlib import Path
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+# Module-level logger used by `main` for all user-facing messages.
+logger = logging.getLogger(__name__)
+# Long usage text for the command line help. `parse_args` passes it to argparse
+# as the parser description after stripping the 4-space indented code fences.
+description = """
+Synthesize speech on command line.
+You can either use your trained model or choose a model from the provided list.
+If you don't specify any models, then it uses LJSpeech based English model.
+#### Single Speaker Models
+- List provided models:
+  ```
+  $ tts --list_models
+  ```
+- Get model info (for both tts_models and vocoder_models):
+  - Query by type/name:
+    The model_info_by_name uses the name as it from the --list_models.
+    ```
+    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
+    ```
+    For example:
+    ```
+    $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
+    $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
+    ```
+  - Query by type/idx:
+    The model_query_idx uses the corresponding idx from --list_models.
+    ```
+    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
+    ```
+    For example:
+    ```
+    $ tts --model_info_by_idx tts_models/3
+    ```
+  - Query info for model info by full name:
+    ```
+    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
+    ```
+- Run TTS with default models:
+  ```
+  $ tts --text "Text for TTS" --out_path output/path/speech.wav
+  ```
+- Run TTS and pipe out the generated TTS wav file data:
+  ```
+  $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+  ```
+- Run a TTS model with its default vocoder model:
+  ```
+  $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+  ```
+  For example:
+  ```
+  $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
+  ```
+- Run with specific TTS and vocoder models from the list:
+  ```
+  $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+  ```
+  For example:
+  ```
+  $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
+  ```
+- Run your own TTS model (Using Griffin-Lim Vocoder):
+  ```
+  $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+  ```
+- Run your own TTS and Vocoder models:
+  ```
+  $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+      --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
+  ```
+#### Multi-speaker Models
+- List the available speakers and choose a <speaker_id> among them:
+  ```
+  $ tts --model_name "<language>/<dataset>/<model_name>"  --list_speaker_idxs
+  ```
+- Run the multi-speaker TTS model with the target speaker ID:
+  ```
+  $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>"  --speaker_idx <speaker_id>
+  ```
+- Run your own multi-speaker TTS model:
+  ```
+  $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+  ```
+### Voice Conversion Models
+```
+$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
+```
+"""
+def parse_args() -> argparse.Namespace:
+    """Build the command line parser and parse ``sys.argv``.
+
+    If none of the options that select an action (text to speak, one of the
+    list/info queries, a reference wav or the voice conversion wavs) is given,
+    the help text is printed and the process exits.
+
+    Returns:
+        The parsed arguments.
+    """
+    parser = argparse.ArgumentParser(
+        # Drop the 4-space indented code fences from the module-level usage text.
+        description=description.replace("    ```\n", ""),
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--list_models",
+        action="store_true",
+        help="list available pre-trained TTS and vocoder models.",
+    )
+    parser.add_argument(
+        "--model_info_by_idx",
+        type=str,
+        default=None,
+        help="model info using query format: <model_type>/<model_query_idx>",
+    )
+    parser.add_argument(
+        "--model_info_by_name",
+        type=str,
+        default=None,
+        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
+    )
+    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
+    # Args for running pre-trained TTS models.
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        default="tts_models/en/ljspeech/tacotron2-DDC",
+        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
+    )
+    parser.add_argument(
+        "--vocoder_name",
+        type=str,
+        default=None,
+        help="Name of one of the pre-trained  vocoder models in format <language>/<dataset>/<model_name>",
+    )
+    # Args for running custom models
+    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default=None,
+        help="Path to model file.",
+    )
+    parser.add_argument(
+        "--out_path",
+        type=str,
+        default="tts_output.wav",
+        help="Output wav file path.",
+    )
+    parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.")
+    parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
+    parser.add_argument(
+        "--vocoder_path",
+        type=str,
+        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
+        default=None,
+    )
+    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
+    parser.add_argument(
+        "--encoder_path",
+        type=str,
+        help="Path to speaker encoder model file.",
+        default=None,
+    )
+    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
+    parser.add_argument(
+        "--pipe_out",
+        help="stdout the generated TTS wav file for shell pipe.",
+        action="store_true",
+    )
+    # args for multi-speaker synthesis
+    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
+    parser.add_argument(
+        "--speaker_idx",
+        type=str,
+        help="Target speaker ID for a multi-speaker TTS model.",
+        default=None,
+    )
+    parser.add_argument(
+        "--language_idx",
+        type=str,
+        help="Target language ID for a multi-lingual TTS model.",
+        default=None,
+    )
+    parser.add_argument(
+        "--speaker_wav",
+        nargs="+",
+        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
+        default=None,
+    )
+    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
+    parser.add_argument(
+        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
+    )
+    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
+    parser.add_argument(
+        "--list_speaker_idxs",
+        help="List available speaker ids for the defined multi-speaker model.",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--list_language_idxs",
+        help="List available language ids for the defined multi-lingual model.",
+        action="store_true",
+    )
+    # aux args
+    parser.add_argument(
+        "--save_spectogram",
+        action="store_true",
+        help="Save raw spectogram for further (vocoder) processing in out_path.",
+    )
+    parser.add_argument(
+        "--reference_wav",
+        type=str,
+        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
+        default=None,
+    )
+    parser.add_argument(
+        "--reference_speaker_idx",
+        type=str,
+        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
+        default=None,
+    )
+    # BooleanOptionalAction also provides `--no-progress_bar` to switch it off.
+    parser.add_argument(
+        "--progress_bar",
+        action=argparse.BooleanOptionalAction,
+        help="Show a progress bar for the model download.",
+        default=True,
+    )
+    # voice conversion args
+    parser.add_argument(
+        "--source_wav",
+        type=str,
+        default=None,
+        help="Original audio file to convert in the voice of the target_wav",
+    )
+    parser.add_argument(
+        "--target_wav",
+        type=str,
+        default=None,
+        help="Target audio file to convert in the voice of the source_wav",
+    )
+    parser.add_argument(
+        "--voice_dir",
+        type=str,
+        default=None,
+        help="Voice dir for tortoise model",
+    )
+    args = parser.parse_args()
+    # Show the help if none of the options that request an action was given.
+    check_args = [
+        args.text,
+        args.list_models,
+        args.list_speaker_idxs,
+        args.list_language_idxs,
+        args.reference_wav,
+        args.model_info_by_idx,
+        args.model_info_by_name,
+        args.source_wav,
+        args.target_wav,
+    ]
+    if not any(check_args):
+        # Parsing "-h" makes argparse print the help and exit the process.
+        parser.parse_args(["-h"])
+    return args
+def main():
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    args = parse_args()
+    pipe_out = sys.stdout if args.pipe_out else None
+    with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
+        # Late-import to make things load faster
+        from TTS.utils.manage import ModelManager
+        from TTS.utils.synthesizer import Synthesizer
+        # load model manager
+        path = Path(__file__).parent / "../.models.json"
+        manager = ModelManager(path, progress_bar=args.progress_bar)
+        tts_path = None
+        tts_config_path = None
+        speakers_file_path = None
+        language_ids_file_path = None
+        vocoder_path = None
+        vocoder_config_path = None
+        encoder_path = None
+        encoder_config_path = None
+        vc_path = None
+        vc_config_path = None
+        model_dir = None
+        # CASE1 #list : list pre-trained TTS models
+        if args.list_models:
+            manager.list_models()
+            sys.exit()
+        # CASE2 #info : model info for pre-trained TTS models
+        if args.model_info_by_idx:
+            model_query = args.model_info_by_idx
+            manager.model_info_by_idx(model_query)
+            sys.exit()
+        if args.model_info_by_name:
+            model_query_full_name = args.model_info_by_name
+            manager.model_info_by_full_name(model_query_full_name)
+            sys.exit()
+        # CASE3: load pre-trained model paths
+        if args.model_name is not None and not args.model_path:
+            model_path, config_path, model_item = manager.download_model(args.model_name)
+            # tts model
+            if model_item["model_type"] == "tts_models":
+                tts_path = model_path
+                tts_config_path = config_path
+                if args.vocoder_name is None and "default_vocoder" in model_item:
+                    args.vocoder_name = model_item["default_vocoder"]
+            # voice conversion model
+            if model_item["model_type"] == "voice_conversion_models":
+                vc_path = model_path
+                vc_config_path = config_path
+            # tts model with multiple files to be loaded from the directory path
+            if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
+                model_dir = model_path
+                tts_path = None
+                tts_config_path = None
+                args.vocoder_name = None
+        # load vocoder
+        if args.vocoder_name is not None and not args.vocoder_path:
+            vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
+        # CASE4: set custom model paths
+        if args.model_path is not None:
+            tts_path = args.model_path
+            tts_config_path = args.config_path
+            speakers_file_path = args.speakers_file_path
+            language_ids_file_path = args.language_ids_file_path
+        if args.vocoder_path is not None:
+            vocoder_path = args.vocoder_path
+            vocoder_config_path = args.vocoder_config_path
+        if args.encoder_path is not None:
+            encoder_path = args.encoder_path
+            encoder_config_path = args.encoder_config_path
+        device = args.device
+        if args.use_cuda:
+            device = "cuda"
+        # load models
+        synthesizer = Synthesizer(
+            tts_path,
+            tts_config_path,
+            speakers_file_path,
+            language_ids_file_path,
+            vocoder_path,
+            vocoder_config_path,
+            encoder_path,
+            encoder_config_path,
+            vc_path,
+            vc_config_path,
+            model_dir,
+            args.voice_dir,
+        ).to(device)
+        # query speaker ids of a multi-speaker model.
+        if args.list_speaker_idxs:
+            if synthesizer.tts_model.speaker_manager is None:
+                logger.info("Model only has a single speaker.")
+                return
+            logger.info(
+                "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
+            )
+            logger.info(synthesizer.tts_model.speaker_manager.name_to_id)
+            return
+        # query langauge ids of a multi-lingual model.
+        if args.list_language_idxs:
+            if synthesizer.tts_model.language_manager is None:
+                logger.info("Monolingual model.")
+                return
+            logger.info(
+                "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
+            )
+            logger.info(synthesizer.tts_model.language_manager.name_to_id)
+            return
+        # check the arguments against a multi-speaker model.
+        if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+            logger.error(
+                "Looks like you use a multi-speaker model. Define `--speaker_idx` to "
+                "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
+            )
+            return
+        # RUN THE SYNTHESIS
+        if args.text:
+            logger.info("Text: %s", args.text)
+        # kick it
+        if tts_path is not None:
+            wav = synthesizer.tts(
+                args.text,
+                speaker_name=args.speaker_idx,
+                language_name=args.language_idx,
+                speaker_wav=args.speaker_wav,
+                reference_wav=args.reference_wav,
+                style_wav=args.capacitron_style_wav,
+                style_text=args.capacitron_style_text,
+                reference_speaker_name=args.reference_speaker_idx,
+            )
+        elif vc_path is not None:
+            wav = synthesizer.voice_conversion(
+                source_wav=args.source_wav,
+                target_wav=args.target_wav,
+            )
+        elif model_dir is not None:
+            wav = synthesizer.tts(
+                args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
+            )
+        # save the results
+        synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
+        logger.info("Saved output to %s", args.out_path)
+if __name__ == "__main__":
+    # Run the command line interface only when executed as a script.
+    main()

TTS/bin/train_encoder.py ADDED Viewed

	@@ -0,0 +1,340 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import logging
+import os
+import sys
+import time
+import traceback
+import warnings
+import torch
+from torch.utils.data import DataLoader
+from trainer.generic_utils import count_parameters, remove_experiment_folder
+from trainer.io import copy_model_files, save_best_model, save_checkpoint
+from trainer.torch import NoamLR
+from trainer.trainer_utils import get_optimizer
+from TTS.encoder.dataset import EncoderDataset
+from TTS.encoder.utils.generic_utils import setup_encoder_model
+from TTS.encoder.utils.training import init_training
+from TTS.encoder.utils.visual import plot_embeddings
+from TTS.tts.datasets import load_tts_samples
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+from TTS.utils.samplers import PerfectBatchSampler
+from TTS.utils.training import check_update
+# Import-time set-up shared by every function in this script: cuDNN with
+# benchmark mode switched on and a fixed seed for the torch RNG.
+torch.backends.cudnn.enabled = True
+torch.backends.cudnn.benchmark = True
+torch.manual_seed(54321)
+# Module-level flags read by `evaluation`, `train` and `main` below.
+use_cuda = torch.cuda.is_available()
+num_gpus = torch.cuda.device_count()
+print(" > Using CUDA: ", use_cuda)
+print(" > Number of GPUs: ", num_gpus)
+def setup_loader(ap: AudioProcessor, is_val: bool = False):
+    num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
+    num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
+    dataset = EncoderDataset(
+        c,
+        ap,
+        meta_data_eval if is_val else meta_data_train,
+        voice_len=c.voice_len,
+        num_utter_per_class=num_utter_per_class,
+        num_classes_in_batch=num_classes_in_batch,
+        augmentation_config=c.audio_augmentation if not is_val else None,
+        use_torch_spec=c.model_params.get("use_torch_spec", False),
+    )
+    # get classes list
+    classes = dataset.get_class_list()
+    sampler = PerfectBatchSampler(
+        dataset.items,
+        classes,
+        batch_size=num_classes_in_batch * num_utter_per_class,  # total batch size
+        num_classes_in_batch=num_classes_in_batch,
+        num_gpus=1,
+        shuffle=not is_val,
+        drop_last=True,
+    )
+    if len(classes) < num_classes_in_batch:
+        if is_val:
+            raise RuntimeError(
+                f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
+            )
+        raise RuntimeError(
+            f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
+        )
+    # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal
+    if is_val:
+        dataset.set_classes(train_classes)
+    loader = DataLoader(
+        dataset,
+        num_workers=c.num_loader_workers,
+        batch_sampler=sampler,
+        collate_fn=dataset.collate_fn,
+    )
+    return loader, classes, dataset.get_map_classid_to_classname()
+def evaluation(model, criterion, data_loader, global_step):
+    eval_loss = 0
+    for _, data in enumerate(data_loader):
+        with torch.no_grad():
+            # setup input data
+            inputs, labels = data
+            # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
+            labels = torch.transpose(
+                labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
+            ).reshape(labels.shape)
+            inputs = torch.transpose(
+                inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
+            ).reshape(inputs.shape)
+            # dispatch data to GPU
+            if use_cuda:
+                inputs = inputs.cuda(non_blocking=True)
+                labels = labels.cuda(non_blocking=True)
+            # forward pass model
+            outputs = model(inputs)
+            # loss computation
+            loss = criterion(
+                outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
+            )
+            eval_loss += loss.item()
+    eval_avg_loss = eval_loss / len(data_loader)
+    # save stats
+    dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
+    try:
+        # plot the last batch in the evaluation
+        figures = {
+            "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
+        }
+        dashboard_logger.eval_figures(global_step, figures)
+    except ImportError:
+        warnings.warn("Install the `umap-learn` package to see embedding plots.")
+    return eval_avg_loss
+def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
+    """Run the training loop for ``c.epochs`` epochs.
+
+    Reads the module-level config ``c``, the ``use_cuda`` flag,
+    ``dashboard_logger`` and ``OUT_PATH``. A checkpoint is written every
+    ``c.save_step`` steps. When ``c.run_eval`` is set, the model is evaluated
+    after every epoch and the result is passed to ``save_best_model``.
+
+    Args:
+        model: Encoder model to train.
+        optimizer: Optimizer stepped once per batch.
+        scheduler: LR scheduler; only stepped when ``c.lr_decay`` is set.
+        criterion: Loss, called as ``criterion(embeddings, labels)``.
+        data_loader: Training loader yielding ``(inputs, labels)`` batches.
+        eval_data_loader: Loader handed to ``evaluation`` after each epoch.
+        global_step: Step counter to continue from.
+
+    Returns:
+        A tuple ``(best_loss, global_step)``.
+    """
+    model.train()
+    best_loss = {"train_loss": None, "eval_loss": float("inf")}
+    avg_loader_time = 0
+    end_time = time.time()
+    for epoch in range(c.epochs):
+        tot_loss = 0
+        epoch_time = 0
+        for _, data in enumerate(data_loader):
+            start_time = time.time()
+            # setup input data
+            inputs, labels = data
+            # group samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
+            labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
+                labels.shape
+            )
+            inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
+                inputs.shape
+            )
+            # ToDo: move it to a unit test
+            # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
+            # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
+            # idx = 0
+            # for j in range(0, c.num_classes_in_batch, 1):
+            #     for i in range(j, len(labels), c.num_classes_in_batch):
+            #         if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
+            #             print("Invalid")
+            #             print(labels)
+            #             exit()
+            #         idx += 1
+            # labels = labels_converted
+            # inputs = inputs_converted
+            # Time elapsed since the previous step finished, i.e. until this batch
+            # was fetched and regrouped.
+            loader_time = time.time() - end_time
+            global_step += 1
+            optimizer.zero_grad()
+            # dispatch data to GPU
+            if use_cuda:
+                inputs = inputs.cuda(non_blocking=True)
+                labels = labels.cuda(non_blocking=True)
+            # forward pass model
+            outputs = model(inputs)
+            # loss computation
+            loss = criterion(
+                outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
+            )
+            loss.backward()
+            # NOTE(review): presumably clips the gradients to `c.grad_clip` and
+            # returns their norm - confirm against `TTS.utils.training.check_update`.
+            grad_norm, _ = check_update(model, c.grad_clip)
+            optimizer.step()
+            # setup lr
+            if c.lr_decay:
+                scheduler.step()
+            step_time = time.time() - start_time
+            epoch_time += step_time
+            # accumulate the total epoch loss
+            tot_loss += loss.item()
+            # Averaged Loader Time
+            # Running average weighted by 1 / num_loader_workers; the very first
+            # measurement seeds the average.
+            num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
+            avg_loader_time = (
+                1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
+                if avg_loader_time != 0
+                else loader_time
+            )
+            current_lr = optimizer.param_groups[0]["lr"]
+            if global_step % c.steps_plot_stats == 0:
+                # Plot Training Epoch Stats
+                train_stats = {
+                    "loss": loss.item(),
+                    "lr": current_lr,
+                    "grad_norm": grad_norm,
+                    "step_time": step_time,
+                    "avg_loader_time": avg_loader_time,
+                }
+                dashboard_logger.train_epoch_stats(global_step, train_stats)
+                figures = {
+                    "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
+                }
+                dashboard_logger.train_figures(global_step, figures)
+            if global_step % c.print_step == 0:
+                print(
+                    "   | > Step:{}  Loss:{:.5f}  GradNorm:{:.5f}  "
+                    "StepTime:{:.2f}  LoaderTime:{:.2f}  AvGLoaderTime:{:.2f}  LR:{:.6f}".format(
+                        global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
+                    ),
+                    flush=True,
+                )
+            if global_step % c.save_step == 0:
+                # save model
+                save_checkpoint(
+                    c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
+                )
+            end_time = time.time()
+        # End-of-epoch summary; `grad_norm` is the value of the last step.
+        print("")
+        print(
+            ">>> Epoch:{}  AvgLoss: {:.5f} GradNorm:{:.5f}  "
+            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
+                epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
+            ),
+            flush=True,
+        )
+        # evaluation
+        if c.run_eval:
+            model.eval()
+            eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
+            print("\n\n")
+            print("--> EVAL PERFORMANCE")
+            print(
+                "   | > Epoch:{}  AvgLoss: {:.5f} ".format(epoch, eval_loss),
+                flush=True,
+            )
+            # save the best checkpoint
+            best_loss = save_best_model(
+                {"train_loss": None, "eval_loss": eval_loss},
+                best_loss,
+                c,
+                model,
+                optimizer,
+                None,
+                global_step,
+                epoch,
+                OUT_PATH,
+                criterion=criterion.state_dict(),
+            )
+            # Switch back to training mode for the next epoch.
+            model.train()
+    return best_loss, global_step
+def main(args):  # pylint: disable=redefined-outer-name
+    """Set up the model, data loaders, loss and scheduler, then start training.
+
+    Reads the module-level config ``c``, ``OUT_PATH`` and ``use_cuda``, and
+    publishes ``meta_data_train``, ``meta_data_eval`` and ``train_classes`` as
+    module globals because ``setup_loader`` reads them from there.
+
+    Args:
+        args: Parsed training arguments; ``restore_path`` is read and
+            ``restore_step`` is set by this function.
+    """
+    # pylint: disable=global-variable-undefined
+    global meta_data_train
+    global meta_data_eval
+    global train_classes
+    ap = AudioProcessor(**c.audio)
+    model = setup_encoder_model(c)
+    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
+    # pylint: disable=redefined-outer-name
+    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
+    # The training loader must be built first: the evaluation loader relies on
+    # the `train_classes` global assigned here.
+    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False)
+    if c.run_eval:
+        eval_data_loader, _, _ = setup_loader(ap, is_val=True)
+    else:
+        eval_data_loader = None
+    num_classes = len(train_classes)
+    criterion = model.get_criterion(c, num_classes)
+    # Store the class id -> class name mapping in the config before it is copied
+    # to the output folder.
+    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
+        c.map_classid_to_classname = map_classid_to_classname
+        copy_model_files(c, OUT_PATH, new_fields={})
+    if args.restore_path:
+        criterion, args.restore_step = model.load_checkpoint(
+            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
+        )
+        print(" > Model restored from step %d" % args.restore_step, flush=True)
+    else:
+        args.restore_step = 0
+    if c.lr_decay:
+        # Resume the schedule from the restored step.
+        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
+    else:
+        scheduler = None
+    num_params = count_parameters(model)
+    print("\n > Model has {} parameters".format(num_params), flush=True)
+    if use_cuda:
+        model = model.cuda()
+        criterion.cuda()
+    global_step = args.restore_step
+    _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
+# Script entry point: initialise the training session, run `main`, and remove the experiment folder
+# when the run is interrupted or fails.
+if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    # these names become module-level globals that `main` reads
+    args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
+    try:
+        main(args)
+    except KeyboardInterrupt:
+        remove_experiment_folder(OUT_PATH)
+        # `sys.exit` raises SystemExit; it is caught so that `os._exit` ends the process immediately
+        try:
+            sys.exit(0)
+        except SystemExit:
+            os._exit(0)  # pylint: disable=protected-access
+    except Exception:  # pylint: disable=broad-except
+        # any other failure: clean up, print the traceback and exit with a non-zero status
+        remove_experiment_folder(OUT_PATH)
+        traceback.print_exc()
+        sys.exit(1)

TTS/bin/train_tts.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import logging
+import os
+from dataclasses import dataclass, field
+from trainer import Trainer, TrainerArgs
+from TTS.config import load_config, register_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models import setup_model
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+@dataclass
+class TrainTTSArgs(TrainerArgs):
+    """`TrainerArgs` extended with the path of the model config file."""
+    # path given on the command line as `--config_path`; left as None when not provided
+    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
+def main():
+    """Run `tts` model training directly by a `config.json` file."""
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    # init trainer args
+    train_args = TrainTTSArgs()
+    parser = train_args.init_argparse(arg_prefix="")
+    # override trainer args from comman-line args
+    args, config_overrides = parser.parse_known_args()
+    train_args.parse_args(args)
+    # load config.json and register
+    if args.config_path or args.continue_path:
+        if args.config_path:
+            # init from a file
+            config = load_config(args.config_path)
+            if len(config_overrides) > 0:
+                config.parse_known_args(config_overrides, relaxed_parser=True)
+        elif args.continue_path:
+            # continue from a prev experiment
+            config = load_config(os.path.join(args.continue_path, "config.json"))
+            if len(config_overrides) > 0:
+                config.parse_known_args(config_overrides, relaxed_parser=True)
+        else:
+            # init from console args
+            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel
+            config_base = BaseTrainingConfig()
+            config_base.parse_known_args(config_overrides)
+            config = register_config(config_base.model)()
+    # load training samples
+    train_samples, eval_samples = load_tts_samples(
+        config.datasets,
+        eval_split=True,
+        eval_split_max_size=config.eval_split_max_size,
+        eval_split_size=config.eval_split_size,
+    )
+    # init the model from config
+    model = setup_model(config, train_samples + eval_samples)
+    # init the trainer and 🚀
+    trainer = Trainer(
+        train_args,
+        model.config,
+        config.output_path,
+        model=model,
+        train_samples=train_samples,
+        eval_samples=eval_samples,
+        parse_command_line_args=False,
+    )
+    trainer.fit()
+if __name__ == "__main__":
+    main()

TTS/bin/train_vocoder.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import logging
+import os
+from dataclasses import dataclass, field
+from trainer import Trainer, TrainerArgs
+from TTS.config import load_config, register_config
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
+from TTS.vocoder.models import setup_model
+@dataclass
+class TrainVocoderArgs(TrainerArgs):
+    """`TrainerArgs` extended with the path of the vocoder config file."""
+    # path given on the command line as `--config_path`; left as None when not provided
+    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
+def main():
+    """Run `tts` model training directly by a `config.json` file."""
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    # init trainer args
+    train_args = TrainVocoderArgs()
+    parser = train_args.init_argparse(arg_prefix="")
+    # override trainer args from comman-line args
+    args, config_overrides = parser.parse_known_args()
+    train_args.parse_args(args)
+    # load config.json and register
+    if args.config_path or args.continue_path:
+        if args.config_path:
+            # init from a file
+            config = load_config(args.config_path)
+            if len(config_overrides) > 0:
+                config.parse_known_args(config_overrides, relaxed_parser=True)
+        elif args.continue_path:
+            # continue from a prev experiment
+            config = load_config(os.path.join(args.continue_path, "config.json"))
+            if len(config_overrides) > 0:
+                config.parse_known_args(config_overrides, relaxed_parser=True)
+        else:
+            # init from console args
+            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel
+            config_base = BaseTrainingConfig()
+            config_base.parse_known_args(config_overrides)
+            config = register_config(config_base.model)()
+    # load training samples
+    if "feature_path" in config and config.feature_path:
+        # load pre-computed features
+        print(f" > Loading features from: {config.feature_path}")
+        eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
+    else:
+        # load data raw wav files
+        eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
+    # setup audio processor
+    ap = AudioProcessor(**config.audio)
+    # init the model from config
+    model = setup_model(config)
+    # init the trainer and 🚀
+    trainer = Trainer(
+        train_args,
+        config,
+        config.output_path,
+        model=model,
+        train_samples=train_samples,
+        eval_samples=eval_samples,
+        training_assets={"audio_processor": ap},
+        parse_command_line_args=False,
+    )
+    trainer.fit()
+if __name__ == "__main__":
+    main()

TTS/bin/tune_wavegrad.py ADDED Viewed

	@@ -0,0 +1,107 @@

+"""Search a good noise schedule for WaveGrad for a given number of inference iterations"""
+import argparse
+import logging
+from itertools import product as cartesian_product
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from TTS.config import load_config
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+from TTS.vocoder.datasets.preprocess import load_wav_data
+from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
+from TTS.vocoder.models import setup_model
+# Script entry point: exhaustively try noise schedules built from a few random base values and keep the
+# one with the lowest mel-spectrogram reconstruction error.
+if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
+    parser.add_argument("--config_path", type=str, help="Path to model config file.")
+    parser.add_argument("--data_path", type=str, help="Path to data directory.")
+    parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
+    parser.add_argument(
+        "--num_iter",
+        type=int,
+        help="Number of model inference iterations that you like to optimize noise schedule for.",
+    )
+    parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
+    parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
+    parser.add_argument(
+        "--search_depth",
+        type=int,
+        default=3,
+        help="Search granularity. Increasing this increases the run-time exponentially.",
+    )
+    # load config
+    args = parser.parse_args()
+    config = load_config(args.config_path)
+    # setup audio processor
+    ap = AudioProcessor(**config.audio)
+    # load dataset; an evaluation split size of 0 is requested and only the second returned list is used
+    _, train_data = load_wav_data(args.data_path, 0)
+    train_data = train_data[: args.num_samples]
+    dataset = WaveGradDataset(
+        ap=ap,
+        items=train_data,
+        seq_len=-1,
+        hop_len=ap.hop_length,
+        pad_short=config.pad_short,
+        conv_pad=config.conv_pad,
+        is_training=True,
+        return_segments=False,
+        use_noise_augment=False,
+        use_cache=False,
+    )
+    loader = DataLoader(
+        dataset,
+        batch_size=1,
+        shuffle=False,
+        collate_fn=dataset.collate_full_clips,
+        drop_last=False,
+        num_workers=config.num_loader_workers,
+        pin_memory=False,
+    )
+    # setup the model
+    # NOTE(review): `--model_path` is parsed above but never read in this script; the model returned by
+    # `setup_model` is used as-is — confirm whether a checkpoint load is missing here
+    model = setup_model(config)
+    if args.use_cuda:
+        model.cuda()
+    # setup optimization parameters
+    # `search_depth` random base values in [0, 10); they are not seeded, so each run searches a different grid
+    base_values = sorted(10 * np.random.uniform(size=args.search_depth))
+    print(f" > base values: {base_values}")
+    # one multiplier per inference iteration, log-spaced from 1e-6 to 1e-1
+    exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
+    best_error = float("inf")
+    best_schedule = None  # pylint: disable=C0103
+    # every assignment of a base value to each iteration is tried: search_depth ** num_iter candidates
+    total_search_iter = len(base_values) ** args.num_iter
+    for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
+        beta = exponents * base
+        model.compute_noise_level(beta)
+        for data in loader:
+            mel, audio = data
+            y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
+            if args.use_cuda:
+                y_hat = y_hat.cpu()
+            y_hat = y_hat.numpy()
+            # recompute the mel-spectrogram of each generated clip, dropping its last frame
+            # NOTE(review): presumably the last frame is dropped to match the length of `mel` — confirm
+            mel_hat = []
+            for i in range(y_hat.shape[0]):
+                m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
+                mel_hat.append(torch.from_numpy(m))
+            mel_hat = torch.stack(mel_hat)
+            # `torch.sum` already yields a scalar, so this is the summed squared error and `.mean()` is a no-op
+            mse = torch.sum((mel - mel_hat) ** 2).mean()
+            # the comparison is made per clip; errors are not aggregated over the loader
+            if mse.item() < best_error:
+                best_error = mse.item()
+                best_schedule = {"beta": beta}
+                print(f" > Found a better schedule. - MSE: {mse.item()}")
+                # the output file is overwritten every time a better schedule is found
+                np.save(args.output_path, best_schedule)

TTS/config/__init__.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import json
+import os
+import re
+from typing import Dict
+import fsspec
+import yaml
+from coqpit import Coqpit
+from TTS.config.shared_configs import *
+from TTS.utils.generic_utils import find_module
+def read_json_with_comments(json_path):
+    """for backward compat."""
+    # fallback to json
+    with fsspec.open(json_path, "r", encoding="utf-8") as f:
+        input_str = f.read()
+    # handle comments but not urls with //
+    input_str = re.sub(
+        r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str
+    )
+    return json.loads(input_str)
+def register_config(model_name: str) -> Coqpit:
+    """Find the right config for the given model name.
+    Args:
+        model_name (str): Model name.
+    Raises:
+        ModuleNotFoundError: No matching config for the model name.
+    Returns:
+        Coqpit: config class.
+    """
+    config_class = None
+    config_name = model_name + "_config"
+    # TODO: fix this
+    if model_name == "xtts":
+        from TTS.tts.configs.xtts_config import XttsConfig
+        config_class = XttsConfig
+    paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
+    for path in paths:
+        try:
+            config_class = find_module(path, config_name)
+        except ModuleNotFoundError:
+            pass
+    if config_class is None:
+        raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
+    return config_class
+def _process_model_name(config_dict: Dict) -> str:
+    """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
+    Args:
+        config_dict (Dict): A dictionary including the config fields.
+    Returns:
+        str: Formatted modelname.
+    """
+    model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
+    model_name = model_name.replace("_generator", "").replace("_discriminator", "")
+    return model_name
+def load_config(config_path: str) -> Coqpit:
+    """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
+    to find the corresponding Config class. Then initialize the Config.
+    Args:
+        config_path (str): path to the config file.
+    Raises:
+        TypeError: given config file has an unknown type.
+    Returns:
+        Coqpit: TTS config object.
+    """
+    config_dict = {}
+    ext = os.path.splitext(config_path)[1]
+    if ext in (".yml", ".yaml"):
+        with fsspec.open(config_path, "r", encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+    elif ext == ".json":
+        try:
+            with fsspec.open(config_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+        except json.decoder.JSONDecodeError:
+            # backwards compat.
+            data = read_json_with_comments(config_path)
+    else:
+        raise TypeError(f" [!] Unknown config file type {ext}")
+    config_dict.update(data)
+    model_name = _process_model_name(config_dict)
+    config_class = register_config(model_name.lower())
+    config = config_class()
+    config.from_dict(config_dict)
+    return config
+def check_config_and_model_args(config, arg_name, value):
+    """Check the give argument in `config.model_args` if exist or in `config` for
+    the given value.
+    Return False if the argument does not exist in `config.model_args` or `config`.
+    This is to patch up the compatibility between models with and without `model_args`.
+    TODO: Remove this in the future with a unified approach.
+    """
+    if hasattr(config, "model_args"):
+        if arg_name in config.model_args:
+            return config.model_args[arg_name] == value
+    if hasattr(config, arg_name):
+        return config[arg_name] == value
+    return False
+def get_from_config_or_model_args(config, arg_name):
+    """Get the given argument from `config.model_args` if exist or in `config`."""
+    if hasattr(config, "model_args"):
+        if arg_name in config.model_args:
+            return config.model_args[arg_name]
+    return config[arg_name]
+def get_from_config_or_model_args_with_default(config, arg_name, def_val):
+    """Get the given argument from `config.model_args` if exist or in `config`."""
+    if hasattr(config, "model_args"):
+        if arg_name in config.model_args:
+            return config.model_args[arg_name]
+    if hasattr(config, arg_name):
+        return config[arg_name]
+    return def_val

TTS/config/shared_configs.py ADDED Viewed

	@@ -0,0 +1,268 @@

+from dataclasses import asdict, dataclass
+from typing import List
+from coqpit import Coqpit, check_argument
+from trainer import TrainerConfig
+@dataclass
+class BaseAudioConfig(Coqpit):
+    """Base config to define audio processing parameters. It is used to initialize
+    ```TTS.utils.audio.AudioProcessor.```
+    Args:
+        fft_size (int):
+            Number of STFT frequency levels aka. size of the linear spectrogram frame. Defaults to 1024.
+        win_length (int):
+            Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
+            ```fft_size```. Defaults to 1024.
+        hop_length (int):
+            Number of audio samples between adjacent STFT columns. Defaults to 256.
+        frame_shift_ms (int):
+            Set ```hop_length``` based on milliseconds and sampling rate. Defaults to None.
+        frame_length_ms (int):
+            Set ```win_length``` based on milliseconds and sampling rate. Defaults to None.
+        stft_pad_mode (str):
+            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
+        sample_rate (int):
+            Audio sampling rate. Defaults to 22050.
+        resample (bool):
+            Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
+        preemphasis (float):
+            Preemphasis coefficient. Defaults to 0.0.
+        ref_level_db (int):
+            Reference dB level to rebase the audio signal and ignore the level below. 20 dB is assumed the sound of air.
+            Defaults to 20.
+        do_sound_norm (bool):
+            Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
+        log_func (str):
+            Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
+        do_trim_silence (bool):
+            Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
+        do_amp_to_db_linear (bool, optional):
+            enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
+        do_amp_to_db_mel (bool, optional):
+            enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
+        pitch_fmax (float, optional):
+            Maximum frequency of the F0 frames. Defaults to ```640```.
+        pitch_fmin (float, optional):
+            Minimum frequency of the F0 frames. Defaults to ```1```.
+        trim_db (int):
+            Silence threshold used for silence trimming. Defaults to 45.
+        do_rms_norm (bool, optional):
+            enable/disable RMS volume normalization when loading an audio file. Defaults to False.
+        db_level (float, optional):
+            dB level used for rms normalization. The range is -99 to 0. Defaults to None.
+        power (float):
+            Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
+            artifacts in the synthesized voice. Defaults to 1.5.
+        griffin_lim_iters (int):
+            Number of Griffin Lim iterations. Defaults to 60.
+        num_mels (int):
+            Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.
+        mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
+            It needs to be adjusted for a dataset. Defaults to 0.
+        mel_fmax (float):
+            Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
+            Defaults to None.
+        spec_gain (int):
+            Gain applied when converting amplitude to DB. Defaults to 20.
+        signal_norm (bool):
+            enable/disable signal normalization. Defaults to True.
+        min_level_db (int):
+            minimum db threshold for the computed melspectrograms. Defaults to -100.
+        symmetric_norm (bool):
+            enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
+            [0, k], Defaults to True.
+        max_norm (float):
+            ```k``` defining the normalization range. Defaults to 4.0.
+        clip_norm (bool):
+            enable/disable clipping the out of range values in the normalized audio signal. Defaults to True.
+        stats_path (str):
+            Path to the computed stats file. Defaults to None.
+    """
+    # stft parameters
+    fft_size: int = 1024
+    win_length: int = 1024
+    hop_length: int = 256
+    frame_shift_ms: int = None
+    frame_length_ms: int = None
+    stft_pad_mode: str = "reflect"
+    # audio processing parameters
+    sample_rate: int = 22050
+    resample: bool = False
+    preemphasis: float = 0.0
+    ref_level_db: int = 20
+    do_sound_norm: bool = False
+    log_func: str = "np.log10"
+    # silence trimming
+    do_trim_silence: bool = True
+    trim_db: int = 45
+    # rms volume normalization
+    do_rms_norm: bool = False
+    db_level: float = None
+    # griffin-lim params
+    power: float = 1.5
+    griffin_lim_iters: int = 60
+    # mel-spec params
+    num_mels: int = 80
+    mel_fmin: float = 0.0
+    mel_fmax: float = None
+    spec_gain: int = 20
+    do_amp_to_db_linear: bool = True
+    do_amp_to_db_mel: bool = True
+    # f0 params
+    pitch_fmax: float = 640.0
+    pitch_fmin: float = 1.0
+    # normalization params
+    signal_norm: bool = True
+    min_level_db: int = -100
+    symmetric_norm: bool = True
+    max_norm: float = 4.0
+    clip_norm: bool = True
+    stats_path: str = None
+    def check_values(
+        self,
+    ):
+        """Check config fields against their allowed ranges with `check_argument`."""
+        c = asdict(self)
+        check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
+        check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
+        check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
+        # the millisecond-based fields name their sample-based counterpart as `alternative`
+        check_argument(
+            "frame_length_ms",
+            c,
+            restricted=True,
+            min_val=10,
+            max_val=1000,
+            alternative="win_length",
+        )
+        check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
+        check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
+        check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
+        check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
+        check_argument("power", c, restricted=True, min_val=1, max_val=5)
+        check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
+        # normalization parameters
+        check_argument("signal_norm", c, restricted=True)
+        check_argument("symmetric_norm", c, restricted=True)
+        check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
+        check_argument("clip_norm", c, restricted=True)
+        check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
+        check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
+        check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
+        check_argument("do_trim_silence", c, restricted=True)
+        check_argument("trim_db", c, restricted=True)
+@dataclass
+class BaseDatasetConfig(Coqpit):
+    """Base config for TTS datasets.
+    Args:
+        formatter (str):
+            Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
+        dataset_name (str):
+            Unique name for the dataset. Defaults to `""`.
+        path (str):
+            Root path to the dataset files. Defaults to `""`.
+        meta_file_train (str):
+            Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
+            Defaults to `""`.
+        ignored_speakers (List):
+            List of speaker IDs that are not used in training. Defaults to None.
+        language (str):
+            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
+        phonemizer (str):
+            Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.
+        meta_file_val (str):
+            Name of the dataset meta file that defines the instances used at validation. Defaults to `""`.
+        meta_file_attn_mask (str):
+            Path to the file that lists the attention mask files used with models that require attention masks to
+            train the duration predictor. Defaults to `""`.
+    """
+    formatter: str = ""
+    dataset_name: str = ""
+    path: str = ""
+    meta_file_train: str = ""
+    ignored_speakers: List[str] = None
+    language: str = ""
+    phonemizer: str = ""
+    meta_file_val: str = ""
+    meta_file_attn_mask: str = ""
+    def check_values(
+        self,
+    ):
+        """Check config fields with `check_argument`; the validation and attention-mask files are not restricted."""
+        c = asdict(self)
+        check_argument("formatter", c, restricted=True)
+        check_argument("path", c, restricted=True)
+        check_argument("meta_file_train", c, restricted=True)
+        check_argument("meta_file_val", c, restricted=False)
+        check_argument("meta_file_attn_mask", c, restricted=False)
+@dataclass
+class BaseTrainingConfig(TrainerConfig):
+    """Base config to define the basic 🐸TTS training parameters that are shared
+    among all the models. It extends ```trainer.TrainerConfig```.
+    Args:
+        model (str):
+            Name of the model that is used in the training. Defaults to None.
+        num_loader_workers (int):
+            Number of workers for training time dataloader. Defaults to 0.
+        num_eval_loader_workers (int):
+            Number of workers for evaluation time dataloader. Defaults to 0.
+        use_noise_augment (bool):
+            Defaults to False. NOTE(review): presumably toggles noise augmentation of the training audio;
+            confirm against the dataset code.
+    """
+    model: str = None
+    # dataloading
+    num_loader_workers: int = 0
+    num_eval_loader_workers: int = 0
+    use_noise_augment: bool = False

TTS/demos/xtts_ft_demo/requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ faster_whisper==0.9.0
2	+ gradio==4.7.1

TTS/demos/xtts_ft_demo/utils/formatter.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import gc
+import os
+import pandas
+import torch
+import torchaudio
+from faster_whisper import WhisperModel
+from tqdm import tqdm
+# torch.set_num_threads(1)
+from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
+torch.set_num_threads(16)
+audio_types = (".wav", ".mp3", ".flac")
+def list_audios(basePath, contains=None):
+    # return the set of files that are valid
+    return list_files(basePath, validExts=audio_types, contains=contains)
+def list_files(basePath, validExts=None, contains=None):
+    # loop over the directory structure
+    for rootDir, dirNames, filenames in os.walk(basePath):
+        # loop over the filenames in the current directory
+        for filename in filenames:
+            # if the contains string is not none and the filename does not contain
+            # the supplied string, then ignore the file
+            if contains is not None and filename.find(contains) == -1:
+                continue
+            # determine the file extension of the current file
+            ext = filename[filename.rfind(".") :].lower()
+            # check to see if the file is an audio and should be processed
+            if validExts is None or ext.endswith(validExts):
+                # construct the path to the audio and yield it
+                audioPath = os.path.join(rootDir, filename)
+                yield audioPath
+def format_audio_list(
+    audio_files,
+    target_language="en",
+    out_path=None,
+    buffer=0.2,
+    eval_percentage=0.15,
+    speaker_name="coqui",
+    gradio_progress=None,
+):
+    audio_total_size = 0
+    # make sure that ooutput file exists
+    os.makedirs(out_path, exist_ok=True)
+    # Loading Whisper
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print("Loading Whisper Model!")
+    asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
+    metadata = {"audio_file": [], "text": [], "speaker_name": []}
+    if gradio_progress is not None:
+        tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
+    else:
+        tqdm_object = tqdm(audio_files)
+    for audio_path in tqdm_object:
+        wav, sr = torchaudio.load(audio_path)
+        # stereo to mono if needed
+        if wav.size(0) != 1:
+            wav = torch.mean(wav, dim=0, keepdim=True)
+        wav = wav.squeeze()
+        audio_total_size += wav.size(-1) / sr
+        segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
+        segments = list(segments)
+        i = 0
+        sentence = ""
+        sentence_start = None
+        first_word = True
+        # added all segments words in a unique list
+        words_list = []
+        for _, segment in enumerate(segments):
+            words = list(segment.words)
+            words_list.extend(words)
+        # process each word
+        for word_idx, word in enumerate(words_list):
+            if first_word:
+                sentence_start = word.start
+                # If it is the first sentence, add buffer or get the begining of the file
+                if word_idx == 0:
+                    sentence_start = max(sentence_start - buffer, 0)  # Add buffer to the sentence start
+                else:
+                    # get previous sentence end
+                    previous_word_end = words_list[word_idx - 1].end
+                    # add buffer or get the silence midle between the previous sentence and the current one
+                    sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start) / 2)
+                sentence = word.word
+                first_word = False
+            else:
+                sentence += word.word
+            if word.word[-1] in ["!", ".", "?"]:
+                sentence = sentence[1:]
+                # Expand number and abbreviations plus normalization
+                sentence = multilingual_cleaners(sentence, target_language)
+                audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
+                audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
+                # Check for the next word's existence
+                if word_idx + 1 < len(words_list):
+                    next_word_start = words_list[word_idx + 1].start
+                else:
+                    # If don't have more words it means that it is the last sentence then use the audio len as next word start
+                    next_word_start = (wav.shape[0] - 1) / sr
+                # Average the current word end and next word start
+                word_end = min((word.end + next_word_start) / 2, word.end + buffer)
+                absoulte_path = os.path.join(out_path, audio_file)
+                os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
+                i += 1
+                first_word = True
+                audio = wav[int(sr * sentence_start) : int(sr * word_end)].unsqueeze(0)
+                # if the audio is too short ignore it (i.e < 0.33 seconds)
+                if audio.size(-1) >= sr / 3:
+                    torchaudio.save(absoulte_path, audio, sr)
+                else:
+                    continue
+                metadata["audio_file"].append(audio_file)
+                metadata["text"].append(sentence)
+                metadata["speaker_name"].append(speaker_name)
+    df = pandas.DataFrame(metadata)
+    df = df.sample(frac=1)
+    num_val_samples = int(len(df) * eval_percentage)
+    df_eval = df[:num_val_samples]
+    df_train = df[num_val_samples:]
+    df_train = df_train.sort_values("audio_file")
+    train_metadata_path = os.path.join(out_path, "metadata_train.csv")
+    df_train.to_csv(train_metadata_path, sep="|", index=False)
+    eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
+    df_eval = df_eval.sort_values("audio_file")
+    df_eval.to_csv(eval_metadata_path, sep="|", index=False)
+    # deallocate VRAM and RAM
+    del asr_model, df_train, df_eval, df, metadata
+    gc.collect()
+    return train_metadata_path, eval_metadata_path, audio_total_size

TTS/demos/xtts_ft_demo/utils/gpt_train.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import gc
+import os
+from trainer import Trainer, TrainerArgs
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+from TTS.utils.manage import ModelManager
+def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
+    #  Logging parameters
+    RUN_NAME = "GPT_XTTS_FT"
+    PROJECT_NAME = "XTTS_trainer"
+    DASHBOARD_LOGGER = "tensorboard"
+    LOGGER_URI = None
+    # Set here the path that the checkpoints will be saved. Default: ./run/training/
+    OUT_PATH = os.path.join(output_path, "run", "training")
+    # Training Parameters
+    OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
+    START_WITH_EVAL = False  # if True it will star with evaluation
+    BATCH_SIZE = batch_size  # set here the batch size
+    GRAD_ACUMM_STEPS = grad_acumm  # set here the grad accumulation steps
+    # Define here the dataset that you want to use for the fine-tuning on.
+    config_dataset = BaseDatasetConfig(
+        formatter="coqui",
+        dataset_name="ft_dataset",
+        path=os.path.dirname(train_csv),
+        meta_file_train=train_csv,
+        meta_file_val=eval_csv,
+        language=language,
+    )
+    # Add here the configs of the datasets
+    DATASETS_CONFIG_LIST = [config_dataset]
+    # Define the path where XTTS v2.0.1 files will be downloaded
+    CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
+    os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+    # DVAE files
+    DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+    MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+    # Set the path to the downloaded files
+    DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+    MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+    # download DVAE files if needed
+    if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+        print(" > Downloading DVAE files!")
+        ModelManager._download_model_files(
+            [MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+        )
+    # Download XTTS v2.0 checkpoint if needed
+    TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+    XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+    XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
+    # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
+    TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
+    XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
+    XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK))  # config.json file
+    # download XTTS v2.0 files if needed
+    if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+        print(" > Downloading XTTS v2.0 files!")
+        ModelManager._download_model_files(
+            [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+        )
+    # init args and config
+    model_args = GPTArgs(
+        max_conditioning_length=132300,  # 6 secs
+        min_conditioning_length=66150,  # 3 secs
+        debug_loading_failures=False,
+        max_wav_length=max_audio_length,  # ~11.6 seconds
+        max_text_length=200,
+        mel_norm_file=MEL_NORM_FILE,
+        dvae_checkpoint=DVAE_CHECKPOINT,
+        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+        tokenizer_file=TOKENIZER_FILE,
+        gpt_num_audio_tokens=1026,
+        gpt_start_audio_token=1024,
+        gpt_stop_audio_token=1025,
+        gpt_use_masking_gt_prompt_approach=True,
+        gpt_use_perceiver_resampler=True,
+    )
+    # define audio config
+    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+    # training parameters config
+    config = GPTTrainerConfig(
+        epochs=num_epochs,
+        output_path=OUT_PATH,
+        model_args=model_args,
+        run_name=RUN_NAME,
+        project_name=PROJECT_NAME,
+        run_description="""
+            GPT XTTS training
+            """,
+        dashboard_logger=DASHBOARD_LOGGER,
+        logger_uri=LOGGER_URI,
+        audio=audio_config,
+        batch_size=BATCH_SIZE,
+        batch_group_size=48,
+        eval_batch_size=BATCH_SIZE,
+        num_loader_workers=8,
+        eval_split_max_size=256,
+        print_step=50,
+        plot_step=100,
+        log_model_step=100,
+        save_step=1000,
+        save_n_checkpoints=1,
+        save_checkpoints=True,
+        # target_loss="loss",
+        print_eval=False,
+        # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
+        optimizer="AdamW",
+        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+        lr=5e-06,  # learning rate
+        lr_scheduler="MultiStepLR",
+        # it was adjusted accordly for the new step scheme
+        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+        test_sentences=[],
+    )
+    # init the model from config
+    model = GPTTrainer.init_from_config(config)
+    # load training samples
+    train_samples, eval_samples = load_tts_samples(
+        DATASETS_CONFIG_LIST,
+        eval_split=True,
+        eval_split_max_size=config.eval_split_max_size,
+        eval_split_size=config.eval_split_size,
+    )
+    # init the trainer and 🚀
+    trainer = Trainer(
+        TrainerArgs(
+            restore_path=None,  # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter
+            skip_train_epoch=False,
+            start_with_eval=START_WITH_EVAL,
+            grad_accum_steps=GRAD_ACUMM_STEPS,
+        ),
+        config,
+        output_path=OUT_PATH,
+        model=model,
+        train_samples=train_samples,
+        eval_samples=eval_samples,
+    )
+    trainer.fit()
+    # get the longest text audio file to use as speaker reference
+    samples_len = [len(item["text"].split(" ")) for item in train_samples]
+    longest_text_idx = samples_len.index(max(samples_len))
+    speaker_ref = train_samples[longest_text_idx]["audio_file"]
+    trainer_out_path = trainer.output_path
+    # deallocate VRAM and RAM
+    del model, trainer, train_samples, eval_samples
+    gc.collect()
+    return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref

TTS/demos/xtts_ft_demo/xtts_demo.py ADDED Viewed

	@@ -0,0 +1,433 @@

+import argparse
+import logging
+import os
+import sys
+import tempfile
+import traceback
+import gradio as gr
+import torch
+import torchaudio
+from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
+from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+def clear_gpu_cache():
+    # clear the GPU cache
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+XTTS_MODEL = None
+def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
+    global XTTS_MODEL
+    clear_gpu_cache()
+    if not xtts_checkpoint or not xtts_config or not xtts_vocab:
+        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
+    config = XttsConfig()
+    config.load_json(xtts_config)
+    XTTS_MODEL = Xtts.init_from_config(config)
+    print("Loading XTTS model! ")
+    XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
+    if torch.cuda.is_available():
+        XTTS_MODEL.cuda()
+    print("Model Loaded!")
+    return "Model Loaded!"
+def run_tts(lang, tts_text, speaker_audio_file):
+    if XTTS_MODEL is None or not speaker_audio_file:
+        return "You need to run the previous step to load the model !!", None, None
+    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
+        audio_path=speaker_audio_file,
+        gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
+        max_ref_length=XTTS_MODEL.config.max_ref_len,
+        sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
+    )
+    out = XTTS_MODEL.inference(
+        text=tts_text,
+        language=lang,
+        gpt_cond_latent=gpt_cond_latent,
+        speaker_embedding=speaker_embedding,
+        temperature=XTTS_MODEL.config.temperature,  # Add custom parameters here
+        length_penalty=XTTS_MODEL.config.length_penalty,
+        repetition_penalty=XTTS_MODEL.config.repetition_penalty,
+        top_k=XTTS_MODEL.config.top_k,
+        top_p=XTTS_MODEL.config.top_p,
+    )
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
+        out_path = fp.name
+        torchaudio.save(out_path, out["wav"], 24000)
+    return "Speech generated !", out_path, speaker_audio_file
+# define a logger to redirect
+class Logger:
+    def __init__(self, filename="log.out"):
+        self.log_file = filename
+        self.terminal = sys.stdout
+        self.log = open(self.log_file, "w")
+    def write(self, message):
+        self.terminal.write(message)
+        self.log.write(message)
+    def flush(self):
+        self.terminal.flush()
+        self.log.flush()
+    def isatty(self):
+        return False
+# redirect stdout and stderr to a file
+# (the Logger instance still echoes to the original stdout; both streams share the same instance,
+# so everything ends up in the single log file that read_logs() returns)
+sys.stdout = Logger()
+sys.stderr = sys.stdout
+# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+# route the `logging` output through the redirected stdout as well
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(sys.stdout)]
+)
+def read_logs():
+    sys.stdout.flush()
+    with open(sys.stdout.log_file, "r") as f:
+        return f.read()
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="""XTTS fine-tuning demo\n\n"""
+        """
+        Example runs:
+        python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
+        """,
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        help="Port to run the gradio demo. Default: 5003",
+        default=5003,
+    )
+    parser.add_argument(
+        "--out_path",
+        type=str,
+        help="Output path (where data and checkpoints will be saved) Default: /tmp/xtts_ft/",
+        default="/tmp/xtts_ft/",
+    )
+    parser.add_argument(
+        "--num_epochs",
+        type=int,
+        help="Number of epochs to train. Default: 10",
+        default=10,
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        help="Batch size. Default: 4",
+        default=4,
+    )
+    parser.add_argument(
+        "--grad_acumm",
+        type=int,
+        help="Grad accumulation steps. Default: 1",
+        default=1,
+    )
+    parser.add_argument(
+        "--max_audio_length",
+        type=int,
+        help="Max permitted audio size in seconds. Default: 11",
+        default=11,
+    )
+    args = parser.parse_args()
+    with gr.Blocks() as demo:
+        with gr.Tab("1 - Data processing"):
+            out_path = gr.Textbox(
+                label="Output path (where data and checkpoints will be saved):",
+                value=args.out_path,
+            )
+            # upload_file = gr.Audio(
+            #     sources="upload",
+            #     label="Select here the audio files that you want to use for XTTS trainining !",
+            #     type="filepath",
+            # )
+            upload_file = gr.File(
+                file_count="multiple",
+                label="Select here the audio files that you want to use for XTTS trainining (Supported formats: wav, mp3, and flac)",
+            )
+            lang = gr.Dropdown(
+                label="Dataset Language",
+                value="en",
+                choices=[
+                    "en",
+                    "es",
+                    "fr",
+                    "de",
+                    "it",
+                    "pt",
+                    "pl",
+                    "tr",
+                    "ru",
+                    "nl",
+                    "cs",
+                    "ar",
+                    "zh",
+                    "hu",
+                    "ko",
+                    "ja",
+                    "hi",
+                ],
+            )
+            progress_data = gr.Label(label="Progress:")
+            logs = gr.Textbox(
+                label="Logs:",
+                interactive=False,
+            )
+            demo.load(read_logs, None, logs, every=1)
+            prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
+            def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
+                clear_gpu_cache()
+                out_path = os.path.join(out_path, "dataset")
+                os.makedirs(out_path, exist_ok=True)
+                if audio_path is None:
+                    return (
+                        "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!",
+                        "",
+                        "",
+                    )
+                else:
+                    try:
+                        train_meta, eval_meta, audio_total_size = format_audio_list(
+                            audio_path, target_language=language, out_path=out_path, gradio_progress=progress
+                        )
+                    except:
+                        traceback.print_exc()
+                        error = traceback.format_exc()
+                        return (
+                            f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}",
+                            "",
+                            "",
+                        )
+                clear_gpu_cache()
+                # if audio total len is less than 2 minutes raise an error
+                if audio_total_size < 120:
+                    message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
+                    print(message)
+                    return message, "", ""
+                print("Dataset Processed!")
+                return "Dataset Processed!", train_meta, eval_meta
+        with gr.Tab("2 - Fine-tuning XTTS Encoder"):
+            train_csv = gr.Textbox(
+                label="Train CSV:",
+            )
+            eval_csv = gr.Textbox(
+                label="Eval CSV:",
+            )
+            num_epochs = gr.Slider(
+                label="Number of epochs:",
+                minimum=1,
+                maximum=100,
+                step=1,
+                value=args.num_epochs,
+            )
+            batch_size = gr.Slider(
+                label="Batch size:",
+                minimum=2,
+                maximum=512,
+                step=1,
+                value=args.batch_size,
+            )
+            grad_acumm = gr.Slider(
+                label="Grad accumulation steps:",
+                minimum=2,
+                maximum=128,
+                step=1,
+                value=args.grad_acumm,
+            )
+            max_audio_length = gr.Slider(
+                label="Max permitted audio size in seconds:",
+                minimum=2,
+                maximum=20,
+                step=1,
+                value=args.max_audio_length,
+            )
+            progress_train = gr.Label(label="Progress:")
+            logs_tts_train = gr.Textbox(
+                label="Logs:",
+                interactive=False,
+            )
+            demo.load(read_logs, None, logs_tts_train, every=1)
+            train_btn = gr.Button(value="Step 2 - Run the training")
+            def train_model(
+                language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length
+            ):
+                clear_gpu_cache()
+                if not train_csv or not eval_csv:
+                    return (
+                        "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !",
+                        "",
+                        "",
+                        "",
+                        "",
+                    )
+                try:
+                    # convert seconds to waveform frames
+                    max_audio_length = int(max_audio_length * 22050)
+                    config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(
+                        language,
+                        num_epochs,
+                        batch_size,
+                        grad_acumm,
+                        train_csv,
+                        eval_csv,
+                        output_path=output_path,
+                        max_audio_length=max_audio_length,
+                    )
+                except:
+                    traceback.print_exc()
+                    error = traceback.format_exc()
+                    return (
+                        f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}",
+                        "",
+                        "",
+                        "",
+                        "",
+                    )
+                # copy original files to avoid parameters changes issues
+                os.system(f"cp {config_path} {exp_path}")
+                os.system(f"cp {vocab_file} {exp_path}")
+                ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
+                print("Model training done!")
+                clear_gpu_cache()
+                return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
+        with gr.Tab("3 - Inference"):
+            with gr.Row():
+                with gr.Column() as col1:
+                    xtts_checkpoint = gr.Textbox(
+                        label="XTTS checkpoint path:",
+                        value="",
+                    )
+                    xtts_config = gr.Textbox(
+                        label="XTTS config path:",
+                        value="",
+                    )
+                    xtts_vocab = gr.Textbox(
+                        label="XTTS vocab path:",
+                        value="",
+                    )
+                    progress_load = gr.Label(label="Progress:")
+                    load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
+                with gr.Column() as col2:
+                    speaker_reference_audio = gr.Textbox(
+                        label="Speaker reference audio:",
+                        value="",
+                    )
+                    tts_language = gr.Dropdown(
+                        label="Language",
+                        value="en",
+                        choices=[
+                            "en",
+                            "es",
+                            "fr",
+                            "de",
+                            "it",
+                            "pt",
+                            "pl",
+                            "tr",
+                            "ru",
+                            "nl",
+                            "cs",
+                            "ar",
+                            "zh",
+                            "hu",
+                            "ko",
+                            "ja",
+                            "hi",
+                        ],
+                    )
+                    tts_text = gr.Textbox(
+                        label="Input Text.",
+                        value="This model sounds really good and above all, it's reasonably fast.",
+                    )
+                    tts_btn = gr.Button(value="Step 4 - Inference")
+                with gr.Column() as col3:
+                    progress_gen = gr.Label(label="Progress:")
+                    tts_output_audio = gr.Audio(label="Generated Audio.")
+                    reference_audio = gr.Audio(label="Reference audio used.")
+            prompt_compute_btn.click(
+                fn=preprocess_dataset,
+                inputs=[
+                    upload_file,
+                    lang,
+                    out_path,
+                ],
+                outputs=[
+                    progress_data,
+                    train_csv,
+                    eval_csv,
+                ],
+            )
+            train_btn.click(
+                fn=train_model,
+                inputs=[
+                    lang,
+                    train_csv,
+                    eval_csv,
+                    num_epochs,
+                    batch_size,
+                    grad_acumm,
+                    out_path,
+                    max_audio_length,
+                ],
+                outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
+            )
+            load_btn.click(
+                fn=load_model,
+                inputs=[xtts_checkpoint, xtts_config, xtts_vocab],
+                outputs=[progress_load],
+            )
+            tts_btn.click(
+                fn=run_tts,
+                inputs=[
+                    tts_language,
+                    tts_text,
+                    speaker_reference_audio,
+                ],
+                outputs=[progress_gen, tts_output_audio, reference_audio],
+            )
+    demo.launch(share=True, debug=False, server_port=args.port, server_name="0.0.0.0")

TTS/encoder/README.md ADDED Viewed

	@@ -0,0 +1,18 @@

+### Speaker Encoder
+This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
+With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
+Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
+![](umap.png)
+Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
+To run the code, you need to follow the same flow as in TTS.
+- Define 'config.json' for your needs. Note that the audio parameters should match your TTS model.
+- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
+- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
+- Watch training on Tensorboard as in TTS

TTS/encoder/__init__.py ADDED Viewed

File without changes

TTS/encoder/configs/base_encoder_config.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from dataclasses import asdict, dataclass, field
+from typing import Dict, List
+from coqpit import MISSING
+from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
+@dataclass
+class BaseEncoderConfig(BaseTrainingConfig):
+    """Defines parameters for a Generic Encoder model."""
+    # model identifier; the subclasses in this package set it to "emotion_encoder" / "speaker_encoder"
+    model: str = None
+    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
+    # model params
+    # NOTE: "input_dim" must equal `audio.num_mels`, see check_values()
+    model_params: Dict = field(
+        default_factory=lambda: {
+            "model_name": "lstm",
+            "input_dim": 80,
+            "proj_dim": 256,
+            "lstm_dim": 768,
+            "num_lstm_layers": 3,
+            "use_lstm_with_projection": True,
+        }
+    )
+    audio_augmentation: Dict = field(default_factory=lambda: {})
+    # training params
+    epochs: int = 10000
+    loss: str = "angleproto"
+    grad_clip: float = 3.0
+    lr: float = 0.0001
+    optimizer: str = "radam"
+    optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
+    lr_decay: bool = False
+    warmup_steps: int = 4000
+    # logging params
+    tb_model_param_stats: bool = False
+    steps_plot_stats: int = 10
+    save_step: int = 1000
+    print_step: int = 20
+    run_eval: bool = False
+    # data loader
+    # fields set to MISSING have no default here -- presumably they must be provided by the user config (coqpit convention; confirm)
+    num_classes_in_batch: int = MISSING
+    num_utter_per_class: int = MISSING
+    eval_num_classes_in_batch: int = None
+    eval_num_utter_per_class: int = None
+    num_loader_workers: int = MISSING
+    voice_len: float = 1.6
+    def check_values(self):
+        """Run the parent class checks, then assert that the model input dimension equals `audio.num_mels`."""
+        super().check_values()
+        c = asdict(self)
+        assert (
+            c["model_params"]["input_dim"] == self.audio.num_mels
+        ), " [!] model input dimendion must be equal to melspectrogram dimension."

TTS/encoder/configs/emotion_encoder_config.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from dataclasses import dataclass
+from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
+@dataclass
+class EmotionEncoderConfig(BaseEncoderConfig):
+    """Defines parameters for Emotion Encoder model."""
+    model: str = "emotion_encoder"
+    # presumably a class-id -> class-name mapping filled in elsewhere; not set in this file -- confirm
+    map_classid_to_classname: dict = None
+    # name of the dataset-item field that holds the class label
+    class_name_key: str = "emotion_name"

TTS/encoder/configs/speaker_encoder_config.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from dataclasses import dataclass
+from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
+@dataclass
+class SpeakerEncoderConfig(BaseEncoderConfig):
+    """Defines parameters for Speaker Encoder model."""
+    model: str = "speaker_encoder"
+    # name of the dataset-item field that holds the class label
+    class_name_key: str = "speaker_name"

TTS/encoder/dataset.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import logging
+import random
+import torch
+from torch.utils.data import Dataset
+from TTS.encoder.utils.generic_utils import AugmentWAV
+logger = logging.getLogger(__name__)
+class EncoderDataset(Dataset):
+    def __init__(
+        self,
+        config,
+        ap,
+        meta_data,
+        voice_len=1.6,
+        num_classes_in_batch=64,
+        num_utter_per_class=10,
+        augmentation_config=None,
+        use_torch_spec=None,
+    ):
+        """
+        Args:
+            ap (TTS.tts.utils.AudioProcessor): audio processor object.
+            meta_data (list): list of dataset instances.
+            seq_len (int): voice segment length in seconds.
+        """
+        super().__init__()
+        self.config = config
+        self.items = meta_data
+        self.sample_rate = ap.sample_rate
+        self.seq_len = int(voice_len * self.sample_rate)
+        self.num_utter_per_class = num_utter_per_class
+        self.ap = ap
+        self.use_torch_spec = use_torch_spec
+        self.classes, self.items = self.__parse_items()
+        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
+        # Data Augmentation
+        self.augmentator = None
+        self.gaussian_augmentation_config = None
+        if augmentation_config:
+            self.data_augmentation_p = augmentation_config["p"]
+            if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
+                self.augmentator = AugmentWAV(ap, augmentation_config)
+            if "gaussian" in augmentation_config.keys():
+                self.gaussian_augmentation_config = augmentation_config["gaussian"]
+        logger.info("DataLoader initialization")
+        logger.info(" | Classes per batch: %d", num_classes_in_batch)
+        logger.info(" | Number of instances: %d", len(self.items))
+        logger.info(" | Sequence length: %d", self.seq_len)
+        logger.info(" | Number of classes: %d", len(self.classes))
+        logger.info(" | Classes: %s", self.classes)
+    def load_wav(self, filename):
+        audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
+        return audio
+    def __parse_items(self):
+        class_to_utters = {}
+        for item in self.items:
+            path_ = item["audio_file"]
+            class_name = item[self.config.class_name_key]
+            if class_name in class_to_utters.keys():
+                class_to_utters[class_name].append(path_)
+            else:
+                class_to_utters[class_name] = [
+                    path_,
+                ]
+        # skip classes with number of samples >= self.num_utter_per_class
+        class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
+        classes = list(class_to_utters.keys())
+        classes.sort()
+        new_items = []
+        for item in self.items:
+            path_ = item["audio_file"]
+            class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"]
+            # ignore filtered classes
+            if class_name not in classes:
+                continue
+            # ignore small audios
+            if self.load_wav(path_).shape[0] - self.seq_len <= 0:
+                continue
+            new_items.append({"wav_file_path": path_, "class_name": class_name})
+        return classes, new_items
+    def __len__(self):
+        return len(self.items)
+    def get_num_classes(self):
+        return len(self.classes)
+    def get_class_list(self):
+        return self.classes
+    def set_classes(self, classes):
+        self.classes = classes
+        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
+    def get_map_classid_to_classname(self):
+        return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
+    def __getitem__(self, idx):
+        return self.items[idx]
+    def collate_fn(self, batch):
+        # get the batch class_ids
+        labels = []
+        feats = []
+        for item in batch:
+            utter_path = item["wav_file_path"]
+            class_name = item["class_name"]
+            # get classid
+            class_id = self.classname_to_classid[class_name]
+            # load wav file
+            wav = self.load_wav(utter_path)
+            offset = random.randint(0, wav.shape[0] - self.seq_len)
+            wav = wav[offset : offset + self.seq_len]
+            if self.augmentator is not None and self.data_augmentation_p:
+                if random.random() < self.data_augmentation_p:
+                    wav = self.augmentator.apply_one(wav)
+            if not self.use_torch_spec:
+                mel = self.ap.melspectrogram(wav)
+                feats.append(torch.FloatTensor(mel))
+            else:
+                feats.append(torch.FloatTensor(wav))
+            labels.append(class_id)
+        feats = torch.stack(feats)
+        labels = torch.LongTensor(labels)
+        return feats, labels

TTS/encoder/losses.py ADDED Viewed

	@@ -0,0 +1,230 @@

+import logging
+import torch
+import torch.nn.functional as F
+from torch import nn
+logger = logging.getLogger(__name__)
+# adapted from https://github.com/cvqluu/GE2E-Loss
+class GE2ELoss(nn.Module):
+    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
+        """
+        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
+        Accepts an input of size (N, M, D)
+            where N is the number of speakers in the batch,
+            M is the number of utterances per speaker,
+            and D is the dimensionality of the embedding vector (e.g. d-vector)
+        Args:
+            - init_w (float): defines the initial value of w in Equation (5) of [1]
+            - init_b (float): definies the initial value of b in Equation (5) of [1]
+        """
+        super().__init__()
+        # pylint: disable=E1102
+        self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.loss_method = loss_method
+        logger.info("Initialized Generalized End-to-End loss")
+        assert self.loss_method in ["softmax", "contrast"]
+        if self.loss_method == "softmax":
+            self.embed_loss = self.embed_loss_softmax
+        if self.loss_method == "contrast":
+            self.embed_loss = self.embed_loss_contrast
+    # pylint: disable=R0201
+    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
+        """
+        Calculates the new centroids excluding the reference utterance
+        """
+        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
+        excl = torch.mean(excl, 0)
+        new_centroids = []
+        for i, centroid in enumerate(centroids):
+            if i == spkr:
+                new_centroids.append(excl)
+            else:
+                new_centroids.append(centroid)
+        return torch.stack(new_centroids)
+    def calc_cosine_sim(self, dvecs, centroids):
+        """
+        Make the cosine similarity matrix with dims (N,M,N)
+        """
+        cos_sim_matrix = []
+        for spkr_idx, speaker in enumerate(dvecs):
+            cs_row = []
+            for utt_idx, utterance in enumerate(speaker):
+                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
+                # vector based cosine similarity for speed
+                cs_row.append(
+                    torch.clamp(
+                        torch.mm(
+                            utterance.unsqueeze(1).transpose(0, 1),
+                            new_centroids.transpose(0, 1),
+                        )
+                        / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
+                        1e-6,
+                    )
+                )
+            cs_row = torch.cat(cs_row, dim=0)
+            cos_sim_matrix.append(cs_row)
+        return torch.stack(cos_sim_matrix)
+    # pylint: disable=R0201
+    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
+        """
+        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
+        """
+        N, M, _ = dvecs.shape
+        L = []
+        for j in range(N):
+            L_row = []
+            for i in range(M):
+                L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
+            L_row = torch.stack(L_row)
+            L.append(L_row)
+        return torch.stack(L)
+    # pylint: disable=R0201
+    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
+        """
+        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
+        """
+        N, M, _ = dvecs.shape
+        L = []
+        for j in range(N):
+            L_row = []
+            for i in range(M):
+                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
+                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
+                L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
+            L_row = torch.stack(L_row)
+            L.append(L_row)
+        return torch.stack(L)
+    def forward(self, x, _label=None):
+        """
+        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        """
+        assert x.size()[1] >= 2
+        centroids = torch.mean(x, 1)
+        cos_sim_matrix = self.calc_cosine_sim(x, centroids)
+        torch.clamp(self.w, 1e-6)
+        cos_sim_matrix = self.w * cos_sim_matrix + self.b
+        L = self.embed_loss(x, cos_sim_matrix)
+        return L.mean()
+# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
+class AngleProtoLoss(nn.Module):
+    """
+    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
+        Accepts an input of size (N, M, D)
+            where N is the number of speakers in the batch,
+            M is the number of utterances per speaker,
+            and D is the dimensionality of the embedding vector
+        Args:
+            - init_w (float): defines the initial value of w
+            - init_b (float): definies the initial value of b
+    """
+    def __init__(self, init_w=10.0, init_b=-5.0):
+        super().__init__()
+        # pylint: disable=E1102
+        self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.criterion = torch.nn.CrossEntropyLoss()
+        logger.info("Initialized Angular Prototypical loss")
+    def forward(self, x, _label=None):
+        """
+        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        """
+        assert x.size()[1] >= 2
+        out_anchor = torch.mean(x[:, 1:, :], 1)
+        out_positive = x[:, 0, :]
+        num_speakers = out_anchor.size()[0]
+        cos_sim_matrix = F.cosine_similarity(
+            out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
+            out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
+        )
+        torch.clamp(self.w, 1e-6)
+        cos_sim_matrix = cos_sim_matrix * self.w + self.b
+        label = torch.arange(num_speakers).to(cos_sim_matrix.device)
+        L = self.criterion(cos_sim_matrix, label)
+        return L
+class SoftmaxLoss(nn.Module):
+    """
+    Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
+        Args:
+            - embedding_dim (float): speaker embedding dim
+            - n_speakers (float): number of speakers
+    """
+    def __init__(self, embedding_dim, n_speakers):
+        super().__init__()
+        self.criterion = torch.nn.CrossEntropyLoss()
+        self.fc = nn.Linear(embedding_dim, n_speakers)
+        logger.info("Initialised Softmax Loss")
+    def forward(self, x, label=None):
+        # reshape for compatibility
+        x = x.reshape(-1, x.size()[-1])
+        label = label.reshape(-1)
+        x = self.fc(x)
+        L = self.criterion(x, label)
+        return L
+    def inference(self, embedding):
+        x = self.fc(embedding)
+        activations = torch.nn.functional.softmax(x, dim=1).squeeze(0)
+        class_id = torch.argmax(activations)
+        return class_id
+class SoftmaxAngleProtoLoss(nn.Module):
+    """
+    Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
+        Args:
+            - embedding_dim (float): speaker embedding dim
+            - n_speakers (float): number of speakers
+            - init_w (float): defines the initial value of w
+            - init_b (float): definies the initial value of b
+    """
+    def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
+        super().__init__()
+        self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
+        self.angleproto = AngleProtoLoss(init_w, init_b)
+        logger.info("Initialised SoftmaxAnglePrototypical Loss")
+    def forward(self, x, label=None):
+        """
+        Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        """
+        Lp = self.angleproto(x)
+        Ls = self.softmax(x, label)
+        return Ls + Lp

TTS/encoder/models/base_encoder.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import logging
+import numpy as np
+import torch
+import torchaudio
+from coqpit import Coqpit
+from torch import nn
+from trainer.io import load_fsspec
+from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
+from TTS.utils.generic_utils import set_init_dict
+logger = logging.getLogger(__name__)
+class PreEmphasis(nn.Module):
+    def __init__(self, coefficient=0.97):
+        super().__init__()
+        self.coefficient = coefficient
+        self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))
+    def forward(self, x):
+        assert len(x.size()) == 2
+        x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
+        return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
+class BaseEncoder(nn.Module):
+    """Base `encoder` class. Every new `encoder` model must inherit this.
+    It defines common `encoder` specific functions.
+    """
+    # pylint: disable=W0102
+    def __init__(self):
+        super(BaseEncoder, self).__init__()
+    def get_torch_mel_spectrogram_class(self, audio_config):
+        return torch.nn.Sequential(
+            PreEmphasis(audio_config["preemphasis"]),
+            # TorchSTFT(
+            #     n_fft=audio_config["fft_size"],
+            #     hop_length=audio_config["hop_length"],
+            #     win_length=audio_config["win_length"],
+            #     sample_rate=audio_config["sample_rate"],
+            #     window="hamming_window",
+            #     mel_fmin=0.0,
+            #     mel_fmax=None,
+            #     use_htk=True,
+            #     do_amp_to_db=False,
+            #     n_mels=audio_config["num_mels"],
+            #     power=2.0,
+            #     use_mel=True,
+            #     mel_norm=None,
+            # )
+            torchaudio.transforms.MelSpectrogram(
+                sample_rate=audio_config["sample_rate"],
+                n_fft=audio_config["fft_size"],
+                win_length=audio_config["win_length"],
+                hop_length=audio_config["hop_length"],
+                window_fn=torch.hamming_window,
+                n_mels=audio_config["num_mels"],
+            ),
+        )
+    @torch.no_grad()
+    def inference(self, x, l2_norm=True):
+        return self.forward(x, l2_norm)
+    @torch.no_grad()
+    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
+        """
+        Generate embeddings for a batch of utterances
+        x: 1xTxD
+        """
+        # map to the waveform size
+        if self.use_torch_spec:
+            num_frames = num_frames * self.audio_config["hop_length"]
+        max_len = x.shape[1]
+        if max_len < num_frames:
+            num_frames = max_len
+        offsets = np.linspace(0, max_len - num_frames, num=num_eval)
+        frames_batch = []
+        for offset in offsets:
+            offset = int(offset)
+            end_offset = int(offset + num_frames)
+            frames = x[:, offset:end_offset]
+            frames_batch.append(frames)
+        frames_batch = torch.cat(frames_batch, dim=0)
+        embeddings = self.inference(frames_batch, l2_norm=l2_norm)
+        if return_mean:
+            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
+        return embeddings
+    def get_criterion(self, c: Coqpit, num_classes=None):
+        if c.loss == "ge2e":
+            criterion = GE2ELoss(loss_method="softmax")
+        elif c.loss == "angleproto":
+            criterion = AngleProtoLoss()
+        elif c.loss == "softmaxproto":
+            criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
+        else:
+            raise Exception("The %s  not is a loss supported" % c.loss)
+        return criterion
+    def load_checkpoint(
+        self,
+        config: Coqpit,
+        checkpoint_path: str,
+        eval: bool = False,
+        use_cuda: bool = False,
+        criterion=None,
+        cache=False,
+    ):
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
+        try:
+            self.load_state_dict(state["model"])
+            logger.info("Model fully restored. ")
+        except (KeyError, RuntimeError) as error:
+            # If eval raise the error
+            if eval:
+                raise error
+            logger.info("Partial model initialization.")
+            model_dict = self.state_dict()
+            model_dict = set_init_dict(model_dict, state["model"], c)
+            self.load_state_dict(model_dict)
+            del model_dict
+        # load the criterion for restore_path
+        if criterion is not None and "criterion" in state:
+            try:
+                criterion.load_state_dict(state["criterion"])
+            except (KeyError, RuntimeError) as error:
+                logger.exception("Criterion load ignored because of: %s", error)
+        # instance and load the criterion for the encoder classifier in inference time
+        if (
+            eval
+            and criterion is None
+            and "criterion" in state
+            and getattr(config, "map_classid_to_classname", None) is not None
+        ):
+            criterion = self.get_criterion(config, len(config.map_classid_to_classname))
+            criterion.load_state_dict(state["criterion"])
+        if use_cuda:
+            self.cuda()
+            if criterion is not None:
+                criterion = criterion.cuda()
+        if eval:
+            self.eval()
+            assert not self.training
+        if not eval:
+            return criterion, state["step"]
+        return criterion

TTS/encoder/models/lstm.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import torch
+from torch import nn
+from TTS.encoder.models.base_encoder import BaseEncoder
+class LSTMWithProjection(nn.Module):
+    def __init__(self, input_size, hidden_size, proj_size):
+        super().__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.proj_size = proj_size
+        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
+        self.linear = nn.Linear(hidden_size, proj_size, bias=False)
+    def forward(self, x):
+        self.lstm.flatten_parameters()
+        o, (_, _) = self.lstm(x)
+        return self.linear(o)
+class LSTMWithoutProjection(nn.Module):
+    def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
+        super().__init__()
+        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
+        self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        _, (hidden, _) = self.lstm(x)
+        return self.relu(self.linear(hidden[-1]))
+class LSTMSpeakerEncoder(BaseEncoder):
+    def __init__(
+        self,
+        input_dim,
+        proj_dim=256,
+        lstm_dim=768,
+        num_lstm_layers=3,
+        use_lstm_with_projection=True,
+        use_torch_spec=False,
+        audio_config=None,
+    ):
+        super().__init__()
+        self.use_lstm_with_projection = use_lstm_with_projection
+        self.use_torch_spec = use_torch_spec
+        self.audio_config = audio_config
+        self.proj_dim = proj_dim
+        layers = []
+        # choise LSTM layer
+        if use_lstm_with_projection:
+            layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
+            for _ in range(num_lstm_layers - 1):
+                layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
+            self.layers = nn.Sequential(*layers)
+        else:
+            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
+        self.instancenorm = nn.InstanceNorm1d(input_dim)
+        if self.use_torch_spec:
+            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
+        else:
+            self.torch_spec = None
+        self._init_layers()
+    def _init_layers(self):
+        for name, param in self.layers.named_parameters():
+            if "bias" in name:
+                nn.init.constant_(param, 0.0)
+            elif "weight" in name:
+                nn.init.xavier_normal_(param)
+    def forward(self, x, l2_norm=True):
+        """Forward pass of the model.
+        Args:
+            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
+                to compute the spectrogram on-the-fly.
+            l2_norm (bool): Whether to L2-normalize the outputs.
+        Shapes:
+            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
+        """
+        with torch.no_grad():
+            with torch.cuda.amp.autocast(enabled=False):
+                if self.use_torch_spec:
+                    x.squeeze_(1)
+                    x = self.torch_spec(x)
+                x = self.instancenorm(x).transpose(1, 2)
+        d = self.layers(x)
+        if self.use_lstm_with_projection:
+            d = d[:, -1]
+        if l2_norm:
+            d = torch.nn.functional.normalize(d, p=2, dim=1)
+        return d

TTS/encoder/models/resnet.py ADDED Viewed

	@@ -0,0 +1,198 @@

+import torch
+from torch import nn
+# from TTS.utils.audio.torch_transforms import TorchSTFT
+from TTS.encoder.models.base_encoder import BaseEncoder
+class SELayer(nn.Module):
+    def __init__(self, channel, reduction=8):
+        super(SELayer, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Sequential(
+            nn.Linear(channel, channel // reduction),
+            nn.ReLU(inplace=True),
+            nn.Linear(channel // reduction, channel),
+            nn.Sigmoid(),
+        )
+    def forward(self, x):
+        b, c, _, _ = x.size()
+        y = self.avg_pool(x).view(b, c)
+        y = self.fc(y).view(b, c, 1, 1)
+        return x * y
+class SEBasicBlock(nn.Module):
+    expansion = 1
+    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
+        super(SEBasicBlock, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.se = SELayer(planes, reduction)
+        self.downsample = downsample
+        self.stride = stride
+    def forward(self, x):
+        residual = x
+        out = self.conv1(x)
+        out = self.relu(out)
+        out = self.bn1(out)
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.se(out)
+        if self.downsample is not None:
+            residual = self.downsample(x)
+        out += residual
+        out = self.relu(out)
+        return out
+class ResNetSpeakerEncoder(BaseEncoder):
+    """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
+    Adapted from: https://github.com/clovaai/voxceleb_trainer
+    """
+    # pylint: disable=W0102
+    def __init__(
+        self,
+        input_dim=64,
+        proj_dim=512,
+        layers=[3, 4, 6, 3],
+        num_filters=[32, 64, 128, 256],
+        encoder_type="ASP",
+        log_input=False,
+        use_torch_spec=False,
+        audio_config=None,
+    ):
+        super(ResNetSpeakerEncoder, self).__init__()
+        self.encoder_type = encoder_type
+        self.input_dim = input_dim
+        self.log_input = log_input
+        self.use_torch_spec = use_torch_spec
+        self.audio_config = audio_config
+        self.proj_dim = proj_dim
+        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
+        self.relu = nn.ReLU(inplace=True)
+        self.bn1 = nn.BatchNorm2d(num_filters[0])
+        self.inplanes = num_filters[0]
+        self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
+        self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
+        self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
+        self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))
+        self.instancenorm = nn.InstanceNorm1d(input_dim)
+        if self.use_torch_spec:
+            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
+        else:
+            self.torch_spec = None
+        outmap_size = int(self.input_dim / 8)
+        self.attention = nn.Sequential(
+            nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
+            nn.ReLU(),
+            nn.BatchNorm1d(128),
+            nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
+            nn.Softmax(dim=2),
+        )
+        if self.encoder_type == "SAP":
+            out_dim = num_filters[3] * outmap_size
+        elif self.encoder_type == "ASP":
+            out_dim = num_filters[3] * outmap_size * 2
+        else:
+            raise ValueError("Undefined encoder")
+        self.fc = nn.Linear(out_dim, proj_dim)
+        self._init_layers()
+    def _init_layers(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+    def create_layer(self, block, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion),
+            )
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+        return nn.Sequential(*layers)
+    # pylint: disable=R0201
+    def new_parameter(self, *size):
+        out = nn.Parameter(torch.FloatTensor(*size))
+        nn.init.xavier_normal_(out)
+        return out
+    def forward(self, x, l2_norm=False):
+        """Forward pass of the model.
+        Args:
+            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
+                to compute the spectrogram on-the-fly.
+            l2_norm (bool): Whether to L2-normalize the outputs.
+        Shapes:
+            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
+        """
+        x.squeeze_(1)
+        # if you torch spec compute it otherwise use the mel spec computed by the AP
+        if self.use_torch_spec:
+            x = self.torch_spec(x)
+        if self.log_input:
+            x = (x + 1e-6).log()
+        x = self.instancenorm(x).unsqueeze(1)
+        x = self.conv1(x)
+        x = self.relu(x)
+        x = self.bn1(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = x.reshape(x.size()[0], -1, x.size()[-1])
+        w = self.attention(x)
+        if self.encoder_type == "SAP":
+            x = torch.sum(x * w, dim=2)
+        elif self.encoder_type == "ASP":
+            mu = torch.sum(x * w, dim=2)
+            sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
+            x = torch.cat((mu, sg), 1)
+        x = x.view(x.size()[0], -1)
+        x = self.fc(x)
+        if l2_norm:
+            x = torch.nn.functional.normalize(x, p=2, dim=1)
+        return x

TTS/encoder/requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ umap-learn
2	+ numpy>=1.17.0

TTS/encoder/utils/__init__.py ADDED Viewed

File without changes

TTS/encoder/utils/generic_utils.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import glob
+import logging
+import os
+import random
+import numpy as np
+from scipy import signal
+from TTS.encoder.models.lstm import LSTMSpeakerEncoder
+from TTS.encoder.models.resnet import ResNetSpeakerEncoder
+logger = logging.getLogger(__name__)
+class AugmentWAV(object):
+    def __init__(self, ap, augmentation_config):
+        self.ap = ap
+        self.use_additive_noise = False
+        if "additive" in augmentation_config.keys():
+            self.additive_noise_config = augmentation_config["additive"]
+            additive_path = self.additive_noise_config["sounds_path"]
+            if additive_path:
+                self.use_additive_noise = True
+                # get noise types
+                self.additive_noise_types = []
+                for key in self.additive_noise_config.keys():
+                    if isinstance(self.additive_noise_config[key], dict):
+                        self.additive_noise_types.append(key)
+                additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)
+                self.noise_list = {}
+                for wav_file in additive_files:
+                    noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
+                    # ignore not listed directories
+                    if noise_dir not in self.additive_noise_types:
+                        continue
+                    if noise_dir not in self.noise_list:
+                        self.noise_list[noise_dir] = []
+                    self.noise_list[noise_dir].append(wav_file)
+                logger.info(
+                    "Using Additive Noise Augmentation: with %d audios instances from %s",
+                    len(additive_files),
+                    self.additive_noise_types,
+                )
+        self.use_rir = False
+        if "rir" in augmentation_config.keys():
+            self.rir_config = augmentation_config["rir"]
+            if self.rir_config["rir_path"]:
+                self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
+                self.use_rir = True
+            logger.info("Using RIR Noise Augmentation: with %d audios instances", len(self.rir_files))
+        self.create_augmentation_global_list()
+    def create_augmentation_global_list(self):
+        if self.use_additive_noise:
+            self.global_noise_list = self.additive_noise_types
+        else:
+            self.global_noise_list = []
+        if self.use_rir:
+            self.global_noise_list.append("RIR_AUG")
+    def additive_noise(self, noise_type, audio):
+        clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
+        noise_list = random.sample(
+            self.noise_list[noise_type],
+            random.randint(
+                self.additive_noise_config[noise_type]["min_num_noises"],
+                self.additive_noise_config[noise_type]["max_num_noises"],
+            ),
+        )
+        audio_len = audio.shape[0]
+        noises_wav = None
+        for noise in noise_list:
+            noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]
+            if noiseaudio.shape[0] < audio_len:
+                continue
+            noise_snr = random.uniform(
+                self.additive_noise_config[noise_type]["min_snr_in_db"],
+                self.additive_noise_config[noise_type]["max_num_noises"],
+            )
+            noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4)
+            noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio
+            if noises_wav is None:
+                noises_wav = noise_wav
+            else:
+                noises_wav += noise_wav
+        # if all possible files is less than audio, choose other files
+        if noises_wav is None:
+            return self.additive_noise(noise_type, audio)
+        return audio + noises_wav
+    def reverberate(self, audio):
+        audio_len = audio.shape[0]
+        rir_file = random.choice(self.rir_files)
+        rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
+        rir = rir / np.sqrt(np.sum(rir**2))
+        return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]
+    def apply_one(self, audio):
+        noise_type = random.choice(self.global_noise_list)
+        if noise_type == "RIR_AUG":
+            return self.reverberate(audio)
+        return self.additive_noise(noise_type, audio)
+def setup_encoder_model(config: "Coqpit"):
+    if config.model_params["model_name"].lower() == "lstm":
+        model = LSTMSpeakerEncoder(
+            config.model_params["input_dim"],
+            config.model_params["proj_dim"],
+            config.model_params["lstm_dim"],
+            config.model_params["num_lstm_layers"],
+            use_torch_spec=config.model_params.get("use_torch_spec", False),
+            audio_config=config.audio,
+        )
+    elif config.model_params["model_name"].lower() == "resnet":
+        model = ResNetSpeakerEncoder(
+            input_dim=config.model_params["input_dim"],
+            proj_dim=config.model_params["proj_dim"],
+            log_input=config.model_params.get("log_input", False),
+            use_torch_spec=config.model_params.get("use_torch_spec", False),
+            audio_config=config.audio,
+        )
+    return model

TTS/encoder/utils/prepare_voxceleb.py ADDED Viewed

	@@ -0,0 +1,226 @@

+# coding=utf-8
+# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Only support eager mode and TF>=2.0.0
+# pylint: disable=no-member, invalid-name, relative-beyond-top-level
+# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
+""" voxceleb 1 & 2 """
+import csv
+import hashlib
+import logging
+import os
+import subprocess
+import sys
+import zipfile
+import soundfile as sf
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+logger = logging.getLogger(__name__)
+# Subset name -> list of URLs to download for that subset. Multi-part subsets
+# ("..._partaa", "..._partab", ...) are split archives; single-entry subsets are plain .zip files.
+SUBSETS = {
+    "vox1_dev_wav": [
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
+    ],
+    "vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"],
+    "vox2_dev_aac": [
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
+        "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah",
+    ],
+    "vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"],
+}
+# Subset name -> MD5 hex digest.
+# NOTE(review): presumably the expected checksum of each assembled archive - confirm against the code that reads it.
+MD5SUM = {
+    "vox1_dev_wav": "ae63e55b951748cc486645f532ba230b",
+    "vox2_dev_aac": "bbc063c46078a602ca71605645c2a402",
+    "vox1_test_wav": "185fdc63c3c739954633d50379a3d102",
+    "vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312",
+}
+# Credentials passed to wget as --user / --password when downloading; empty by default.
+USER = {"user": "", "password": ""}
+# Module-level dict, initially empty.
+# NOTE(review): presumably filled with speaker ids elsewhere in this file - confirm.
+speaker_id_dict = {}
+def download_and_extract(directory, subset, urls):
+    """Download and extract the given split of dataset.
+    Args:
+        directory: the directory where to put the downloaded data.
+        subset: subset name of the corpus.
+        urls: the list of urls to download the data file.
+    """
+    os.makedirs(directory, exist_ok=True)
+    try:
+        for url in urls:
+            zip_filepath = os.path.join(directory, url.split("/")[-1])
+            if os.path.exists(zip_filepath):
+                continue
+            logger.info("Downloading %s to %s" % (url, zip_filepath))
+            subprocess.call(
+                "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath),
+                shell=True,
+            )
+            statinfo = os.stat(zip_filepath)
+            logger.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
+        # concatenate all parts into zip files
+        if ".zip" not in zip_filepath:
+            zip_filepath = "_".join(zip_filepath.split("_")[:-1])
+            subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True)
+            zip_filepath += ".zip"
+        extract_path = zip_filepath.strip(".zip")
+        # check zip file md5sum
+        with open(zip_filepath, "rb") as f_zip:
+            md5 = hashlib.md5(f_zip.read()).hexdigest()
+        if md5 != MD5SUM[subset]:
+            raise ValueError("md5sum of %s mismatch" % zip_filepath)
+        with zipfile.ZipFile(zip_filepath, "r") as zfile:
+            zfile.extractall(directory)
+            extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename)
+            subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True)
+    finally:
+        # os.remove(zip_filepath)
+        pass
+def exec_cmd(cmd):
+    """Run a command in a subprocess.
+    Args:
+        cmd: command line to be executed.
+    Return:
+        int, the return code.
+    """
+    try:
+        retcode = subprocess.call(cmd, shell=True)
+        if retcode < 0:
+            logger.info(f"Child was terminated by signal {retcode}")
+    except OSError as e:
+        logger.info(f"Execution failed: {e}")
+        retcode = -999
+    return retcode
+def decode_aac_with_ffmpeg(aac_file, wav_file):
+    """Decode a given AAC file into WAV using ffmpeg.
+    Args:
+        aac_file: file path to input AAC file.
+        wav_file: file path to output WAV file.
+    Return:
+        bool, True if success.
+    """
+    cmd = f"ffmpeg -i {aac_file} {wav_file}"
+    logger.info(f"Decoding aac file using command line: {cmd}")
+    ret = exec_cmd(cmd)
+    if ret != 0:
+        logger.error(f"Failed to decode aac file with retcode {ret}")
+        logger.error("Please check your ffmpeg installation.")
+        return False
+    return True
+def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
+    """Optionally convert AAC to WAV and make speaker labels.
+    Args:
+        input_dir: the directory which holds the input dataset.
+        subset: the name of the specified subset. e.g. vox1_dev_wav
+        output_dir: the directory to place the newly generated csv files.
+        output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv
+    """
+    logger.info("Preprocessing audio and label for subset %s" % subset)
+    source_dir = os.path.join(input_dir, subset)
+    files = []
+    # Convert all AAC file into WAV format. At the same time, generate the csv
+    for root, _, filenames in os.walk(source_dir):
+        for filename in filenames:
+            name, ext = os.path.splitext(filename)
+            if ext.lower() == ".wav":
+                _, ext2 = os.path.splitext(name)
+                if ext2:
+                    continue
+                wav_file = os.path.join(root, filename)
+            elif ext.lower() == ".m4a":
+                # Convert AAC to WAV.
+                aac_file = os.path.join(root, filename)
+                wav_file = aac_file + ".wav"
+                if not os.path.exists(wav_file):
+                    if not decode_aac_with_ffmpeg(aac_file, wav_file):
+                        raise RuntimeError("Audio decoding failed.")
+            else:
+                continue
+            speaker_name = root.split(os.path.sep)[-2]
+            if speaker_name not in speaker_id_dict:
+                num = len(speaker_id_dict)
+                speaker_id_dict[speaker_name] = num
+            # wav_filesize = os.path.getsize(wav_file)
+            wav_length = len(sf.read(wav_file)[0])
+            files.append((os.path.abspath(wav_file), wav_length, speaker_id_dict[speaker_name], speaker_name))
+    # Write to CSV file which contains four columns:
+    # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
+    csv_file_path = os.path.join(output_dir, output_file)
+    with open(csv_file_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f, delimiter="\t")
+        writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
+        for wav_file in files:
+            writer.writerow(wav_file)
+    logger.info("Successfully generated csv file {}".format(csv_file_path))
+def processor(directory, subset, force_process):
+    """download and process"""
+    urls = SUBSETS
+    if subset not in urls:
+        raise ValueError(subset, "is not in voxceleb")
+    subset_csv = os.path.join(directory, subset + ".csv")
+    if not force_process and os.path.exists(subset_csv):
+        return subset_csv
+    logger.info("Downloading and process the voxceleb in %s", directory)
+    logger.info("Preparing subset %s", subset)
+    download_and_extract(directory, subset, urls[subset])
+    convert_audio_and_make_label(directory, subset, directory, subset + ".csv")
+    logger.info("Finished downloading and processing")
+    return subset_csv
+if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+    if len(sys.argv) != 4:
+        print("Usage: python prepare_data.py save_directory user password")
+        sys.exit()
+    DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3]
+    for SUBSET in SUBSETS:
+        processor(DIR, SUBSET, False)

TTS/encoder/utils/training.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import os
+from dataclasses import dataclass, field
+from coqpit import Coqpit
+from trainer import TrainerArgs, get_last_checkpoint
+from trainer.generic_utils import get_experiment_folder_path, get_git_branch
+from trainer.io import copy_model_files
+from trainer.logging import logger_factory
+from trainer.logging.console_logger import ConsoleLogger
+from TTS.config import load_config, register_config
+from TTS.tts.utils.text.characters import parse_symbols
+@dataclass
+class TrainArgs(TrainerArgs):
+    """Trainer command-line arguments extended with the path of a config file."""
+    # Optional path to the config file; process_args loads the config from it when set.
+    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
+def getarguments():
+    train_config = TrainArgs()
+    parser = train_config.init_argparse(arg_prefix="")
+    return parser
+def process_args(args, config=None):
+    """Process parsed comand line arguments and initialize the config if not provided.
+    Args:
+        args (argparse.Namespace or dict like): Parsed input arguments.
+        config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
+    Returns:
+        c (Coqpit): Config paramaters.
+        out_path (str): Path to save models and logging.
+        audio_path (str): Path to save generated test audios.
+        c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
+            logging to the console.
+        dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging
+    TODO:
+        - Interactive config definition.
+    """
+    if isinstance(args, tuple):
+        args, coqpit_overrides = args
+    if args.continue_path:
+        # continue a previous training from its output folder
+        experiment_path = args.continue_path
+        args.config_path = os.path.join(args.continue_path, "config.json")
+        args.restore_path, best_model = get_last_checkpoint(args.continue_path)
+        if not args.best_path:
+            args.best_path = best_model
+    # init config if not already defined
+    if config is None:
+        if args.config_path:
+            # init from a file
+            config = load_config(args.config_path)
+        else:
+            # init from console args
+            from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel
+            config_base = BaseTrainingConfig()
+            config_base.parse_known_args(coqpit_overrides)
+            config = register_config(config_base.model)()
+    # override values from command-line args
+    config.parse_known_args(coqpit_overrides, relaxed_parser=True)
+    experiment_path = args.continue_path
+    if not experiment_path:
+        experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
+    audio_path = os.path.join(experiment_path, "test_audios")
+    config.output_log_path = experiment_path
+    # setup rank 0 process in distributed training
+    dashboard_logger = None
+    if args.rank == 0:
+        new_fields = {}
+        if args.restore_path:
+            new_fields["restore_path"] = args.restore_path
+        new_fields["github_branch"] = get_git_branch()
+        # if model characters are not set in the config file
+        # save the default set to the config file for future
+        # compatibility.
+        if config.has("characters") and config.characters is None:
+            used_characters = parse_symbols()
+            new_fields["characters"] = used_characters
+        copy_model_files(config, experiment_path, new_fields)
+        dashboard_logger = logger_factory(config, experiment_path)
+    c_logger = ConsoleLogger()
+    return config, experiment_path, audio_path, c_logger, dashboard_logger
+def init_arguments():
+    train_config = TrainArgs()
+    parser = train_config.init_argparse(arg_prefix="")
+    return parser
+def init_training(config: Coqpit = None):
+    """Initialization of a training run."""
+    parser = init_arguments()
+    args = parser.parse_known_args()
+    config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config)
+    return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger

TTS/encoder/utils/visual.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+# Select the non-interactive "Agg" backend for all figures created by this module.
+matplotlib.use("Agg")
+# Palette of 13 RGB colours, rescaled from 0-255 to the 0-1 range; indexed by
+# class id in plot_embeddings to colour the scatter points.
+colormap = (
+    np.array(
+        [
+            [76, 255, 0],
+            [0, 127, 70],
+            [255, 0, 0],
+            [255, 217, 38],
+            [0, 135, 255],
+            [165, 0, 165],
+            [255, 167, 255],
+            [0, 255, 255],
+            [255, 96, 38],
+            [142, 76, 0],
+            [33, 0, 127],
+            [0, 0, 0],
+            [183, 183, 183],
+        ],
+        dtype=float,
+    )
+    / 255
+)
+def plot_embeddings(embeddings, num_classes_in_batch):
+    try:
+        import umap
+    except ImportError as e:
+        raise ImportError("Package not installed: umap-learn") from e
+    num_utter_per_class = embeddings.shape[0] // num_classes_in_batch
+    # if necessary get just the first 10 classes
+    if num_classes_in_batch > 10:
+        num_classes_in_batch = 10
+        embeddings = embeddings[: num_classes_in_batch * num_utter_per_class]
+    model = umap.UMAP()
+    projection = model.fit_transform(embeddings)
+    ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class)
+    colors = [colormap[i] for i in ground_truth]
+    fig, ax = plt.subplots(figsize=(16, 10))
+    _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
+    plt.gca().set_aspect("equal", "datalim")
+    plt.title("UMAP projection")
+    plt.tight_layout()
+    plt.savefig("umap")
+    return fig

TTS/model.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import os
+from abc import abstractmethod
+from typing import Any, Union
+import torch
+from coqpit import Coqpit
+from trainer import TrainerModel
+# pylint: skip-file
+class BaseTrainerModel(TrainerModel):
+    """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
+    Every new 🐸TTS model must inherit it and implement the abstract methods
+    declared here: ``init_from_config``, ``inference`` and ``load_checkpoint``.
+    """
+    @staticmethod
+    @abstractmethod
+    def init_from_config(config: Coqpit) -> "BaseTrainerModel":
+        """Init the model and all its attributes from the given config.
+        Override this depending on your model.
+        Args:
+            config (Coqpit): Model configuration.
+        Returns:
+            BaseTrainerModel: The initialized model instance.
+        """
+        ...
+    # NOTE(review): `aux_input` defaults to a mutable `{}` that is shared
+    # between calls; implementations should treat it as read-only.
+    @abstractmethod
+    def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict[str, Any]:
+        """Forward pass for inference.
+        It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
+        is considered to be the main output and you can add any other auxiliary outputs as you want.
+        We don't use `*kwargs` since it is problematic with the TorchScript API.
+        Args:
+            input (torch.Tensor): Main model input; its shape and meaning are defined by the concrete model.
+            aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
+        Returns:
+            Dict: Outputs keyed by name; must contain ``model_outputs`` for the main output.
+        """
+        # Template of the expected return value for implementations.
+        outputs_dict = {"model_outputs": None}
+        ...
+        return outputs_dict
+    @abstractmethod
+    def load_checkpoint(
+        self,
+        config: Coqpit,
+        checkpoint_path: Union[str, os.PathLike[Any]],
+        eval: bool = False,
+        strict: bool = True,
+        cache: bool = False,
+    ) -> None:
+        """Load a model checkpoint file and get ready for training or inference.
+        Args:
+            config (Coqpit): Model configuration.
+            checkpoint_path (str | os.PathLike): Path to the model checkpoint file.
+            eval (bool, optional): If true, init model for inference else for training. Defaults to False.
+            strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
+            cache (bool, optional): If True, cache the file locally for subsequent calls.
+                It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False.
+        """
+        ...

TTS/server/README.md ADDED Viewed

	@@ -0,0 +1,21 @@

+# :frog: TTS demo server
+Before you use the server, make sure you
+[install](https://github.com/idiap/coqui-ai-TTS/tree/dev#install-tts) :frog: TTS
+properly and install the additional dependencies with `pip install
+coqui-tts[server]`. Then, you can follow the steps below.
+**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal.
+Example runs:
+List officially released models.
+```python TTS/server/server.py  --list_models ```
+Run the server with the official models.
+```python TTS/server/server.py  --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
+Run the server with the official models on a GPU.
+```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py  --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda```
+Run the server with custom models.
+```python TTS/server/server.py  --model_path /path/to/tts/model.pth --config_path /path/to/tts/config.json --vocoder_path /path/to/vocoder/model.pth --vocoder_config_path /path/to/vocoder/config.json```

TTS/server/__init__.py ADDED Viewed

File without changes

TTS/server/conf.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/",  // tts model root folder
+    "tts_file":"best_model.pth",     // tts checkpoint file
+    "tts_config":"config.json",     // tts config.json file
+    "tts_speakers": null,           // json file listing speaker ids. null if no speaker embedding.
+    "vocoder_config":null,
+    "vocoder_file": null,
+    "is_wavernn_batched":true,
+    "port": 5002,
+    "use_cuda": true,
+    "debug": true
+}

TTS/server/server.py ADDED Viewed

	@@ -0,0 +1,262 @@

+#!flask/bin/python
+"""TTS demo server."""
+import argparse
+import io
+import json
+import logging
+import os
+import sys
+from pathlib import Path
+from threading import Lock
+from typing import Union
+from urllib.parse import parse_qs
+try:
+    from flask import Flask, render_template, render_template_string, request, send_file
+except ImportError as e:
+    msg = "Server requires requires flask, use `pip install coqui-tts[server]`"
+    raise ImportError(msg) from e
+from TTS.config import load_config
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+# Module-level logger used by the request handlers below.
+logger = logging.getLogger(__name__)
+# Configure the "TTS" logger at INFO level with the console formatter
+# (setup_logger comes from TTS.utils.generic_utils).
+setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
+def create_argparser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--list_models",
+        action="store_true",
+        help="list available pre-trained tts and vocoder models.",
+    )
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        default="tts_models/en/ljspeech/tacotron2-DDC",
+        help="Name of one of the pre-trained tts models in format <language>/<dataset>/<model_name>",
+    )
+    parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.")
+    # Args for running custom models
+    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default=None,
+        help="Path to model file.",
+    )
+    parser.add_argument(
+        "--vocoder_path",
+        type=str,
+        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
+        default=None,
+    )
+    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
+    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+    parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
+    parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.")
+    parser.add_argument(
+        "--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode."
+    )
+    parser.add_argument(
+        "--show_details", action=argparse.BooleanOptionalAction, default=False, help="Generate model detail page."
+    )
+    return parser
+# Module-level setup: runs at import time, parses the command line, resolves
+# the model files and builds the synthesizer shared by all request handlers.
+# parse the args
+args = create_argparser().parse_args()
+# catalogue of released models, located one directory above this package
+path = Path(__file__).parent / "../.models.json"
+manager = ModelManager(path)
+# update in-use models to the specified released models.
+model_path = None
+config_path = None
+speakers_file_path = None
+vocoder_path = None
+vocoder_config_path = None
+# CASE1: list pre-trained TTS models
+if args.list_models:
+    manager.list_models()
+    sys.exit()
+# CASE2: load pre-trained model paths
+# (only when no custom --model_path is given; the model's default vocoder is
+# used unless --vocoder_name was passed explicitly)
+if args.model_name is not None and not args.model_path:
+    model_path, config_path, model_item = manager.download_model(args.model_name)
+    args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+if args.vocoder_name is not None and not args.vocoder_path:
+    vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
+# CASE3: set custom model paths
+# (custom paths take precedence over anything resolved above)
+if args.model_path is not None:
+    model_path = args.model_path
+    config_path = args.config_path
+    speakers_file_path = args.speakers_file_path
+if args.vocoder_path is not None:
+    vocoder_path = args.vocoder_path
+    vocoder_config_path = args.vocoder_config_path
+# load models
+synthesizer = Synthesizer(
+    tts_checkpoint=model_path,
+    tts_config_path=config_path,
+    tts_speakers_file=speakers_file_path,
+    tts_languages_file=None,
+    vocoder_checkpoint=vocoder_path,
+    vocoder_config=vocoder_config_path,
+    encoder_checkpoint="",
+    encoder_config="",
+    use_cuda=args.use_cuda,
+)
+# Feature flags handed to the index page: multi-speaker / multi-language are on
+# when the model reports more than one speaker / language or a matching file is set.
+use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
+    synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
+)
+speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
+use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and (
+    synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None
+)
+language_manager = getattr(synthesizer.tts_model, "language_manager", None)
+# TODO: set this from SpeakerManager
+use_gst = synthesizer.tts_config.get("use_gst", False)
+# the Flask application; routes are registered on it below
+app = Flask(__name__)
+def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
+    """Transform an uri style_wav, in either a string (path to wav file to be use for style transfer)
+    or a dict (gst tokens/values to be use for styling)
+    Args:
+        style_wav (str): uri
+    Returns:
+        Union[str, dict]: path to file (str) or gst style (dict)
+    """
+    if style_wav:
+        if os.path.isfile(style_wav) and style_wav.endswith(".wav"):
+            return style_wav  # style_wav is a .wav file located on the server
+        style_wav = json.loads(style_wav)
+        return style_wav  # style_wav is a gst dictionary with {token1_id : token1_weigth, ...}
+    return None
+@app.route("/")
+def index():
+    return render_template(
+        "index.html",
+        show_details=args.show_details,
+        use_multi_speaker=use_multi_speaker,
+        use_multi_language=use_multi_language,
+        speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
+        language_ids=language_manager.name_to_id if language_manager is not None else None,
+        use_gst=use_gst,
+    )
+@app.route("/details")
+def details():
+    if args.config_path is not None and os.path.isfile(args.config_path):
+        model_config = load_config(args.config_path)
+    elif args.model_name is not None:
+        model_config = load_config(config_path)
+    if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path):
+        vocoder_config = load_config(args.vocoder_config_path)
+    elif args.vocoder_name is not None:
+        vocoder_config = load_config(vocoder_config_path)
+    else:
+        vocoder_config = None
+    return render_template(
+        "details.html",
+        show_details=args.show_details,
+        model_config=model_config,
+        vocoder_config=vocoder_config,
+        args=args.__dict__,
+    )
+lock = Lock()
+@app.route("/api/tts", methods=["GET", "POST"])
+def tts():
+    with lock:
+        text = request.headers.get("text") or request.values.get("text", "")
+        speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "")
+        language_idx = request.headers.get("language-id") or request.values.get("language_id", "")
+        style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "")
+        style_wav = style_wav_uri_to_dict(style_wav)
+        logger.info("Model input: %s", text)
+        logger.info("Speaker idx: %s", speaker_idx)
+        logger.info("Language idx: %s", language_idx)
+        wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
+        out = io.BytesIO()
+        synthesizer.save_wav(wavs, out)
+    return send_file(out, mimetype="audio/wav")
+# Basic MaryTTS compatibility layer
+@app.route("/locales", methods=["GET"])
+def mary_tts_api_locales():
+    """MaryTTS-compatible /locales endpoint"""
+    # NOTE: We currently assume there is only one model active at the same time
+    if args.model_name is not None:
+        model_details = args.model_name.split("/")
+    else:
+        model_details = ["", "en", "", "default"]
+    return render_template_string("{{ locale }}\n", locale=model_details[1])
+@app.route("/voices", methods=["GET"])
+def mary_tts_api_voices():
+    """MaryTTS-compatible /voices endpoint"""
+    # NOTE: We currently assume there is only one model active at the same time
+    if args.model_name is not None:
+        model_details = args.model_name.split("/")
+    else:
+        model_details = ["", "en", "", "default"]
+    return render_template_string(
+        "{{ name }} {{ locale }} {{ gender }}\n", name=model_details[3], locale=model_details[1], gender="u"
+    )
+@app.route("/process", methods=["GET", "POST"])
+def mary_tts_api_process():
+    """MaryTTS-compatible /process endpoint"""
+    with lock:
+        if request.method == "POST":
+            data = parse_qs(request.get_data(as_text=True))
+            # NOTE: we ignore param. LOCALE and VOICE for now since we have only one active model
+            text = data.get("INPUT_TEXT", [""])[0]
+        else:
+            text = request.args.get("INPUT_TEXT", "")
+        logger.info("Model input: %s", text)
+        wavs = synthesizer.tts(text)
+        out = io.BytesIO()
+        synthesizer.save_wav(wavs, out)
+    return send_file(out, mimetype="audio/wav")
+def main():
+    """Run the Flask app on the port and debug setting given on the command line."""
+    # host="::" is the IPv6 wildcard address, so the server is not limited to localhost.
+    app.run(debug=args.debug, host="::", port=args.port)
+if __name__ == "__main__":
+    main()

TTS/server/static/coqui-log-green-TTS.png ADDED Viewed

TTS/server/templates/details.html ADDED Viewed

	@@ -0,0 +1,131 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+  <meta name="description" content="">
+  <meta name="author" content="">
+  <title>TTS engine</title>
+  <!-- Bootstrap core CSS -->
+  <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
+    integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
+    rel="stylesheet">
+  <!-- Custom styles for this template -->
+  <style>
+    body {
+      padding-top: 54px;
+    }
+    @media (min-width: 992px) {
+      body {
+        padding-top: 56px;
+      }
+    }
+  </style>
+</head>
+<body>
+  <a href="https://github.com/idiap/coqui-ai-TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
+      src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
+  {% if show_details == true %}
+  <div class="container">
+    <b>Model details</b>
+  </div>
+  <div class="container">
+    <details>
+      <summary>CLI arguments:</summary>
+      <table border="1" align="center" width="75%">
+        <tr>
+          <td> CLI key </td>
+          <td> Value </td>
+        </tr>
+        {% for key, value in args.items() %}
+        <tr>
+          <td>{{ key }}</td>
+          <td>{{ value }}</td>
+        </tr>
+        {% endfor %}
+      </table>
+    </details>
+  </div></br>
+  <div class="container">
+    {% if model_config != None %}
+    <details>
+      <summary>Model config:</summary>
+      <table border="1" align="center" width="75%">
+        <tr>
+          <td> Key </td>
+          <td> Value </td>
+        </tr>
+        {% for key, value in model_config.items() %}
+        <tr>
+          <td>{{ key }}</td>
+          <td>{{ value }}</td>
+        </tr>
+        {% endfor %}
+      </table>
+    </details>
+    {% endif %}
+  </div></br>
+  <div class="container">
+    {% if vocoder_config != None %}
+    <details>
+      <summary>Vocoder model config:</summary>
+      <table border="1" align="center" width="75%">
+        <tr>
+          <td> Key </td>
+          <td> Value </td>
+        </tr>
+        {% for key, value in vocoder_config.items() %}
+        <tr>
+          <td>{{ key }}</td>
+          <td>{{ value }}</td>
+        </tr>
+        {% endfor %}
+      </table>
+    </details>
+    {% endif %}
+  </div></br>
+  {% else %}
+  <div class="container">
+    <b>Please start server with --show_details to see details.</b>
+  </div>
+  {% endif %}
+</body>
+</html>

TTS/server/templates/index.html ADDED Viewed

	@@ -0,0 +1,154 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+    <meta name="description" content="🐸Coqui AI TTS demo server.">
+    <meta name="author" content="🐸Coqui AI TTS">
+    <title>TTS engine</title>
+    <!-- Bootstrap core CSS -->
+    <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
+        integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"
+        rel="stylesheet">
+    <!-- Custom styles for this template -->
+    <style>
+        body {
+            padding-top: 54px;
+        }
+        @media (min-width: 992px) {
+            body {
+                padding-top: 56px;
+            }
+        }
+    </style>
+</head>
+<body>
+    <a href="https://github.com/idiap/coqui-ai-TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
+            src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
+    <!-- Navigation -->
+    <!--
+    <nav class="navbar navbar-expand-lg navbar-dark bg-dark fixed-top">
+      <div class="container">
+        <a class="navbar-brand" href="#">Coqui TTS</a>
+        <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false" aria-label="Toggle navigation">
+          <span class="navbar-toggler-icon"></span>
+        </button>
+        <div class="collapse navbar-collapse" id="navbarResponsive">
+          <ul class="navbar-nav ml-auto">
+            <li class="nav-item active">
+              <a class="nav-link" href="#">Home
+                <span class="sr-only">(current)</span>
+              </a>
+            </li>
+          </ul>
+        </div>
+      </div>
+    </nav>
+    -->
+    <!-- Page Content -->
+    <div class="container">
+        <div class="row">
+            <div class="col-lg-12 text-center">
+                <img class="mt-5" src="{{url_for('static', filename='coqui-log-green-TTS.png')}}" align="middle"
+                    width="512" />
+                <ul class="list-unstyled">
+                </ul>
+                {%if use_gst%}
+                <input value='{"0": 0.1}' id="style_wav" placeholder="style wav (dict or path to wav).." size=45
+                    type="text" name="style_wav">
+                {%endif%}
+                <input id="text" placeholder="Type here..." size=45 type="text" name="text">
+                <button id="speak-button" name="speak">Speak</button><br /><br />
+                {%if use_multi_speaker%}
+                Choose a speaker:
+                <select id="speaker_id" name=speaker_id method="GET" action="/">
+                    {% for speaker_id in speaker_ids %}
+                    <option value="{{speaker_id}}" SELECTED>{{speaker_id}}</option>
+                    {% endfor %}
+                </select><br /><br />
+                {%endif%}
+                {%if use_multi_language%}
+                Choose a language:
+                <select id="language_id" name=language_id method="GET" action="/">
+                    {% for language_id in language_ids %}
+                    <option value="{{language_id}}" SELECTED>{{language_id}}</option>
+                    {% endfor %}
+                </select><br /><br />
+                {%endif%}
+                {%if show_details%}
+                <button id="details-button" onclick="location.href = 'details'" name="model-details">Model
+                    Details</button><br /><br />
+                {%endif%}
+                <audio id="audio" controls autoplay hidden></audio>
+                <p id="message"></p>
+            </div>
+        </div>
+    </div>
+    <!-- Bootstrap core JavaScript -->
+    <script>
+        function getTextValue(textId) {
+            const container = q(textId)
+            if (container) {
+                return container.value
+            }
+            return ""
+        }
+        function q(selector) { return document.querySelector(selector) }
+        q('#text').focus()
+        function do_tts(e) {
+            const text = q('#text').value
+            const speaker_id = getTextValue('#speaker_id')
+            const style_wav = getTextValue('#style_wav')
+            const language_id = getTextValue('#language_id')
+            if (text) {
+                q('#message').textContent = 'Synthesizing...'
+                q('#speak-button').disabled = true
+                q('#audio').hidden = true
+                synthesize(text, speaker_id, style_wav, language_id)
+            }
+            e.preventDefault()
+            return false
+        }
+        q('#speak-button').addEventListener('click', do_tts)
+        q('#text').addEventListener('keyup', function (e) {
+            if (e.keyCode == 13) { // enter
+                do_tts(e)
+            }
+        })
+        function synthesize(text, speaker_id = "", style_wav = "", language_id = "") {
+            fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}&language_id=${encodeURIComponent(language_id)}`, { cache: 'no-cache' })
+                .then(function (res) {
+                    if (!res.ok) throw Error(res.statusText)
+                    return res.blob()
+                }).then(function (blob) {
+                    q('#message').textContent = ''
+                    q('#speak-button').disabled = false
+                    q('#audio').src = URL.createObjectURL(blob)
+                    q('#audio').hidden = false
+                }).catch(function (err) {
+                    q('#message').textContent = 'Error: ' + err.message
+                    q('#speak-button').disabled = false
+                })
+        }
+    </script>
+</body>
+</html>

TTS/tts/__init__.py ADDED Viewed

File without changes

TTS/tts/configs/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import importlib
+import os
+from inspect import isclass
+# NOTE: the automatic import of every module under configs/ (below) is commented out.
+# configs_dir = os.path.dirname(__file__)
+# for file in os.listdir(configs_dir):
+#     path = os.path.join(configs_dir, file)
+#     if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
+#         config_name = file[: file.find(".py")] if file.endswith(".py") else file
+#         module = importlib.import_module("TTS.tts.configs." + config_name)
+#         for attribute_name in dir(module):
+#             attribute = getattr(module, attribute_name)
+#             if isclass(attribute):
+#                 # Add the class to this package's variables
+#                 globals()[attribute_name] = attribute