# Balacoon🦝 Text-to-Speech

"""
Copyright 2022 Balacoon

TTS interactive demo
"""

import os
import glob
import logging
from typing import cast

import gradio as gr
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files

# synthesizer that is currently in use, replaced once another model is selected
tts = None
# location on disk of the addon that is loaded into `tts`
cur_model_path = None
# speaker lists that were already looked up, keyed by model file name
model_to_speakers = {}
# local directory where files of the model repository are stored
model_repo_dir = "data"
# fetch every file listed in the model repository into `model_repo_dir`
for name in list_repo_files(repo_id="balacoon/tts"):
    hf_hub_download(repo_id="balacoon/tts", filename=name, local_dir=model_repo_dir)


def _load_model(model_name_str: str):
    """
    makes sure the global `tts` holds the addon stored under
    `model_name_str` inside `model_repo_dir` and returns it.
    The addon is (re)loaded only if nothing is loaded yet or
    a different model is currently in use.
    """
    global tts, cur_model_path
    model_path = os.path.join(model_repo_dir, model_name_str)
    if tts is None or model_path != cur_model_path:
        tts = TTS(model_path)
        cur_model_path = model_path
    return tts


def main():
    """
    builds the gradio interface of the demo (text box, model and
    speaker dropdowns, generate button, audio output), wires up
    the callbacks and launches the app.
    """
    logging.basicConfig(level=logging.INFO)
    # longest text (in characters) that is passed to synthesis,
    # anything beyond is cut off
    max_text_len = 1024

    with gr.Blocks() as demo:
        gr.Markdown(
            """
            <h1 align="center">Balacoon🦝 Text-to-Speech</h1>

            1. Write an utterance to generate,
            2. Select the model to synthesize with
            3. Select speaker
            4. Hit "Generate" and listen to the result!

            You can learn more about models available
            [here](https://huggingface.co/balacoon/tts).
            Visit [Balacoon website](https://balacoon.com/) for more info.
            """
        )
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")

        with gr.Row():
            with gr.Column(variant="panel"):
                # only CPU addons are offered; sorted because
                # os.listdir returns entries in arbitrary order
                model_files = sorted(
                    x for x in os.listdir(model_repo_dir) if x.endswith("_cpu.addon")
                )
                model_name = gr.Dropdown(
                    label="Model",
                    choices=model_files,
                )
            with gr.Column(variant="panel"):
                speaker = gr.Dropdown(label="Speaker", choices=[])

            def set_model(model_name_str: str):
                """
                gets value from `model_name` and returns an update
                for the `speaker` dropdown. either uses cached list
                of speakers for the given model name or loads the
                addon and checks what are the speakers.
                The last speaker of the list is preselected.
                """
                if not model_name_str:
                    # no model is selected, so there is nothing to load
                    return gr.Dropdown.update(choices=[], value=None, visible=True)
                speakers = model_to_speakers.get(model_name_str)
                if speakers is None:
                    # need to load this model to learn the list of speakers
                    speakers = _load_model(model_name_str).get_speakers()
                    model_to_speakers[model_name_str] = speakers
                # guard against a model that reports no speakers at all
                value = speakers[-1] if speakers else None
                return gr.Dropdown.update(
                    choices=speakers, value=value, visible=True
                )

            model_name.change(set_model, inputs=model_name, outputs=speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(text_str: str, model_name_str: str, speaker_str: str):
            """
            gets utterance to synthesize from `text` Textbox,
            model name from `model_name` and speaker name
            from `speaker` dropdown list.
            If any of the three is empty, nothing is synthesized
            and None is returned. Otherwise text is truncated to
            `max_text_len` characters, the waveform is synthesized
            and `audio` is updated with it.
            """
            if not (text_str and model_name_str and speaker_str):
                logging.info("text, model name or speaker are not provided")
                return None
            # reloads the model only if another one is currently in use
            model = _load_model(model_name_str)
            # slicing is a no-op for text that is short enough
            samples = model.synthesize(text_str[:max_text_len], speaker_str)
            return gr.Audio.update(value=(model.get_sampling_rate(), samples))

        generate.click(synthesize_audio, inputs=[text, model_name, speaker], outputs=audio)

    # NOTE(review): presumably limited to one worker because callbacks
    # share the global `tts` - confirm before raising the concurrency
    demo.queue(concurrency_count=1).launch()


# launch the demo only when this file is executed as a script
if __name__ == "__main__":
    main()