File size: 4,363 Bytes
53f8a32
 
 
 
 
 
5b5d4c0
 
53f8a32
 
 
 
 
9846d74
53f8a32
 
 
9ef73e2
 
 
 
98923fe
9846d74
 
 
5339f1e
9846d74
 
 
53f8a32
 
 
 
 
 
 
 
 
 
 
80ee0e5
53f8a32
 
 
0f11bd1
 
53f8a32
 
fbe7d93
53f8a32
 
 
fbe7d93
babf22d
37e87fa
53f8a32
 
 
 
fbe7d93
53f8a32
 
 
 
9ef73e2
 
 
53f8a32
6473463
9ef73e2
 
 
6473463
9ef73e2
 
 
 
 
 
 
2d5fa2d
53f8a32
2d5fa2d
53f8a32
 
 
 
fbe7d93
53f8a32
fbe7d93
53f8a32
 
9ef73e2
53f8a32
 
 
 
 
 
9ef73e2
 
53f8a32
9ef73e2
6473463
9ef73e2
 
 
 
6ebe60a
9ef73e2
6ebe60a
9ef73e2
 
53f8a32
9ef73e2
53f8a32
4263bcd
53f8a32
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
Copyright 2022 Balacoon

TTS interactive demo
"""

import os
import glob
import logging
from typing import cast

import gradio as gr
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files

# local directory the files of the model repo are downloaded into
model_repo_dir = "data"
# TTS object shared by the callbacks; stays None until a model is selected
tts = None
# path of the addon that `tts` was created from
cur_model_path = None
# speakers cache: model name -> list of speakers of that model
model_to_speakers = {}
# download every file listed in the model repo at import time
for name in list_repo_files(repo_id="balacoon/tts"):
    hf_hub_download(repo_id="balacoon/tts", filename=name, local_dir=model_repo_dir)


def _ensure_model_loaded(model_name_str: str) -> None:
    """
    Makes sure the global `tts` was created from the addon `model_name_str`.

    The addon is looked up inside `model_repo_dir`. A new `TTS` object is
    created only when the addon path differs from `cur_model_path`,
    so selecting the already loaded model does not reload it.
    """
    global tts, cur_model_path
    model_path = os.path.join(model_repo_dir, model_name_str)
    if model_path != cur_model_path:
        tts = TTS(model_path)
        cur_model_path = model_path


def main():
    """
    Builds the demo UI (text box, model and speaker dropdowns,
    "Generate" button, audio output), wires the callbacks
    and launches the app with a request queue.
    """
    logging.basicConfig(level=logging.INFO)

    with gr.Blocks() as demo:
        gr.Markdown(
            """
            <h1 align="center">Balacoon🦝 Text-to-Speech</h1>

            1. Write an utterance to generate,
            2. Select the model to synthesize with
            3. Select speaker
            4. Hit "Generate" and listen to the result!

            You can learn more about models available
            [here](https://huggingface.co/balacoon/tts).
            Visit [Balacoon website](https://balacoon.com/) for more info.
            """
        )
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")

        with gr.Row():
            with gr.Column(variant="panel"):
                # os.listdir gives no ordering guarantee, sort the names
                # so the dropdown looks the same on every start
                model_files = sorted(
                    x for x in os.listdir(model_repo_dir) if x.endswith("_cpu.addon")
                )
                model_name = gr.Dropdown(
                    label="Model",
                    choices=model_files,
                )
            with gr.Column(variant="panel"):
                speaker = gr.Dropdown(label="Speaker", choices=[])

            def set_model(model_name_str: str):
                """
                gets value from `model_name` and returns an update
                for the `speaker` dropdown. either uses cached list
                of speakers for the given model name or loads the addon
                and asks it for the speakers. the last speaker of the list
                is preselected. if no model is selected or the model
                reports no speakers, the dropdown is left without a value.
                """
                if not model_name_str:
                    # dropdown was cleared: there is no addon to query
                    return gr.Dropdown.update(choices=[], value=None, visible=True)

                speakers = model_to_speakers.get(model_name_str)
                if speakers is None:
                    # need to load this model to learn the list of speakers
                    _ensure_model_loaded(model_name_str)
                    speakers = tts.get_speakers()
                    model_to_speakers[model_name_str] = speakers

                # guard against an empty list instead of failing on speakers[-1]
                value = speakers[-1] if speakers else None
                return gr.Dropdown.update(
                    choices=speakers, value=value, visible=True
                )

            model_name.change(set_model, inputs=model_name, outputs=speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(text_str: str, model_name_str: str, speaker_str: str):
            """
            gets utterance to synthesize from `text` Textbox,
            model name from `model_name` and speaker name
            from `speaker` dropdown lists. all three are required:
            if any of them is empty, nothing is synthesized and None is returned.
            otherwise synthesizes the waveform (at most first 1024 characters
            of the text are used) and returns an update for `audio` with it.
            """
            if not text_str or not model_name_str or not speaker_str:
                logging.info("text, model name or speaker are not provided")
                return None
            # switches the model if another one is loaded at the moment
            _ensure_model_loaded(model_name_str)
            # slicing truncates long text and is a no-op for short one
            samples = tts.synthesize(text_str[:1024], speaker_str)
            return gr.Audio.update(value=(tts.get_sampling_rate(), samples))

        generate.click(synthesize_audio, inputs=[text, model_name, speaker], outputs=audio)

    demo.queue(concurrency_count=1).launch()


# build and launch the demo only when the file is executed as a script
if __name__ == "__main__":
    main()