Spaces:
Sleeping
Sleeping
txya900619
committed on
Commit
·
bde200c
1
Parent(s):
61f51e4
feat: let se model use custom emb
Browse files
app.py
CHANGED
@@ -14,6 +14,7 @@ from replace.tts import ChangedVitsConfig
|
|
14 |
|
15 |
TTS.tts.configs.vits_config.VitsConfig = ChangedVitsConfig
|
16 |
|
|
|
17 |
def load_model(model_id):
|
18 |
model_dir = snapshot_download(model_id)
|
19 |
config_file_path = os.path.join(model_dir, "config.json")
|
@@ -34,11 +35,20 @@ def load_model(model_id):
|
|
34 |
f.close()
|
35 |
return Synthesizer(tts_checkpoint=model_ckpt_path, tts_config_path=temp_config_path)
|
36 |
|
|
|
37 |
OmegaConf.register_new_resolver("load_model", load_model)
|
38 |
|
39 |
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
|
40 |
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
model = models_config[model_id]["model"]
|
43 |
if len(text) == 0:
|
44 |
raise gr.Error("請勿輸入空字串。")
|
@@ -47,21 +57,44 @@ def text_to_speech(model_id: str, speaker: str, dialect, text: str):
|
|
47 |
raise gr.Error(
|
48 |
f"句子中的[{','.join(missing_words)}]目前無法轉成 ipa。請嘗試其他句子。"
|
49 |
)
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
return words, pinyin, (16000, np.array(wav))
|
59 |
|
|
|
60 |
def when_model_selected(model_id):
|
61 |
model_config = models_config[model_id]
|
62 |
-
speaker_drop_down_choices = [
|
|
|
|
|
63 |
dialect_drop_down_choices = model_config["avalible_dialect"]
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
|
67 |
demo = gr.Blocks(
|
@@ -79,29 +112,51 @@ demo = gr.Blocks(
|
|
79 |
)
|
80 |
|
81 |
with demo:
|
82 |
-
|
83 |
default_model_id = list(models_config.keys())[0]
|
84 |
model_drop_down = gr.Dropdown(
|
85 |
models_config.keys(),
|
86 |
value=default_model_id,
|
87 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
speaker_drop_down = gr.Dropdown(
|
89 |
-
choices=[
|
90 |
-
|
|
|
|
|
|
|
91 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
dialect_drop_down = gr.Dropdown(
|
93 |
choices=models_config[default_model_id]["avalible_dialect"],
|
94 |
-
value=models_config[default_model_id]["avalible_dialect"][0]
|
95 |
)
|
96 |
|
97 |
model_drop_down.input(
|
98 |
when_model_selected,
|
99 |
inputs=[model_drop_down],
|
100 |
-
outputs=[speaker_drop_down, dialect_drop_down]
|
101 |
)
|
102 |
|
103 |
-
|
104 |
-
|
105 |
gr.Markdown(
|
106 |
"""
|
107 |
# 臺灣客語語音生成系統
|
@@ -111,6 +166,8 @@ with demo:
|
|
111 |
text_to_speech,
|
112 |
inputs=[
|
113 |
model_drop_down,
|
|
|
|
|
114 |
speaker_drop_down,
|
115 |
dialect_drop_down,
|
116 |
gr.Textbox(),
|
|
|
14 |
|
15 |
TTS.tts.configs.vits_config.VitsConfig = ChangedVitsConfig
|
16 |
|
17 |
+
|
18 |
def load_model(model_id):
|
19 |
model_dir = snapshot_download(model_id)
|
20 |
config_file_path = os.path.join(model_dir, "config.json")
|
|
|
35 |
f.close()
|
36 |
return Synthesizer(tts_checkpoint=model_ckpt_path, tts_config_path=temp_config_path)
|
37 |
|
38 |
+
|
39 |
OmegaConf.register_new_resolver("load_model", load_model)
|
40 |
|
41 |
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
|
42 |
|
43 |
+
|
44 |
+
def text_to_speech(
|
45 |
+
model_id: str,
|
46 |
+
use_default_emb_or_custom: str,
|
47 |
+
speaker_wav,
|
48 |
+
speaker: str,
|
49 |
+
dialect,
|
50 |
+
text: str,
|
51 |
+
):
|
52 |
model = models_config[model_id]["model"]
|
53 |
if len(text) == 0:
|
54 |
raise gr.Error("請勿輸入空字串。")
|
|
|
57 |
raise gr.Error(
|
58 |
f"句子中的[{','.join(missing_words)}]目前無法轉成 ipa。請嘗試其他句子。"
|
59 |
)
|
60 |
+
if use_default_emb_or_custom == "default":
|
61 |
+
wav = model.tts(
|
62 |
+
parse_ipa(ipa),
|
63 |
+
speaker_name=speaker,
|
64 |
+
language_name=dialect,
|
65 |
+
split_sentences=False,
|
66 |
+
)
|
67 |
+
else:
|
68 |
+
wav = model.tts(
|
69 |
+
parse_ipa(ipa),
|
70 |
+
speaker_wav=speaker_wav,
|
71 |
+
language_name=dialect,
|
72 |
+
split_sentences=False,
|
73 |
+
)
|
74 |
|
75 |
return words, pinyin, (16000, np.array(wav))
|
76 |
|
77 |
+
|
78 |
def when_model_selected(model_id):
    """Refresh the model-dependent controls after a model is picked.

    Args:
        model_id: Key into ``models_config`` identifying the selected model.

    Returns:
        A 3-tuple of ``gr.update`` objects, in this order:
        the speaker dropdown (new choices and a reset value), the dialect
        dropdown (new choices and a reset value), and the default/custom
        embedding radio (visibility only).
    """
    model_config = models_config[model_id]

    # Dropdown choices are (label, value) pairs taken from the speaker mapping.
    speaker_drop_down_choices = list(model_config["speaker_mapping"].items())
    dialect_drop_down_choices = model_config["avalible_dialect"]

    # Reset each selection to the first entry of the new model, the same
    # default the dropdowns are created with. Updating only `choices` keeps
    # the previously selected value, which may not be valid for this model.
    default_speaker = (
        speaker_drop_down_choices[0][1] if speaker_drop_down_choices else None
    )
    default_dialect = (
        dialect_drop_down_choices[0] if dialect_drop_down_choices else None
    )

    # The radio is shown only when the model config carries a truthy
    # speaker_encoder_model_path.
    use_default_emb_or_ref_radio_visible = bool(
        model_config["model"].tts_model.config.model_args.speaker_encoder_model_path
    )

    return (
        gr.update(choices=speaker_drop_down_choices, value=default_speaker),
        gr.update(choices=dialect_drop_down_choices, value=default_dialect),
        gr.update(visible=use_default_emb_or_ref_radio_visible),
    )
|
92 |
+
|
93 |
+
|
94 |
+
def use_default_emb_or_custom_radio_input(use_default_emb_or_custom):
    """Toggle which speaker input is visible for the chosen radio option.

    Args:
        use_default_emb_or_custom: The radio's current value; the string
            "custom" selects the recorded-speaker input, anything else
            selects the speaker dropdown.

    Returns:
        A pair of ``gr.update`` objects. The first is visible only for
        "custom" and the second only otherwise, so exactly one of the two
        target components is shown.
    """
    wants_custom = use_default_emb_or_custom == "custom"
    show_wav_input = gr.update(visible=wants_custom)
    show_speaker_list = gr.update(visible=not wants_custom)
    return show_wav_input, show_speaker_list
|
98 |
|
99 |
|
100 |
demo = gr.Blocks(
|
|
|
112 |
)
|
113 |
|
114 |
with demo:
|
|
|
115 |
default_model_id = list(models_config.keys())[0]
|
116 |
model_drop_down = gr.Dropdown(
|
117 |
models_config.keys(),
|
118 |
value=default_model_id,
|
119 |
)
|
120 |
+
use_default_emb_or_custom_radio = gr.Radio(
|
121 |
+
label="use default speaker embedding or custom speaker embedding",
|
122 |
+
choices=["default", "custom"],
|
123 |
+
value="default",
|
124 |
+
visible=False,
|
125 |
+
)
|
126 |
+
speaker_wav = gr.Microphone(
|
127 |
+
label="speaker wav",
|
128 |
+
visible=False,
|
129 |
+
editable=False,
|
130 |
+
type="filepath",
|
131 |
+
waveform_options=gr.WaveformOptions(
|
132 |
+
show_controls=False,
|
133 |
+
sample_rate=16000,
|
134 |
+
),
|
135 |
+
)
|
136 |
speaker_drop_down = gr.Dropdown(
|
137 |
+
choices=[
|
138 |
+
(k, v)
|
139 |
+
for k, v in models_config[default_model_id]["speaker_mapping"].items()
|
140 |
+
],
|
141 |
+
value=list(models_config[default_model_id]["speaker_mapping"].values())[0],
|
142 |
)
|
143 |
+
use_default_emb_or_custom_radio.input(
|
144 |
+
use_default_emb_or_custom_radio_input,
|
145 |
+
inputs=[use_default_emb_or_custom_radio],
|
146 |
+
outputs=[speaker_wav, speaker_drop_down],
|
147 |
+
)
|
148 |
+
|
149 |
dialect_drop_down = gr.Dropdown(
|
150 |
choices=models_config[default_model_id]["avalible_dialect"],
|
151 |
+
value=models_config[default_model_id]["avalible_dialect"][0],
|
152 |
)
|
153 |
|
154 |
model_drop_down.input(
|
155 |
when_model_selected,
|
156 |
inputs=[model_drop_down],
|
157 |
+
outputs=[speaker_drop_down, dialect_drop_down, use_default_emb_or_custom_radio],
|
158 |
)
|
159 |
|
|
|
|
|
160 |
gr.Markdown(
|
161 |
"""
|
162 |
# 臺灣客語語音生成系統
|
|
|
166 |
text_to_speech,
|
167 |
inputs=[
|
168 |
model_drop_down,
|
169 |
+
use_default_emb_or_custom_radio,
|
170 |
+
speaker_wav,
|
171 |
speaker_drop_down,
|
172 |
dialect_drop_down,
|
173 |
gr.Textbox(),
|