txya900619 committed on
Commit
bde200c
·
1 Parent(s): 61f51e4

feat: let se model can use custom emb

Browse files
Files changed (1) hide show
  1. app.py +74 -17
app.py CHANGED
@@ -14,6 +14,7 @@ from replace.tts import ChangedVitsConfig
14
 
15
  TTS.tts.configs.vits_config.VitsConfig = ChangedVitsConfig
16
 
 
17
  def load_model(model_id):
18
  model_dir = snapshot_download(model_id)
19
  config_file_path = os.path.join(model_dir, "config.json")
@@ -34,11 +35,20 @@ def load_model(model_id):
34
  f.close()
35
  return Synthesizer(tts_checkpoint=model_ckpt_path, tts_config_path=temp_config_path)
36
 
 
37
  OmegaConf.register_new_resolver("load_model", load_model)
38
 
39
  models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
40
 
41
- def text_to_speech(model_id: str, speaker: str, dialect, text: str):
 
 
 
 
 
 
 
 
42
  model = models_config[model_id]["model"]
43
  if len(text) == 0:
44
  raise gr.Error("請勿輸入空字串。")
@@ -47,21 +57,44 @@ def text_to_speech(model_id: str, speaker: str, dialect, text: str):
47
  raise gr.Error(
48
  f"句子中的[{','.join(missing_words)}]目前無法轉成 ipa。請嘗試其他句子。"
49
  )
50
-
51
- wav = model.tts(
52
- parse_ipa(ipa),
53
- speaker_name=speaker,
54
- language_name=dialect,
55
- split_sentences=False,
56
- )
 
 
 
 
 
 
 
57
 
58
  return words, pinyin, (16000, np.array(wav))
59
 
 
60
  def when_model_selected(model_id):
61
  model_config = models_config[model_id]
62
- speaker_drop_down_choices = [(k,v) for k, v in model_config["speaker_mapping"].items()]
 
 
63
  dialect_drop_down_choices = model_config["avalible_dialect"]
64
- return gr.update(choices=speaker_drop_down_choices), gr.update(choices=dialect_drop_down_choices)
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  demo = gr.Blocks(
@@ -79,29 +112,51 @@ demo = gr.Blocks(
79
  )
80
 
81
  with demo:
82
-
83
  default_model_id = list(models_config.keys())[0]
84
  model_drop_down = gr.Dropdown(
85
  models_config.keys(),
86
  value=default_model_id,
87
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  speaker_drop_down = gr.Dropdown(
89
- choices=[(k,v) for k, v in models_config[default_model_id]["speaker_mapping"].items()],
90
- value=list(models_config[default_model_id]["speaker_mapping"].values())[0]
 
 
 
91
  )
 
 
 
 
 
 
92
  dialect_drop_down = gr.Dropdown(
93
  choices=models_config[default_model_id]["avalible_dialect"],
94
- value=models_config[default_model_id]["avalible_dialect"][0]
95
  )
96
 
97
  model_drop_down.input(
98
  when_model_selected,
99
  inputs=[model_drop_down],
100
- outputs=[speaker_drop_down, dialect_drop_down]
101
  )
102
 
103
-
104
-
105
  gr.Markdown(
106
  """
107
  # 臺灣客語語音生成系統
@@ -111,6 +166,8 @@ with demo:
111
  text_to_speech,
112
  inputs=[
113
  model_drop_down,
 
 
114
  speaker_drop_down,
115
  dialect_drop_down,
116
  gr.Textbox(),
 
14
 
15
  TTS.tts.configs.vits_config.VitsConfig = ChangedVitsConfig
16
 
17
+
18
  def load_model(model_id):
19
  model_dir = snapshot_download(model_id)
20
  config_file_path = os.path.join(model_dir, "config.json")
 
35
  f.close()
36
  return Synthesizer(tts_checkpoint=model_ckpt_path, tts_config_path=temp_config_path)
37
 
38
+
39
  OmegaConf.register_new_resolver("load_model", load_model)
40
 
41
  models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
42
 
43
+
44
+ def text_to_speech(
45
+ model_id: str,
46
+ use_default_emb_or_custom: str,
47
+ speaker_wav,
48
+ speaker: str,
49
+ dialect,
50
+ text: str,
51
+ ):
52
  model = models_config[model_id]["model"]
53
  if len(text) == 0:
54
  raise gr.Error("請勿輸入空字串。")
 
57
  raise gr.Error(
58
  f"句子中的[{','.join(missing_words)}]目前無法轉成 ipa。請嘗試其他句子。"
59
  )
60
+ if use_default_emb_or_custom == "default":
61
+ wav = model.tts(
62
+ parse_ipa(ipa),
63
+ speaker_name=speaker,
64
+ language_name=dialect,
65
+ split_sentences=False,
66
+ )
67
+ else:
68
+ wav = model.tts(
69
+ parse_ipa(ipa),
70
+ speaker_wav=speaker_wav,
71
+ language_name=dialect,
72
+ split_sentences=False,
73
+ )
74
 
75
  return words, pinyin, (16000, np.array(wav))
76
 
77
+
78
  def when_model_selected(model_id):
79
  model_config = models_config[model_id]
80
+ speaker_drop_down_choices = [
81
+ (k, v) for k, v in model_config["speaker_mapping"].items()
82
+ ]
83
  dialect_drop_down_choices = model_config["avalible_dialect"]
84
+ use_default_emb_or_ref_radio_visible = False
85
+ if model_config["model"].tts_model.config.model_args.speaker_encoder_model_path:
86
+ use_default_emb_or_ref_radio_visible = True
87
+ return (
88
+ gr.update(choices=speaker_drop_down_choices),
89
+ gr.update(choices=dialect_drop_down_choices),
90
+ gr.update(visible=use_default_emb_or_ref_radio_visible),
91
+ )
92
+
93
+
94
+ def use_default_emb_or_custom_radio_input(use_default_emb_or_custom):
95
+ if use_default_emb_or_custom == "custom":
96
+ return gr.update(visible=True), gr.update(visible=False)
97
+ return gr.update(visible=False), gr.update(visible=True)
98
 
99
 
100
  demo = gr.Blocks(
 
112
  )
113
 
114
  with demo:
 
115
  default_model_id = list(models_config.keys())[0]
116
  model_drop_down = gr.Dropdown(
117
  models_config.keys(),
118
  value=default_model_id,
119
  )
120
+ use_default_emb_or_custom_radio = gr.Radio(
121
+ label="use default speaker embedding or custom speaker embedding",
122
+ choices=["default", "custom"],
123
+ value="default",
124
+ visible=False,
125
+ )
126
+ speaker_wav = gr.Microphone(
127
+ label="speaker wav",
128
+ visible=False,
129
+ editable=False,
130
+ type="filepath",
131
+ waveform_options=gr.WaveformOptions(
132
+ show_controls=False,
133
+ sample_rate=16000,
134
+ ),
135
+ )
136
  speaker_drop_down = gr.Dropdown(
137
+ choices=[
138
+ (k, v)
139
+ for k, v in models_config[default_model_id]["speaker_mapping"].items()
140
+ ],
141
+ value=list(models_config[default_model_id]["speaker_mapping"].values())[0],
142
  )
143
+ use_default_emb_or_custom_radio.input(
144
+ use_default_emb_or_custom_radio_input,
145
+ inputs=[use_default_emb_or_custom_radio],
146
+ outputs=[speaker_wav, speaker_drop_down],
147
+ )
148
+
149
  dialect_drop_down = gr.Dropdown(
150
  choices=models_config[default_model_id]["avalible_dialect"],
151
+ value=models_config[default_model_id]["avalible_dialect"][0],
152
  )
153
 
154
  model_drop_down.input(
155
  when_model_selected,
156
  inputs=[model_drop_down],
157
+ outputs=[speaker_drop_down, dialect_drop_down, use_default_emb_or_custom_radio],
158
  )
159
 
 
 
160
  gr.Markdown(
161
  """
162
  # 臺灣客語語音生成系統
 
166
  text_to_speech,
167
  inputs=[
168
  model_drop_down,
169
+ use_default_emb_or_custom_radio,
170
+ speaker_wav,
171
  speaker_drop_down,
172
  dialect_drop_down,
173
  gr.Textbox(),