zhzluke96 committed on
Commit
ae79826
1 Parent(s): b473486
modules/repos_static/resemble_enhance/inference.py CHANGED
@@ -127,6 +127,8 @@ def inference(
127
  ):
128
  if config.runtime_env_vars.off_tqdm:
129
  trange = range
 
 
130
 
131
  remove_weight_norm_recursively(model)
132
 
 
127
  ):
128
  if config.runtime_env_vars.off_tqdm:
129
  trange = range
130
+ else:
131
+ from tqdm import trange
132
 
133
  remove_weight_norm_recursively(model)
134
 
modules/utils/audio.py CHANGED
@@ -19,7 +19,11 @@ def audio_to_int16(audio_data):
19
  return audio_data
20
 
21
 
22
- def audiosegment_to_librosawav(audiosegment):
 
 
 
 
23
  channel_sounds = audiosegment.split_to_mono()
24
  samples = [s.get_array_of_samples() for s in channel_sounds]
25
 
 
19
  return audio_data
20
 
21
 
22
+ def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
23
+ """
24
+ Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels],
25
+ where each value is in range [-1.0, 1.0].
26
+ """
27
  channel_sounds = audiosegment.split_to_mono()
28
  samples = [s.get_array_of_samples() for s in channel_sounds]
29
 
modules/webui/app.py CHANGED
@@ -8,10 +8,11 @@ from modules import config
8
  from modules.webui import webui_config
9
 
10
  from modules.webui.changelog_tab import create_changelog_tab
 
11
  from modules.webui.system_tab import create_system_tab
12
  from modules.webui.tts_tab import create_tts_interface
13
- from modules.webui.ssml_tab import create_ssml_interface
14
- from modules.webui.spliter_tab import create_spliter_tab
15
  from modules.webui.speaker_tab import create_speaker_panel
16
  from modules.webui.readme_tab import create_readme_tab
17
 
@@ -86,10 +87,17 @@ def create_interface():
86
  create_tts_interface()
87
 
88
  with gr.TabItem("SSML", id="ssml"):
89
- ssml_input = create_ssml_interface()
90
-
91
- with gr.TabItem("Spilter"):
92
- create_spliter_tab(ssml_input, tabs=tabs)
 
 
 
 
 
 
 
93
 
94
  with gr.TabItem("Speaker"):
95
  create_speaker_panel()
 
8
  from modules.webui import webui_config
9
 
10
  from modules.webui.changelog_tab import create_changelog_tab
11
+ from modules.webui.ssml.podcast_tab import create_ssml_podcast_tab
12
  from modules.webui.system_tab import create_system_tab
13
  from modules.webui.tts_tab import create_tts_interface
14
+ from modules.webui.ssml.ssml_tab import create_ssml_interface
15
+ from modules.webui.ssml.spliter_tab import create_spliter_tab
16
  from modules.webui.speaker_tab import create_speaker_panel
17
  from modules.webui.readme_tab import create_readme_tab
18
 
 
87
  create_tts_interface()
88
 
89
  with gr.TabItem("SSML", id="ssml"):
90
+ with gr.Tabs() as ssml_tabs:
91
+ with gr.TabItem("Editor", id="ssml.editor"):
92
+ ssml_input = create_ssml_interface()
93
+ with gr.TabItem("Spilter"):
94
+ create_spliter_tab(
95
+ ssml_input=ssml_input, tabs1=tabs, tabs2=ssml_tabs
96
+ )
97
+ with gr.TabItem("Podcast"):
98
+ create_ssml_podcast_tab(
99
+ ssml_input=ssml_input, tabs1=tabs, tabs2=ssml_tabs
100
+ )
101
 
102
  with gr.TabItem("Speaker"):
103
  create_speaker_panel()
modules/webui/speaker/speaker_merger.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
3
  import torch
4
 
5
  from modules.hf import spaces
 
6
  from modules.webui.webui_utils import get_speakers, tts_generate
7
  from modules.speaker import speaker_mgr, Speaker
8
 
@@ -138,23 +139,19 @@ merge_desc = """
138
  """
139
 
140
 
141
- def get_spk_choices():
142
- speakers = get_speakers()
143
-
144
- speaker_names = ["None"] + [get_speaker_show_name(speaker) for speaker in speakers]
145
- return speaker_names
146
-
147
-
148
  # 显示 a b c d 四个选择框,选择一个或多个,然后可以试音,并导出
149
  def create_speaker_merger():
150
- speaker_names = get_spk_choices()
 
 
 
151
 
152
  gr.Markdown(merge_desc)
153
 
154
  def spk_picker(label_tail: str):
155
  with gr.Row():
156
  spk_a = gr.Dropdown(
157
- choices=speaker_names, value="None", label=f"Speaker {label_tail}"
158
  )
159
  refresh_a_btn = gr.Button("🔄", variant="secondary")
160
 
 
3
  import torch
4
 
5
  from modules.hf import spaces
6
+ from modules.webui import webui_utils
7
  from modules.webui.webui_utils import get_speakers, tts_generate
8
  from modules.speaker import speaker_mgr, Speaker
9
 
 
139
  """
140
 
141
 
 
 
 
 
 
 
 
142
  # 显示 a b c d 四个选择框,选择一个或多个,然后可以试音,并导出
143
  def create_speaker_merger():
144
+ def get_spk_choices():
145
+ speakers, speaker_names = webui_utils.get_speaker_names()
146
+ speaker_names = ["None"] + speaker_names
147
+ return speaker_names
148
 
149
  gr.Markdown(merge_desc)
150
 
151
  def spk_picker(label_tail: str):
152
  with gr.Row():
153
  spk_a = gr.Dropdown(
154
+ choices=get_spk_choices(), value="None", label=f"Speaker {label_tail}"
155
  )
156
  refresh_a_btn = gr.Button("🔄", variant="secondary")
157
 
modules/webui/ssml/__init__.py ADDED
File without changes
modules/webui/ssml/podcast_tab.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import torch
4
+
5
+ from modules.normalization import text_normalize
6
+ from modules.webui import webui_utils
7
+ from modules.hf import spaces
8
+
9
# Default script shown in the podcast tab's table.
# Each row is [index, speaker, text, style]; indices are 1-based and every
# line of the sample uses the "chat" style.
podcast_default_case = [
    [index, speaker, text, "chat"]
    for index, (speaker, text) in enumerate(
        [
            ("female2", "你好,欢迎收听今天的播客内容。今天我们要聊的是中华料理。"),
            ("Alice", "嗨,我特别期待这个话题!中华料理真的是博大精深。"),
            ("Bob", "没错,中华料理有着几千年的历史,而且每个地区都有自己的特色菜。"),
            ("female2", "那我们先从最有名的川菜开始吧。川菜以其麻辣著称,是很多人的最爱。"),
            ("Alice", "对,我特别喜欢吃麻婆豆腐和辣子鸡。那种麻辣的感觉真是让人难以忘怀。"),
            ("Bob", "除了川菜,粤菜也是很受欢迎的。粤菜讲究鲜美,像是白切鸡和蒸鱼都是经典。"),
            ("female2", "对啊,粤菜的烹饪方式比较清淡,更注重食材本身的味道。"),
            ("Alice", "还有北京的京菜,像北京烤鸭,那可是来北京必吃的美食。"),
            ("Bob", "不仅如此,还有淮扬菜、湘菜、鲁菜等等,每个菜系都有其独特的风味。"),
            ("female2", "对对对,像淮扬菜的狮子头,湘菜的剁椒鱼头,都是让人垂涎三尺的美味。"),
        ],
        start=1,
    )
]
51
+
52
+
53
# NOTE: required because text_normalize uses the tokenizer
@torch.inference_mode()
@spaces.GPU
def merge_dataframe_to_ssml(msg, spk, style, df: pd.DataFrame):
    """
    Render the podcast script table as an SSML document.

    Every row of ``df`` becomes one ``<voice>`` element. The row's "speaker"
    and "style" values become the ``spk`` / ``style`` attributes (omitted when
    empty or when they hold the "-1" / "*auto" placeholders), and the row's
    "text" is passed through ``text_normalize`` to form the element body.

    Args:
        msg: current value of the message box; returned untouched.
        spk: current value of the speaker dropdown; returned untouched.
        style: current value of the style dropdown; returned untouched.
        df: script table with "speaker", "text" and "style" columns.

    Returns:
        ``(msg, spk, style, ssml)``. The first three are echoed back so the
        bound widgets show a loading state while the SSML is being built.
    """
    indent = " " * 2
    pieces = []

    # Use row-local names: the widget values must be returned exactly as
    # received, so the parameters may not be overwritten by row data.
    for _, row in df.iterrows():
        text = row.get("text")
        row_spk = row.get("speaker")
        row_style = row.get("style")

        # Placeholders mean "not set" and must not leak into the SSML
        # (same convention as the splitter tab's SSML builder).
        if row_spk == "-1" or row_spk == -1:
            row_spk = None
        if row_style == "*auto":
            row_style = None

        pieces.append(f"{indent}<voice")
        if row_spk:
            pieces.append(f' spk="{row_spk}"')
        if row_style:
            pieces.append(f' style="{row_style}"')
        pieces.append(">\n")
        pieces.append(f"{indent}{indent}{text_normalize(text)}\n")
        pieces.append(f"{indent}</voice>\n")

    ssml = "".join(pieces)
    # The inputs are echoed back unchanged to trigger the loading effect.
    return msg, spk, style, f"<speak version='0.1'>\n{ssml}</speak>"
75
+
76
+
77
def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Tabs):
    """
    Build the "Podcast" tab: a small editor for a multi-speaker script.

    The user picks a speaker and style, types a message and appends it to the
    script table; "Send to SSML" converts the table to SSML, writes it into
    ``ssml_input`` and switches ``tabs1`` / ``tabs2`` to the SSML editor.

    Args:
        ssml_input: textbox of the SSML editor that receives the result.
        tabs1: outer tab container (switched to the tab with id "ssml").
        tabs2: inner SSML tab container (switched to id "ssml.editor").
    """
    table_columns = ["index", "speaker", "text", "style"]

    def get_spk_choices():
        # "-1" stands for "no explicit speaker"; the rest are display names.
        _, speaker_names = webui_utils.get_speaker_names()
        return ["-1"] + speaker_names

    def speaker_id(choice):
        # Display names are "<gender> : <name>" or just "<name>"; taking the
        # last segment works for both (and for the bare "-1" entry).
        return choice.split(" : ")[-1].strip()

    styles = ["*auto"] + [s.get("name") for s in webui_utils.get_styles()]

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                spk_input_dropdown = gr.Dropdown(
                    choices=get_spk_choices(),
                    interactive=True,
                    value="female : female2",
                    show_label=False,
                )
                style_input_dropdown = gr.Dropdown(
                    choices=styles,
                    # label="Choose Style",
                    interactive=True,
                    show_label=False,
                    value="*auto",
                )
            with gr.Group():
                msg = gr.Textbox(
                    lines=5, label="Message", placeholder="Type speaker message here"
                )
                add = gr.Button("Add")
                undo = gr.Button("Undo")
                clear = gr.Button("Clear")
        with gr.Column(scale=5):
            with gr.Group():
                gr.Markdown("📔Script")
                script_table = gr.DataFrame(
                    headers=table_columns,
                    datatype=["number", "str", "str", "str"],
                    interactive=False,
                    wrap=True,
                    value=podcast_default_case,
                    row_count=(0, "dynamic"),
                )

            send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")

    def add_message(msg, spk, style, sheet: pd.DataFrame):
        """Append the typed message as a new script row and clear the box."""
        if not msg:
            return "", sheet

        speaker = speaker_id(spk) if isinstance(spk, str) else ""
        if speaker == "-1":
            # "No explicit speaker": keep the cell empty so no spk attribute
            # is emitted for this row.
            speaker = ""

        # A table with no rows, or a single row lacking a "text" field, is
        # treated as empty and replaced rather than extended.
        is_empty = sheet.empty or (sheet.shape[0] == 1 and "text" not in sheet.iloc[0])

        data = pd.DataFrame(
            {
                # 1-based, matching the numbering of the default script.
                "index": [1 if is_empty else sheet.shape[0] + 1],
                "speaker": [speaker],
                "text": [msg],
                "style": [style],
            },
        )

        if is_empty:
            sheet = data
        else:
            sheet = pd.concat([sheet, data], ignore_index=True)
        return "", sheet

    def undo_message(msg, spk, style, sheet: pd.DataFrame):
        """Pop the last script row back into the input widgets."""
        if sheet.empty:
            return msg, spk, style, sheet
        last = sheet.iloc[-1]
        sheet = sheet.iloc[:-1]

        raw_speaker = last["speaker"]
        wanted = raw_speaker.strip() if isinstance(raw_speaker, str) else ""
        wanted = wanted or "-1"  # empty cell <-> the "-1" dropdown entry
        # Exact match on the speaker name; keep the current selection when the
        # speaker is no longer available instead of blanking the dropdown.
        restored = next(
            (choice for choice in get_spk_choices() if speaker_id(choice) == wanted),
            spk,
        )
        return last["text"], restored, last["style"], sheet

    def clear_message():
        """Reset the message box and empty the script table."""
        return "", pd.DataFrame(columns=table_columns)

    def send_to_ssml(msg, spk, style, sheet: pd.DataFrame):
        """Convert the script to SSML and jump to the SSML editor tab."""
        if sheet.empty:
            # gr.Error is an exception: it only reaches the UI when raised.
            raise gr.Error("Please add some text to the script table.")
        msg, spk, style, ssml = merge_dataframe_to_ssml(msg, spk, style, sheet)
        return [
            msg,
            spk,
            style,
            gr.Textbox(value=ssml),
            gr.Tabs(selected="ssml"),
            gr.Tabs(selected="ssml.editor"),
        ]

    editor_inputs = [msg, spk_input_dropdown, style_input_dropdown, script_table]

    msg.submit(
        add_message,
        inputs=editor_inputs,
        outputs=[msg, script_table],
    )
    add.click(
        add_message,
        inputs=editor_inputs,
        outputs=[msg, script_table],
    )
    undo.click(
        undo_message,
        inputs=editor_inputs,
        outputs=[msg, spk_input_dropdown, style_input_dropdown, script_table],
    )
    clear.click(
        clear_message,
        outputs=[msg, script_table],
    )
    send_to_ssml_btn.click(
        send_to_ssml,
        inputs=editor_inputs,
        outputs=[
            msg,
            spk_input_dropdown,
            style_input_dropdown,
            ssml_input,
            tabs1,
            tabs2,
        ],
    )
modules/webui/ssml/spliter_tab.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from modules.normalization import text_normalize
4
+ from modules.webui import webui_utils
5
+ from modules.webui.webui_utils import (
6
+ get_speakers,
7
+ get_styles,
8
+ split_long_text,
9
+ )
10
+ from modules.hf import spaces
11
+
12
+
13
# NOTE: required because text_normalize uses the tokenizer
@torch.inference_mode()
@spaces.GPU
def merge_dataframe_to_ssml(dataframe, spk, style, seed):
    """
    Render the split-text table as an SSML document with one voice per row.

    All rows share the same ``spk`` / ``style`` / ``seed`` attributes. The
    placeholders "-1" (speaker or seed) and "*auto" (style) mean "not set"
    and cause the corresponding attribute to be omitted. The text of each
    row is read from its second column and passed through ``text_normalize``.

    Args:
        dataframe: table whose second column holds the text segments.
        spk: speaker name or seed as entered in the UI.
        style: style name, or "*auto".
        seed: inference seed, or -1 for none.

    Returns:
        ``(dataframe, spk, style, seed, ssml)``. The first four are the inputs
        exactly as received, echoed back so the bound widgets show a loading
        state without having their values altered.
    """
    # Normalise into separate names: the raw widget values are returned
    # below and must not be replaced by None.
    voice_spk = None if (spk == "-1" or spk == -1) else spk
    voice_style = None if style == "*auto" else style
    voice_seed = None if (seed == -1 or seed == "-1") else seed

    # The attributes are identical for every row, so build them once.
    attrs = ""
    if voice_spk:
        attrs += f' spk="{voice_spk}"'
    if voice_style:
        attrs += f' style="{voice_style}"'
    # Explicit check instead of truthiness so a seed of 0 is kept.
    if voice_seed is not None and voice_seed != "":
        attrs += f' seed="{voice_seed}"'

    indent = " " * 2
    voices = [
        f"{indent}<voice{attrs}>\n"
        f"{indent}{indent}{text_normalize(row.iloc[1])}\n"
        f"{indent}</voice>\n"
        for _, row in dataframe.iterrows()
    ]
    ssml = "".join(voices)
    # The inputs are echoed back unchanged to trigger the loading effect.
    return dataframe, spk, style, seed, f"<speak version='0.1'>\n{ssml}</speak>"
40
+
41
+
42
# Long-text handling.
# A long text can be entered and split; the resulting segments are shown in a
# data table and can then be merged into SSML and sent to the SSML tab.
def create_spliter_tab(ssml_input, tabs1, tabs2):
    """
    Build the "Spliter" tab for turning a long text into SSML.

    Args:
        ssml_input: textbox of the SSML editor that receives the result.
        tabs1: outer tab container (switched to the tab with id "ssml").
        tabs2: inner SSML tab container (switched to id "ssml.editor").
    """
    _, speaker_names = webui_utils.get_speaker_names()
    speaker_names = ["*random"] + speaker_names

    styles = ["*auto"] + [s.get("name") for s in get_styles()]

    with gr.Row():
        with gr.Column(scale=1):
            # Pick speaker, style and seed.
            with gr.Group():
                gr.Markdown("🗣️Speaker")
                spk_input_text = gr.Textbox(
                    label="Speaker (Text or Seed)",
                    value="female2",
                    show_label=False,
                )
                spk_input_dropdown = gr.Dropdown(
                    choices=speaker_names,
                    interactive=True,
                    value="female : female2",
                    show_label=False,
                )
                spk_rand_button = gr.Button(
                    value="🎲",
                    variant="secondary",
                )
            with gr.Group():
                gr.Markdown("🎭Style")
                style_input_dropdown = gr.Dropdown(
                    choices=styles,
                    interactive=True,
                    show_label=False,
                    value="*auto",
                )
            with gr.Group():
                gr.Markdown("🗣️Seed")
                infer_seed_input = gr.Number(
                    value=42,
                    label="Inference Seed",
                    show_label=False,
                    minimum=-1,
                    maximum=2**32 - 1,
                )
                infer_seed_rand_button = gr.Button(
                    value="🎲",
                    variant="secondary",
                )

            send_btn = gr.Button("📩Send to SSML", variant="primary")

        with gr.Column(scale=3):
            with gr.Group():
                gr.Markdown("📝Long Text Input")
                gr.Markdown("- 此页面用于处理超长文本")
                gr.Markdown("- 切割后,可以选择说话人、风格、seed,然后发送到SSML")
                long_text_input = gr.Textbox(
                    label="Long Text Input",
                    lines=10,
                    placeholder="输入长文本",
                    elem_id="long-text-input",
                    show_label=False,
                )
                long_text_split_button = gr.Button("🔪Split Text")

    with gr.Row():
        with gr.Column(scale=3):
            with gr.Group():
                gr.Markdown("🎨Output")
                long_text_output = gr.DataFrame(
                    headers=["index", "text", "length"],
                    datatype=["number", "str", "number"],
                    elem_id="long-text-output",
                    interactive=False,
                    wrap=True,
                    value=[],
                )

    def dropdown_to_speaker(choice):
        # "*..." entries (e.g. "*random") map to the "-1" placeholder;
        # otherwise keep the part after the last ":" of the display name.
        return "-1" if choice.startswith("*") else choice.split(":")[-1].strip()

    def random_seed(_current):
        # The current value is only passed to satisfy the event signature.
        return int(torch.randint(0, 2**32 - 1, (1,)).item())

    spk_input_dropdown.change(
        fn=dropdown_to_speaker,
        inputs=[spk_input_dropdown],
        outputs=[spk_input_text],
    )
    spk_rand_button.click(
        random_seed,
        inputs=[spk_input_text],
        outputs=[spk_input_text],
    )
    # Registered once: a second identical handler would only draw and
    # discard an extra random number on every click.
    infer_seed_rand_button.click(
        random_seed,
        inputs=[infer_seed_input],
        outputs=[infer_seed_input],
    )
    long_text_split_button.click(
        split_long_text,
        inputs=[long_text_input],
        outputs=[long_text_output],
    )

    send_btn.click(
        merge_dataframe_to_ssml,
        inputs=[
            long_text_output,
            spk_input_text,
            style_input_dropdown,
            infer_seed_input,
        ],
        outputs=[
            long_text_output,
            spk_input_text,
            style_input_dropdown,
            infer_seed_input,
            ssml_input,
        ],
    )

    def change_tab():
        return gr.Tabs(selected="ssml"), gr.Tabs(selected="ssml.editor")

    # Second handler on the same button: jump to the SSML editor.
    send_btn.click(change_tab, inputs=[], outputs=[tabs1, tabs2])
modules/webui/ssml/ssml_tab.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from modules.webui.webui_utils import (
3
+ synthesize_ssml,
4
+ )
5
+ from modules.webui import webui_config
6
+ from modules.webui.examples import ssml_examples, default_ssml
7
+
8
+
9
def create_ssml_interface():
    """
    Build the SSML editor tab and return its input textbox.

    The tab holds the SSML textbox with a synthesize button, a batch-size
    slider, enhance / de-noise checkboxes, a list of examples and the audio
    output. Clicking the button calls ``synthesize_ssml`` with the textbox,
    batch size and the two checkboxes.

    Returns:
        The SSML input textbox, so other tabs can write into it.
    """
    usage_notes = (
        f"- 最长{webui_config.ssml_max:,}字符,超过会被截断",
        "- 尽量保证使用相同的 seed",
        "- 关于SSML可以看这个 [文档](https://github.com/lenML/ChatTTS-Forge/blob/main/docs/SSML.md)",
    )

    with gr.Row():
        with gr.Column(scale=3):
            with gr.Group():
                gr.Markdown("📝SSML Input")
                for note in usage_notes:
                    gr.Markdown(note)
                ssml_input = gr.Textbox(
                    label="SSML Input",
                    lines=10,
                    value=default_ssml,
                    placeholder="输入 SSML 或选择示例",
                    elem_id="ssml_input",
                    show_label=False,
                )
                synthesize_btn = gr.Button("🔊Synthesize SSML", variant="primary")
        with gr.Column(scale=1):
            with gr.Group():
                # Parameters
                gr.Markdown("🎛️Parameters")
                batch_size_slider = gr.Slider(
                    label="Batch Size",
                    value=4,
                    minimum=1,
                    maximum=webui_config.max_batch_size,
                    step=1,
                )

            with gr.Group():
                gr.Markdown("💪🏼Enhance")
                enhance_checkbox = gr.Checkbox(value=True, label="Enable Enhance")
                denoise_checkbox = gr.Checkbox(value=False, label="Enable De-noise")

            with gr.Group():
                gr.Markdown("🎄Examples")
                gr.Examples(
                    examples=ssml_examples,
                    inputs=[ssml_input],
                )

    audio_output = gr.Audio(label="Generated Audio", format="mp3")

    # Argument order follows synthesize_ssml's positional parameters.
    synthesize_inputs = [
        ssml_input,
        batch_size_slider,
        enhance_checkbox,
        denoise_checkbox,
    ]
    synthesize_btn.click(
        synthesize_ssml,
        inputs=synthesize_inputs,
        outputs=audio_output,
    )

    return ssml_input
modules/webui/tts_tab.py CHANGED
@@ -27,6 +27,7 @@ def create_tts_interface():
27
  speaker_names = ["*random"] + [
28
  get_speaker_show_name(speaker) for speaker in speakers
29
  ]
 
30
 
31
  styles = ["*auto"] + [s.get("name") for s in get_styles()]
32
 
@@ -121,18 +122,10 @@ def create_tts_interface():
121
  # tooltip="Random Seed",
122
  variant="secondary",
123
  )
 
124
  use_decoder_input = gr.Checkbox(
125
  value=True, label="Use Decoder", visible=False
126
  )
127
- with gr.Group():
128
- gr.Markdown("🔧Prompt engineering")
129
- prompt1_input = gr.Textbox(label="Prompt 1")
130
- prompt2_input = gr.Textbox(label="Prompt 2")
131
- prefix_input = gr.Textbox(label="Prefix")
132
-
133
- prompt_audio = gr.File(
134
- label="prompt_audio", visible=webui_config.experimental
135
- )
136
 
137
  infer_seed_rand_button.click(
138
  lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
@@ -214,6 +207,16 @@ def create_tts_interface():
214
  )
215
  refine_button = gr.Button("✍️Refine Text")
216
 
 
 
 
 
 
 
 
 
 
 
217
  with gr.Group():
218
  gr.Markdown("🔊Generate")
219
  disable_normalize_input = gr.Checkbox(
 
27
  speaker_names = ["*random"] + [
28
  get_speaker_show_name(speaker) for speaker in speakers
29
  ]
30
+ speaker_names.sort(key=lambda x: x.startswith("*") and "-1" or x)
31
 
32
  styles = ["*auto"] + [s.get("name") for s in get_styles()]
33
 
 
122
  # tooltip="Random Seed",
123
  variant="secondary",
124
  )
125
+ # 感觉这个没必要设置...
126
  use_decoder_input = gr.Checkbox(
127
  value=True, label="Use Decoder", visible=False
128
  )
 
 
 
 
 
 
 
 
 
129
 
130
  infer_seed_rand_button.click(
131
  lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
 
207
  )
208
  refine_button = gr.Button("✍️Refine Text")
209
 
210
+ with gr.Group():
211
+ gr.Markdown("🔧Prompt engineering")
212
+ prompt1_input = gr.Textbox(label="Prompt 1")
213
+ prompt2_input = gr.Textbox(label="Prompt 2")
214
+ prefix_input = gr.Textbox(label="Prefix")
215
+
216
+ prompt_audio = gr.File(
217
+ label="prompt_audio", visible=webui_config.experimental
218
+ )
219
+
220
  with gr.Group():
221
  gr.Markdown("🔊Generate")
222
  disable_normalize_input = gr.Checkbox(
modules/webui/webui_utils.py CHANGED
@@ -32,6 +32,20 @@ def get_speakers():
32
  return speaker_mgr.list_speakers()
33
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def get_styles():
36
  return styles_mgr.list_items()
37
 
@@ -93,7 +107,12 @@ def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance):
93
 
94
  @torch.inference_mode()
95
  @spaces.GPU
96
- def synthesize_ssml(ssml: str, batch_size=4):
 
 
 
 
 
97
  try:
98
  batch_size = int(batch_size)
99
  except Exception:
@@ -116,7 +135,16 @@ def synthesize_ssml(ssml: str, batch_size=4):
116
  audio_segments = synthesize.synthesize_segments(segments)
117
  combined_audio = combine_audio_segments(audio_segments)
118
 
119
- sr, audio_data = audio.pydub_to_np(combined_audio)
 
 
 
 
 
 
 
 
 
120
 
121
  return sr, audio_data
122
 
@@ -193,6 +221,7 @@ def tts_generate(
193
  audio_data, sample_rate = apply_audio_enhance(
194
  audio_data, sample_rate, enable_denoise, enable_enhance
195
  )
 
196
  audio_data = audio.audio_to_int16(audio_data)
197
  return sample_rate, audio_data
198
 
 
32
  return speaker_mgr.list_speakers()
33
 
34
 
35
def get_speaker_names() -> tuple[list[Speaker], list[str]]:
    """
    Return the registered speakers together with their sorted display names.

    A display name is "<gender> : <name>", or just "<name>" when the
    speaker's gender is "*" or empty. Names are sorted alphabetically, with
    any name starting with "*" ordered as if it were "-1".

    Returns:
        ``(speakers, speaker_names)``: the speakers as returned by
        ``get_speakers()`` and the sorted list of display names. The two
        lists are not index-aligned, because only the names are sorted.
    """
    speakers = get_speakers()

    labels = []
    for spk in speakers:
        if spk.gender in ("*", ""):
            labels.append(spk.name)
        else:
            labels.append(f"{spk.gender} : {spk.name}")

    def sort_key(label):
        return "-1" if label.startswith("*") else label

    return speakers, sorted(labels, key=sort_key)
47
+
48
+
49
  def get_styles():
50
  return styles_mgr.list_items()
51
 
 
107
 
108
  @torch.inference_mode()
109
  @spaces.GPU
110
+ def synthesize_ssml(
111
+ ssml: str,
112
+ batch_size=4,
113
+ enable_enhance=False,
114
+ enable_denoise=False,
115
+ ):
116
  try:
117
  batch_size = int(batch_size)
118
  except Exception:
 
135
  audio_segments = synthesize.synthesize_segments(segments)
136
  combined_audio = combine_audio_segments(audio_segments)
137
 
138
+ sr = combined_audio.frame_rate
139
+ audio_data, sr = apply_audio_enhance(
140
+ audio.audiosegment_to_librosawav(combined_audio),
141
+ sr,
142
+ enable_denoise,
143
+ enable_enhance,
144
+ )
145
+
146
+ # NOTE: 这里必须要加,不然 gradio 没法解析成 mp3 格式
147
+ audio_data = audio.audio_to_int16(audio_data)
148
 
149
  return sr, audio_data
150
 
 
221
  audio_data, sample_rate = apply_audio_enhance(
222
  audio_data, sample_rate, enable_denoise, enable_enhance
223
  )
224
+ # NOTE: 这里必须要加,不然 gradio 没法解析成 mp3 格式
225
  audio_data = audio.audio_to_int16(audio_data)
226
  return sample_rate, audio_data
227