Kit-Lemonfoot committed on
Commit c8a5451
Parent: 2a50cb7

NLTK patching odyssey part 1

Files changed (1)
app.py +388 -384
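
The stat line reads +388/-384 because the diff renders as a whole-file rewrite, but the old and new bodies are otherwise identical; the functional change is four lines added to the import block, shown here as a minimal hunk:

  from text.japanese import g2kata_tone, kata_tone2phone_tone, text_normalize
+
+ # Here we go again with NLTK's bullshit.
+ import nltk
+ nltk.download('averaged_perceptron_tagger_eng')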
app.py CHANGED
@@ -1,384 +1,388 @@
- print("Starting up. Please be patient...")
-
- import argparse
- import datetime
- import os
- import sys
- from typing import Optional
- import json
- import utils
-
- import gradio as gr
- import torch
- import yaml
-
- from common.constants import (
-     DEFAULT_ASSIST_TEXT_WEIGHT,
-     DEFAULT_LENGTH,
-     DEFAULT_LINE_SPLIT,
-     DEFAULT_NOISE,
-     DEFAULT_NOISEW,
-     DEFAULT_SDP_RATIO,
-     DEFAULT_SPLIT_INTERVAL,
-     DEFAULT_STYLE,
-     DEFAULT_STYLE_WEIGHT,
-     Languages,
- )
- from common.log import logger
- from common.tts_model import ModelHolder
- from infer import InvalidToneError
- from text.japanese import g2kata_tone, kata_tone2phone_tone, text_normalize
-
- is_hf_spaces = os.getenv("SYSTEM") == "spaces"
- limit = 150
-
- # Get path settings
- with open(os.path.join("configs", "paths.yml"), "r", encoding="utf-8") as f:
-     path_config: dict[str, str] = yaml.safe_load(f.read())
-     # dataset_root = path_config["dataset_root"]
-     assets_root = path_config["assets_root"]
-
- def tts_fn(
-     model_name,
-     model_path,
-     text,
-     language,
-     reference_audio_path,
-     sdp_ratio,
-     noise_scale,
-     noise_scale_w,
-     length_scale,
-     line_split,
-     split_interval,
-     assist_text,
-     assist_text_weight,
-     use_assist_text,
-     style,
-     style_weight,
-     kata_tone_json_str,
-     use_tone,
-     speaker,
- ):
-     if len(text)<2:
-         return "Please enter some text.", None, kata_tone_json_str
-
-     if is_hf_spaces and len(text) > limit:
-         return f"Too long! There is a character limit of {limit} characters.", None, kata_tone_json_str
-
-     if(not model_holder.current_model):
-         model_holder.load_model_gr(model_name, model_path)
-         logger.info(f"Loaded model '{model_name}'")
-     if(model_holder.current_model.model_path != model_path):
-         model_holder.load_model_gr(model_name, model_path)
-         logger.info(f"Swapped to model '{model_name}'")
-     speaker_id = model_holder.current_model.spk2id[speaker]
-     start_time = datetime.datetime.now()
-
-     wrong_tone_message = ""
-     kata_tone: Optional[list[tuple[str, int]]] = None
-     if use_tone and kata_tone_json_str != "":
-         if language != "JP":
-             #logger.warning("Only Japanese is supported for tone generation.")
-             wrong_tone_message = "アクセント指定は現在日本語のみ対応しています。"
-         if line_split:
-             #logger.warning("Tone generation is not supported for line split.")
-             wrong_tone_message = (
-                 "アクセント指定は改行で分けて生成を使わない場合のみ対応しています。"
-             )
-         try:
-             kata_tone = []
-             json_data = json.loads(kata_tone_json_str)
-             # Convert to tuples
-             for kana, tone in json_data:
-                 assert isinstance(kana, str) and tone in (0, 1), f"{kana}, {tone}"
-                 kata_tone.append((kana, tone))
-         except Exception as e:
-             logger.warning(f"Error occurred when parsing kana_tone_json: {e}")
-             wrong_tone_message = f"アクセント指定が不正です: {e}"
-             kata_tone = None
-
-     # tone is only not None when it is actually passed into synthesis
-     tone: Optional[list[int]] = None
-     if kata_tone is not None:
-         phone_tone = kata_tone2phone_tone(kata_tone)
-         tone = [t for _, t in phone_tone]
-
-     try:
-         sr, audio = model_holder.current_model.infer(
-             text=text,
-             language=language,
-             reference_audio_path=reference_audio_path,
-             sdp_ratio=sdp_ratio,
-             noise=noise_scale,
-             noisew=noise_scale_w,
-             length=length_scale,
-             line_split=line_split,
-             split_interval=split_interval,
-             assist_text=assist_text,
-             assist_text_weight=assist_text_weight,
-             use_assist_text=use_assist_text,
-             style=style,
-             style_weight=style_weight,
-             given_tone=tone,
-             sid=speaker_id,
-         )
-     except InvalidToneError as e:
-         logger.error(f"Tone error: {e}")
-         return f"Error: アクセント指定が不正です:\n{e}", None, kata_tone_json_str
-     except ValueError as e:
-         logger.error(f"Value error: {e}")
-         return f"Error: {e}", None, kata_tone_json_str
-
-     end_time = datetime.datetime.now()
-     duration = (end_time - start_time).total_seconds()
-
-     if tone is None and language == "JP":
-         # Return accent info so it can be used for accent specification
-         norm_text = text_normalize(text)
-         kata_tone = g2kata_tone(norm_text)
-         kata_tone_json_str = json.dumps(kata_tone, ensure_ascii=False)
-     elif tone is None:
-         kata_tone_json_str = ""
-
-     if reference_audio_path:
-         style="External Audio"
-     logger.info(f"Successful inference, took {duration}s | {speaker} | {language}/{sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale}/{style}/{style_weight} | {text}")
-     message = f"Success, time: {duration} seconds."
-     if wrong_tone_message != "":
-         message = wrong_tone_message + "\n" + message
-     return message, (sr, audio), kata_tone_json_str
-
- def load_voicedata():
-     print("Loading voice data...")
-     envoices = []
-     jpvoices = []
-     styledict = {}
-     with open("voicelist.json", "r", encoding="utf-8") as f:
-         voc_info = json.load(f)
-     for name, info in voc_info.items():
-         if not info['enable']:
-             continue
-         model_path = info['model_path']
-         model_path_full = f"{model_dir}/{model_path}/{model_path}.safetensors"
-         if not os.path.exists(model_path_full):
-             model_path_full = f"{model_dir}\\{model_path}\\{model_path}.safetensors"
-         voice_name = info['title']
-         speakerid = info['speakerid']
-         datasetauthor = info['datasetauthor']
-         image = info['cover']
-         if not os.path.exists(f"images/{image}"):
-             image="none.png"
-         # for voices that are either known buggy or abnormal
-         nospace=False
-         if 'disableonspace' in info:
-             nospace=info['disableonspace']
-         if not model_path in styledict.keys():
-             conf=f"{model_dir}/{model_path}/config.json"
-             hps = utils.get_hparams_from_file(conf)
-             s2id = hps.data.style2id
-             styledict[model_path] = s2id.keys()
-             print(f"Set up hyperparameters for model {model_path}")
-         if(info['primarylang']=="JP"):
-             jpvoices.append((name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image, nospace))
-         else:
-             envoices.append((name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image, nospace))
-     return [envoices, jpvoices], styledict
-
-
- initial_text = "Hello there! This is test audio of a new Hololive text to speech tool."
-
- initial_md = """
- # Hololive [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2)
- ### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
- ### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
-
- Do no evil.
- """
-
- style_md = """
- - You can control things like voice tone, emotion, and reading style through presets or through voice files.
- - Neutral acts as an average across all speakers. Styling options act as an override to Neutral.
- - Setting the intensity too high will likely break the output.
- - The required intensity will depend based on the speaker and the desired style.
- - If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker.
- """
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
-     parser.add_argument(
-         "--dir", "-d", type=str, help="Model directory", default=assets_root
-     )
-     parser.add_argument(
-         "--share", action="store_true", help="Share this app publicly", default=False
-     )
-     parser.add_argument(
-         "--server-name",
-         type=str,
-         default=None,
-         help="Server name for Gradio app",
-     )
-     parser.add_argument(
-         "--no-autolaunch",
-         action="store_true",
-         default=False,
-         help="Do not launch app automatically",
-     )
-     args = parser.parse_args()
-     model_dir = args.dir
-     print(model_dir)
-
-     if args.cpu:
-         device = "cpu"
-     else:
-         device = "cuda" if torch.cuda.is_available() else "cpu"
-
-     model_holder = ModelHolder(model_dir, device)
-
-     languages = ["EN", "JP", "ZH"]
-     langnames = ["English", "Japanese"]
-
-     model_names = model_holder.model_names
-     if len(model_names) == 0:
-         logger.error(f"No models found. Please place the model in {model_dir}.")
-         sys.exit(1)
-     initial_id = 0
-     initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
-     #print(initial_pth_files)
-
-     voicedata, styledict = load_voicedata()
-
-     #Gradio preload
-     text_input = gr.TextArea(label="Text", value=initial_text)
-     line_split = gr.Checkbox(label="Divide text seperately by line breaks", value=True)
-     split_interval = gr.Slider(
-         minimum=0.0,
-         maximum=2,
-         value=0.5,
-         step=0.1,
-         label="Length of division seperation time (in seconds)",
-     )
-     language = gr.Dropdown(choices=languages, value="EN", label="Language")
-     sdp_ratio = gr.Slider(
-         minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
-     )
-     noise_scale = gr.Slider(
-         minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
-     )
-     noise_scale_w = gr.Slider(
-         minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
-     )
-     length_scale = gr.Slider(
-         minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
-     )
-     use_style_text = gr.Checkbox(label="Use stylization text", value=False)
-     style_text = gr.Textbox(
-         label="Style text",
-         placeholder="Check the \"Use stylization text\" box to use this option!",
-         info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.",
-         visible=True,
-     )
-     style_text_weight = gr.Slider(
-         minimum=0,
-         maximum=1,
-         value=0.7,
-         step=0.1,
-         label="Text stylization strength",
-         visible=True,
-     )
-
-     with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="Hololive Style-Bert-VITS2") as app:
-         gr.Markdown(initial_md)
-
-         #NOT USED SINCE NONE OF MY MODELS ARE JPEXTRA.
-         #ONLY HERE FOR COMPATIBILITY WITH THE EXISTING INFER CODE.
-         #DO NOT RENDER OR MAKE VISIBLE
-         tone = gr.Textbox(
-             label="Accent adjustment (0 for low, 1 for high)",
-             info="This can only be used when not seperated by line breaks. It is not universal.",
-             visible=False
-         )
-         use_tone = gr.Checkbox(label="Use accent adjustment", value=False, visible=False)
-
-         #for (name, model_path, voice_name, speakerid, datasetauthor, image) in voicedata:
-         for vi in range(len(voicedata)):
-             with gr.TabItem(langnames[vi]):
-                 for (name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image, nospace) in voicedata[vi]:
-                     if(nospace and is_hf_spaces):
-                         continue
-                     with gr.TabItem(name):
-                         mn = gr.Textbox(value=model_path, visible=False, interactive=False)
-                         mp = gr.Textbox(value=model_path_full, visible=False, interactive=False)
-                         spk = gr.Textbox(value=speakerid, visible=False, interactive=False)
-                         with gr.Row():
-                             with gr.Column():
-                                 gr.Markdown(f"**{voice_name}**\n\nModel name: {model_path} | Dataset author: {datasetauthor}")
-                                 gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False, show_share_button=False)
-                             with gr.Column():
-                                 with gr.TabItem("Style using a preset"):
-                                     style = gr.Dropdown(
-                                         label="Current style (Neutral is an average style)",
-                                         choices=styledict[model_path],
-                                         value="Neutral",
-                                     )
-                                 with gr.TabItem("Style using existing audio"):
-                                     ref_audio_path = gr.Audio(label="Reference Audio", type="filepath", sources=["upload"])
-                                 style_weight = gr.Slider(
-                                     minimum=0,
-                                     maximum=20,
-                                     value=3,
-                                     step=0.1,
-                                     label="Style strength",
-                                 )
-                             with gr.Column():
-                                 tts_button = gr.Button(
-                                     "Synthesize", variant="primary", interactive=True
-                                 )
-                                 text_output = gr.Textbox(label="Info")
-                                 audio_output = gr.Audio(label="Result")
-
-                         tts_button.click(
-                             tts_fn,
-                             inputs=[
-                                 mn,
-                                 mp,
-                                 text_input,
-                                 language,
-                                 ref_audio_path,
-                                 sdp_ratio,
-                                 noise_scale,
-                                 noise_scale_w,
-                                 length_scale,
-                                 line_split,
-                                 split_interval,
-                                 style_text,
-                                 style_text_weight,
-                                 use_style_text,
-                                 style,
-                                 style_weight,
-                                 tone,
-                                 use_tone,
-                                 spk,
-                             ],
-                             outputs=[text_output, audio_output, tone],
-                         )
-
-         with gr.Row():
-             with gr.Column():
-                 text_input.render()
-                 line_split.render()
-                 split_interval.render()
-                 language.render()
-             with gr.Column():
-                 sdp_ratio.render()
-                 noise_scale.render()
-                 noise_scale_w.render()
-                 length_scale.render()
-                 use_style_text.render()
-                 style_text.render()
-                 style_text_weight.render()
-
-         with gr.Accordion("Styling Guide", open=False):
-             gr.Markdown(style_md)
-
-     app.launch(allowed_paths=['/file/images/'])
 
 
 
 
 
+ print("Starting up. Please be patient...")
+
+ import argparse
+ import datetime
+ import os
+ import sys
+ from typing import Optional
+ import json
+ import utils
+
+ import gradio as gr
+ import torch
+ import yaml
+
+ from common.constants import (
+     DEFAULT_ASSIST_TEXT_WEIGHT,
+     DEFAULT_LENGTH,
+     DEFAULT_LINE_SPLIT,
+     DEFAULT_NOISE,
+     DEFAULT_NOISEW,
+     DEFAULT_SDP_RATIO,
+     DEFAULT_SPLIT_INTERVAL,
+     DEFAULT_STYLE,
+     DEFAULT_STYLE_WEIGHT,
+     Languages,
+ )
+ from common.log import logger
+ from common.tts_model import ModelHolder
+ from infer import InvalidToneError
+ from text.japanese import g2kata_tone, kata_tone2phone_tone, text_normalize
+
+ # Here we go again with NLTK's bullshit.
+ import nltk
+ nltk.download('averaged_perceptron_tagger_eng')
+
+ is_hf_spaces = os.getenv("SYSTEM") == "spaces"
+ limit = 150
+
+ # Get path settings
+ with open(os.path.join("configs", "paths.yml"), "r", encoding="utf-8") as f:
+     path_config: dict[str, str] = yaml.safe_load(f.read())
+     # dataset_root = path_config["dataset_root"]
+     assets_root = path_config["assets_root"]
+
+ def tts_fn(
+     model_name,
+     model_path,
+     text,
+     language,
+     reference_audio_path,
+     sdp_ratio,
+     noise_scale,
+     noise_scale_w,
+     length_scale,
+     line_split,
+     split_interval,
+     assist_text,
+     assist_text_weight,
+     use_assist_text,
+     style,
+     style_weight,
+     kata_tone_json_str,
+     use_tone,
+     speaker,
+ ):
+     if len(text)<2:
+         return "Please enter some text.", None, kata_tone_json_str
+
+     if is_hf_spaces and len(text) > limit:
+         return f"Too long! There is a character limit of {limit} characters.", None, kata_tone_json_str
+
+     if(not model_holder.current_model):
+         model_holder.load_model_gr(model_name, model_path)
+         logger.info(f"Loaded model '{model_name}'")
+     if(model_holder.current_model.model_path != model_path):
+         model_holder.load_model_gr(model_name, model_path)
+         logger.info(f"Swapped to model '{model_name}'")
+     speaker_id = model_holder.current_model.spk2id[speaker]
+     start_time = datetime.datetime.now()
+
+     wrong_tone_message = ""
+     kata_tone: Optional[list[tuple[str, int]]] = None
+     if use_tone and kata_tone_json_str != "":
+         if language != "JP":
+             #logger.warning("Only Japanese is supported for tone generation.")
+             wrong_tone_message = "アクセント指定は現在日本語のみ対応しています。"
+         if line_split:
+             #logger.warning("Tone generation is not supported for line split.")
+             wrong_tone_message = (
+                 "アクセント指定は改行で分けて生成を使わない場合のみ対応しています。"
+             )
+         try:
+             kata_tone = []
+             json_data = json.loads(kata_tone_json_str)
+             # Convert to tuples
+             for kana, tone in json_data:
+                 assert isinstance(kana, str) and tone in (0, 1), f"{kana}, {tone}"
+                 kata_tone.append((kana, tone))
+         except Exception as e:
+             logger.warning(f"Error occurred when parsing kana_tone_json: {e}")
+             wrong_tone_message = f"アクセント指定が不正です: {e}"
+             kata_tone = None
+
+     # tone is only not None when it is actually passed into synthesis
+     tone: Optional[list[int]] = None
+     if kata_tone is not None:
+         phone_tone = kata_tone2phone_tone(kata_tone)
+         tone = [t for _, t in phone_tone]
+
+     try:
+         sr, audio = model_holder.current_model.infer(
+             text=text,
+             language=language,
+             reference_audio_path=reference_audio_path,
+             sdp_ratio=sdp_ratio,
+             noise=noise_scale,
+             noisew=noise_scale_w,
+             length=length_scale,
+             line_split=line_split,
+             split_interval=split_interval,
+             assist_text=assist_text,
+             assist_text_weight=assist_text_weight,
+             use_assist_text=use_assist_text,
+             style=style,
+             style_weight=style_weight,
+             given_tone=tone,
+             sid=speaker_id,
+         )
+     except InvalidToneError as e:
+         logger.error(f"Tone error: {e}")
+         return f"Error: アクセント指定が不正です:\n{e}", None, kata_tone_json_str
+     except ValueError as e:
+         logger.error(f"Value error: {e}")
+         return f"Error: {e}", None, kata_tone_json_str
+
+     end_time = datetime.datetime.now()
+     duration = (end_time - start_time).total_seconds()
+
+     if tone is None and language == "JP":
+         # Return accent info so it can be used for accent specification
+         norm_text = text_normalize(text)
+         kata_tone = g2kata_tone(norm_text)
+         kata_tone_json_str = json.dumps(kata_tone, ensure_ascii=False)
+     elif tone is None:
+         kata_tone_json_str = ""
+
+     if reference_audio_path:
+         style="External Audio"
+     logger.info(f"Successful inference, took {duration}s | {speaker} | {language}/{sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale}/{style}/{style_weight} | {text}")
+     message = f"Success, time: {duration} seconds."
+     if wrong_tone_message != "":
+         message = wrong_tone_message + "\n" + message
+     return message, (sr, audio), kata_tone_json_str
+
+ def load_voicedata():
+     print("Loading voice data...")
+     envoices = []
+     jpvoices = []
+     styledict = {}
+     with open("voicelist.json", "r", encoding="utf-8") as f:
+         voc_info = json.load(f)
+     for name, info in voc_info.items():
+         if not info['enable']:
+             continue
+         model_path = info['model_path']
+         model_path_full = f"{model_dir}/{model_path}/{model_path}.safetensors"
+         if not os.path.exists(model_path_full):
+             model_path_full = f"{model_dir}\\{model_path}\\{model_path}.safetensors"
+         voice_name = info['title']
+         speakerid = info['speakerid']
+         datasetauthor = info['datasetauthor']
+         image = info['cover']
+         if not os.path.exists(f"images/{image}"):
+             image="none.png"
+         # for voices that are either known buggy or abnormal
+         nospace=False
+         if 'disableonspace' in info:
+             nospace=info['disableonspace']
+         if not model_path in styledict.keys():
+             conf=f"{model_dir}/{model_path}/config.json"
+             hps = utils.get_hparams_from_file(conf)
+             s2id = hps.data.style2id
+             styledict[model_path] = s2id.keys()
+             print(f"Set up hyperparameters for model {model_path}")
+         if(info['primarylang']=="JP"):
+             jpvoices.append((name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image, nospace))
+         else:
+             envoices.append((name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image, nospace))
+     return [envoices, jpvoices], styledict
+
+
+ initial_text = "Hello there! This is test audio of a new Hololive text to speech tool."
+
+ initial_md = """
+ # Hololive [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2)
+ ### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
+ ### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
+
+ Do no evil.
+ """
+
+ style_md = """
+ - You can control things like voice tone, emotion, and reading style through presets or through voice files.
+ - Neutral acts as an average across all speakers. Styling options act as an override to Neutral.
+ - Setting the intensity too high will likely break the output.
+ - The required intensity will depend based on the speaker and the desired style.
+ - If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker.
+ """
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
+     parser.add_argument(
+         "--dir", "-d", type=str, help="Model directory", default=assets_root
+     )
+     parser.add_argument(
+         "--share", action="store_true", help="Share this app publicly", default=False
+     )
+     parser.add_argument(
+         "--server-name",
+         type=str,
+         default=None,
+         help="Server name for Gradio app",
+     )
+     parser.add_argument(
+         "--no-autolaunch",
+         action="store_true",
+         default=False,
+         help="Do not launch app automatically",
+     )
+     args = parser.parse_args()
+     model_dir = args.dir
+     print(model_dir)
+
+     if args.cpu:
+         device = "cpu"
+     else:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     model_holder = ModelHolder(model_dir, device)
+
+     languages = ["EN", "JP", "ZH"]
+     langnames = ["English", "Japanese"]
+
+     model_names = model_holder.model_names
+     if len(model_names) == 0:
+         logger.error(f"No models found. Please place the model in {model_dir}.")
+         sys.exit(1)
+     initial_id = 0
+     initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
+     #print(initial_pth_files)
+
+     voicedata, styledict = load_voicedata()
+
+     #Gradio preload
+     text_input = gr.TextArea(label="Text", value=initial_text)
+     line_split = gr.Checkbox(label="Divide text seperately by line breaks", value=True)
+     split_interval = gr.Slider(
+         minimum=0.0,
+         maximum=2,
+         value=0.5,
+         step=0.1,
+         label="Length of division seperation time (in seconds)",
+     )
+     language = gr.Dropdown(choices=languages, value="EN", label="Language")
+     sdp_ratio = gr.Slider(
+         minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
+     )
+     noise_scale = gr.Slider(
+         minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
+     )
+     noise_scale_w = gr.Slider(
+         minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
+     )
+     length_scale = gr.Slider(
+         minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
+     )
+     use_style_text = gr.Checkbox(label="Use stylization text", value=False)
+     style_text = gr.Textbox(
+         label="Style text",
+         placeholder="Check the \"Use stylization text\" box to use this option!",
+         info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.",
+         visible=True,
+     )
+     style_text_weight = gr.Slider(
+         minimum=0,
+         maximum=1,
+         value=0.7,
+         step=0.1,
+         label="Text stylization strength",
+         visible=True,
+     )
+
+     with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="Hololive Style-Bert-VITS2") as app:
+         gr.Markdown(initial_md)
+
+         #NOT USED SINCE NONE OF MY MODELS ARE JPEXTRA.
+         #ONLY HERE FOR COMPATIBILITY WITH THE EXISTING INFER CODE.
+         #DO NOT RENDER OR MAKE VISIBLE
+         tone = gr.Textbox(
+             label="Accent adjustment (0 for low, 1 for high)",
+             info="This can only be used when not seperated by line breaks. It is not universal.",
+             visible=False
+         )
+         use_tone = gr.Checkbox(label="Use accent adjustment", value=False, visible=False)
+
+         #for (name, model_path, voice_name, speakerid, datasetauthor, image) in voicedata:
+         for vi in range(len(voicedata)):
+             with gr.TabItem(langnames[vi]):
+                 for (name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image, nospace) in voicedata[vi]:
+                     if(nospace and is_hf_spaces):
+                         continue
+                     with gr.TabItem(name):
+                         mn = gr.Textbox(value=model_path, visible=False, interactive=False)
+                         mp = gr.Textbox(value=model_path_full, visible=False, interactive=False)
+                         spk = gr.Textbox(value=speakerid, visible=False, interactive=False)
+                         with gr.Row():
+                             with gr.Column():
+                                 gr.Markdown(f"**{voice_name}**\n\nModel name: {model_path} | Dataset author: {datasetauthor}")
+                                 gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False, show_share_button=False)
+                             with gr.Column():
+                                 with gr.TabItem("Style using a preset"):
+                                     style = gr.Dropdown(
+                                         label="Current style (Neutral is an average style)",
+                                         choices=styledict[model_path],
+                                         value="Neutral",
+                                     )
+                                 with gr.TabItem("Style using existing audio"):
+                                     ref_audio_path = gr.Audio(label="Reference Audio", type="filepath", sources=["upload"])
+                                 style_weight = gr.Slider(
+                                     minimum=0,
+                                     maximum=20,
+                                     value=3,
+                                     step=0.1,
+                                     label="Style strength",
+                                 )
+                             with gr.Column():
+                                 tts_button = gr.Button(
+                                     "Synthesize", variant="primary", interactive=True
+                                 )
+                                 text_output = gr.Textbox(label="Info")
+                                 audio_output = gr.Audio(label="Result")
+
+                         tts_button.click(
+                             tts_fn,
+                             inputs=[
+                                 mn,
+                                 mp,
+                                 text_input,
+                                 language,
+                                 ref_audio_path,
+                                 sdp_ratio,
+                                 noise_scale,
+                                 noise_scale_w,
+                                 length_scale,
+                                 line_split,
+                                 split_interval,
+                                 style_text,
+                                 style_text_weight,
+                                 use_style_text,
+                                 style,
+                                 style_weight,
+                                 tone,
+                                 use_tone,
+                                 spk,
+                             ],
+                             outputs=[text_output, audio_output, tone],
+                         )
+
+         with gr.Row():
+             with gr.Column():
+                 text_input.render()
+                 line_split.render()
+                 split_interval.render()
+                 language.render()
+             with gr.Column():
+                 sdp_ratio.render()
+                 noise_scale.render()
+                 noise_scale_w.render()
+                 length_scale.render()
+                 use_style_text.render()
+                 style_text.render()
+                 style_text_weight.render()
+
+         with gr.Accordion("Styling Guide", open=False):
+             gr.Markdown(style_md)
+
+     app.launch(allowed_paths=['/file/images/'])
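
As committed, the patch calls nltk.download unconditionally at every startup, so each boot of the Space re-checks (and on a cold cache re-fetches) the tagger data over the network. A minimal hardening sketch, not part of this commit: the helper ensure_nltk_resource below is an illustration of the standard guard pattern, and it assumes the resource lives under NLTK's usual taggers/ data path.

# Hypothetical sketch, not part of the commit: fetch only on a cold cache.
import nltk

def ensure_nltk_resource(resource: str, category: str = "taggers") -> None:
    # nltk.data.find raises LookupError when the resource is absent from
    # the search path, so the network download runs only when needed.
    try:
        nltk.data.find(f"{category}/{resource}")
    except LookupError:
        nltk.download(resource)

ensure_nltk_resource("averaged_perceptron_tagger_eng")

On a typical Hugging Face Space the downloaded data lives only as long as the container, so the guard mainly avoids redundant network round-trips across warm restarts; vendoring the tagger data into the repository would remove the startup network dependency entirely.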