Kit-Lemonfoot
committed on
Commit
•
f2a3f8d
1
Parent(s):
587b3fe
Upload 2 files
Browse files
- app.py +10 -26
- voicelist.json +14 -0
app.py
CHANGED
@@ -230,8 +230,8 @@ def tts_fn(
|
|
230 |
emotion_weight,
|
231 |
speaker,
|
232 |
):
|
233 |
-
if
|
234 |
-
return "Please enter some text.",
|
235 |
#logger.info(f"Start TTS with {language}:\n{text}")
|
236 |
#logger.info(f"Model: {model_holder.current_model.model_path}")
|
237 |
#logger.info(f"SDP: {sdp_ratio}, Noise: {noise_scale}, Noise_W: {noise_scale_w}, Length: {length_scale}")
|
@@ -239,7 +239,7 @@ def tts_fn(
|
|
239 |
#logger.info(f"Style: {emotion}, Style weight: {emotion_weight}")
|
240 |
|
241 |
if is_hf_spaces and len(text) > limit:
|
242 |
-
return f"Too long! There is a character limit of {limit} characters.",
|
243 |
|
244 |
if(not model_holder.current_model):
|
245 |
model_holder.load_model(model_name, model_path)
|
@@ -275,7 +275,7 @@ def tts_fn(
|
|
275 |
return f"Success, time: {duration} seconds.", (sr, audio)
|
276 |
|
277 |
def load_voicedata():
|
278 |
-
|
279 |
voices = []
|
280 |
styledict = {}
|
281 |
with open("voicelist.json", "r", encoding="utf-8") as f:
|
@@ -292,21 +292,21 @@ def load_voicedata():
|
|
292 |
hps = utils.get_hparams_from_file(conf)
|
293 |
s2id = hps.data.style2id
|
294 |
styledict[model_path] = s2id.keys()
|
|
|
295 |
voices.append((name, model_path, voice_name, speakerid, image))
|
296 |
return voices, styledict
|
297 |
|
298 |
|
299 |
-
initial_text = "Hello there! This is test audio of
|
300 |
|
301 |
initial_md = """
|
302 |
-
#
|
303 |
### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
|
304 |
### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
|
305 |
-
This HuggingFace space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.
|
306 |
|
307 |
Do no evil.
|
308 |
|
309 |
-
**Note:** Most of
|
310 |
"""
|
311 |
|
312 |
style_md = """
|
@@ -317,22 +317,6 @@ style_md = """
|
|
317 |
- If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker.
|
318 |
"""
|
319 |
|
320 |
-
|
321 |
-
def make_interactive():
|
322 |
-
return gr.update(interactive=True, value="Synthesize")
|
323 |
-
|
324 |
-
|
325 |
-
def make_non_interactive():
|
326 |
-
return gr.update(interactive=False, value="Synthesize (Please load a model!)")
|
327 |
-
|
328 |
-
|
329 |
-
def gr_util(item):
|
330 |
-
if item == "Select from presets":
|
331 |
-
return (gr.update(visible=True), gr.Audio(visible=False, value=None))
|
332 |
-
else:
|
333 |
-
return (gr.update(visible=False), gr.update(visible=True))
|
334 |
-
|
335 |
-
|
336 |
if __name__ == "__main__":
|
337 |
parser = argparse.ArgumentParser()
|
338 |
parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
|
@@ -357,7 +341,7 @@ if __name__ == "__main__":
|
|
357 |
sys.exit(1)
|
358 |
initial_id = 0
|
359 |
initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
|
360 |
-
print(initial_pth_files)
|
361 |
|
362 |
voicedata, styledict = load_voicedata()
|
363 |
|
@@ -401,7 +385,7 @@ if __name__ == "__main__":
|
|
401 |
)
|
402 |
|
403 |
|
404 |
-
with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="
|
405 |
gr.Markdown(initial_md)
|
406 |
|
407 |
for (name, model_path, voice_name, speakerid, image) in voicedata:
|
|
|
230 |
emotion_weight,
|
231 |
speaker,
|
232 |
):
|
233 |
+
if len(text)<2:
|
234 |
+
return "Please enter some text.", None
|
235 |
#logger.info(f"Start TTS with {language}:\n{text}")
|
236 |
#logger.info(f"Model: {model_holder.current_model.model_path}")
|
237 |
#logger.info(f"SDP: {sdp_ratio}, Noise: {noise_scale}, Noise_W: {noise_scale_w}, Length: {length_scale}")
|
|
|
239 |
#logger.info(f"Style: {emotion}, Style weight: {emotion_weight}")
|
240 |
|
241 |
if is_hf_spaces and len(text) > limit:
|
242 |
+
return f"Too long! There is a character limit of {limit} characters.", None
|
243 |
|
244 |
if(not model_holder.current_model):
|
245 |
model_holder.load_model(model_name, model_path)
|
|
|
275 |
return f"Success, time: {duration} seconds.", (sr, audio)
|
276 |
|
277 |
def load_voicedata():
|
278 |
+
print("Loading voice data...")
|
279 |
voices = []
|
280 |
styledict = {}
|
281 |
with open("voicelist.json", "r", encoding="utf-8") as f:
|
|
|
292 |
hps = utils.get_hparams_from_file(conf)
|
293 |
s2id = hps.data.style2id
|
294 |
styledict[model_path] = s2id.keys()
|
295 |
+
print(f"Indexed voice {voice_name}")
|
296 |
voices.append((name, model_path, voice_name, speakerid, image))
|
297 |
return voices, styledict
|
298 |
|
299 |
|
300 |
+
initial_text = "Hello there! This is test audio of Hololive Style Bert Vits 2."
|
301 |
|
302 |
initial_md = """
|
303 |
+
# Hololive [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2)
|
304 |
### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
|
305 |
### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
|
|
|
306 |
|
307 |
Do no evil.
|
308 |
|
309 |
+
**Note:** Most of the models are a *work in progress.* They may not sound fully correct.
|
310 |
"""
|
311 |
|
312 |
style_md = """
|
|
|
317 |
- If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker.
|
318 |
"""
|
319 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
320 |
if __name__ == "__main__":
|
321 |
parser = argparse.ArgumentParser()
|
322 |
parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
|
|
|
341 |
sys.exit(1)
|
342 |
initial_id = 0
|
343 |
initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
|
344 |
+
#print(initial_pth_files)
|
345 |
|
346 |
voicedata, styledict = load_voicedata()
|
347 |
|
|
|
385 |
)
|
386 |
|
387 |
|
388 |
+
with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="Hololive Style-Bert-VITS2") as app:
|
389 |
gr.Markdown(initial_md)
|
390 |
|
391 |
for (name, model_path, voice_name, speakerid, image) in voicedata:
|
voicelist.json
CHANGED
@@ -97,11 +97,25 @@
|
|
97 |
"speakerid": "AiraniIofifteen",
|
98 |
"cover": "iofi.png"
|
99 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
"Anya": {
|
101 |
"enable": true,
|
102 |
"model_path": "SBV2_HoloESL",
|
103 |
"title": "Anya Melfissa",
|
104 |
"speakerid": "AnyaMelfissa",
|
105 |
"cover": "anya.png"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
}
|
107 |
}
|
|
|
97 |
"speakerid": "AiraniIofifteen",
|
98 |
"cover": "iofi.png"
|
99 |
},
|
100 |
+
"Ollie": {
|
101 |
+
"enable": true,
|
102 |
+
"model_path": "SBV2_HoloIDFlu",
|
103 |
+
"title": "Kureiji Ollie",
|
104 |
+
"speakerid": "KureijiOllie",
|
105 |
+
"cover": "ollie.png"
|
106 |
+
},
|
107 |
"Anya": {
|
108 |
"enable": true,
|
109 |
"model_path": "SBV2_HoloESL",
|
110 |
"title": "Anya Melfissa",
|
111 |
"speakerid": "AnyaMelfissa",
|
112 |
"cover": "anya.png"
|
113 |
+
},
|
114 |
+
"Zeta": {
|
115 |
+
"enable": true,
|
116 |
+
"model_path": "SBV2_HoloIDFlu",
|
117 |
+
"title": "Vestia Zeta",
|
118 |
+
"speakerid": "VestiaZeta",
|
119 |
+
"cover": "zeta.png"
|
120 |
}
|
121 |
}
|