Kit-Lemonfoot committed on
Commit
6de3522
·
verified ·
1 Parent(s): 2714e75

Upload 16 files

Browse files
app.py CHANGED
@@ -3,6 +3,7 @@ import datetime
3
  import os
4
  import sys
5
  import warnings
 
6
 
7
  import gradio as gr
8
  import numpy as np
@@ -238,7 +239,8 @@ def tts_fn(
238
  if is_hf_spaces and len(text) > limit:
239
  return f"Too long! There is a character limit of {limit} characters.", (44100, None)
240
 
241
- assert model_holder.current_model is not None
 
242
 
243
  if(model_holder.current_model.model_path != model_path):
244
  model_holder.load_model(model_name, model_path)
@@ -267,16 +269,37 @@ def tts_fn(
267
 
268
  end_time = datetime.datetime.now()
269
  duration = (end_time - start_time).total_seconds()
270
- logger.info(f"Successful inference, took {duration}s | {speaker} | {sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale} | {text}")
271
  return f"Success, time: {duration} seconds.", (sr, audio)
272
 
273
-
274
- initial_text = "Hi there! How are you doing?"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
276
  initial_md = """
277
  # LemonfootSBV2 😊🍋
278
- ### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot) / [Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
279
- ### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
280
  This HuggingFace space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.
281
 
282
  Do no evil.
@@ -331,144 +354,125 @@ if __name__ == "__main__":
331
  sys.exit(1)
332
  initial_id = 0
333
  initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
  with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="LemonfootSBV2") as app:
336
  gr.Markdown(initial_md)
337
- with gr.Row():
338
- with gr.Column():
 
 
 
 
339
  with gr.Row():
340
- with gr.Column(scale=3):
341
- model_name = gr.Dropdown(
342
- label="Available Models",
343
- choices=model_names,
344
- value=model_names[initial_id],
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  )
346
- model_path = gr.Dropdown(
347
- label="Model File",
348
- choices=initial_pth_files,
349
- value=initial_pth_files[0],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  )
351
- refresh_button = gr.Button("Refresh", scale=1, visible=not is_hf_spaces)
352
- load_button = gr.Button("Load", scale=1, variant="primary")
353
- text_input = gr.TextArea(label="Text", value=initial_text)
354
-
355
- line_split = gr.Checkbox(label="Divide text seperately by line breaks", value=True)
356
- split_interval = gr.Slider(
357
- minimum=0.0,
358
- maximum=2,
359
- value=0.5,
360
- step=0.1,
361
- label="Length of division seperation time (in seconds)",
362
- )
363
- language = gr.Dropdown(choices=languages, value="EN", label="Language")
364
- speaker = gr.Dropdown(label="Speaker")
365
- with gr.Accordion(label="Advanced Settings", open=False):
366
- sdp_ratio = gr.Slider(
367
- minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
368
- )
369
- noise_scale = gr.Slider(
370
- minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
371
- )
372
- noise_scale_w = gr.Slider(
373
- minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
374
- )
375
- length_scale = gr.Slider(
376
- minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
377
- )
378
- use_style_text = gr.Checkbox(label="Use stylization text", value=False)
379
- style_text = gr.Textbox(
380
- label="Style text",
381
- placeholder="Why are you ignoring me? You're unforgivable and disgusting! I hope you die.",
382
- info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.",
383
- visible=False,
384
- )
385
- style_text_weight = gr.Slider(
386
- minimum=0,
387
- maximum=1,
388
- value=0.7,
389
- step=0.1,
390
- label="Text stylization strength",
391
- visible=False,
392
- )
393
- use_style_text.change(
394
- lambda x: (gr.Textbox(visible=x), gr.Slider(visible=x)),
395
- inputs=[use_style_text],
396
- outputs=[style_text, style_text_weight],
397
- )
398
- with gr.Column():
399
- with gr.Accordion("Styling Guide", open=False):
400
- gr.Markdown(style_md)
401
- style_mode = gr.Radio(
402
- ["Select from presets", "Use an audio file"],
403
- label="Style Specification",
404
- value="Select from presets",
405
- )
406
- style = gr.Dropdown(
407
- label="Current style (Neutral is an average style)",
408
- choices=["Please load a model first!"],
409
- value="Please load a model first!",
410
- )
411
- style_weight = gr.Slider(
412
- minimum=0,
413
- maximum=50,
414
- value=5,
415
- step=0.1,
416
- label="Style strength",
417
- )
418
- ref_audio_path = gr.Audio(label="Reference Audio", type="filepath", visible=False)
419
- tts_button = gr.Button(
420
- "Synthesize (Please load a model!)", variant="primary", interactive=False
421
- )
422
- text_output = gr.Textbox(label="Info")
423
- audio_output = gr.Audio(label="Result")
424
-
425
- tts_button.click(
426
- tts_fn,
427
- inputs=[
428
- model_name,
429
- model_path,
430
- text_input,
431
- language,
432
- ref_audio_path,
433
- sdp_ratio,
434
- noise_scale,
435
- noise_scale_w,
436
- length_scale,
437
- line_split,
438
- split_interval,
439
- style_text,
440
- style_text_weight,
441
- use_style_text,
442
- style,
443
- style_weight,
444
- speaker,
445
- ],
446
- outputs=[text_output, audio_output],
447
- )
448
-
449
- model_name.change(
450
- model_holder.update_model_files_dropdown,
451
- inputs=[model_name],
452
- outputs=[model_path],
453
- )
454
-
455
- model_path.change(make_non_interactive, outputs=[tts_button])
456
-
457
- refresh_button.click(
458
- model_holder.update_model_names_dropdown,
459
- outputs=[model_name, model_path, tts_button],
460
- )
461
-
462
- load_button.click(
463
- model_holder.load_model,
464
- inputs=[model_name, model_path],
465
- outputs=[style, tts_button, speaker],
466
- )
467
-
468
- style_mode.change(
469
- gr_util,
470
- inputs=[style_mode],
471
- outputs=[style, ref_audio_path],
472
- )
473
 
474
- app.launch(inbrowser=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import os
4
  import sys
5
  import warnings
6
+ import json
7
 
8
  import gradio as gr
9
  import numpy as np
 
239
  if is_hf_spaces and len(text) > limit:
240
  return f"Too long! There is a character limit of {limit} characters.", (44100, None)
241
 
242
+ if(not model_holder.current_model):
243
+ model_holder.load_model(model_name, model_path)
244
 
245
  if(model_holder.current_model.model_path != model_path):
246
  model_holder.load_model(model_name, model_path)
 
269
 
270
  end_time = datetime.datetime.now()
271
  duration = (end_time - start_time).total_seconds()
272
+ logger.info(f"Successful inference, took {duration}s | {speaker} | {sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale}/{emotion}/{emotion_weight} | {text}")
273
  return f"Success, time: {duration} seconds.", (sr, audio)
274
 
275
def load_voicedata():
    """Load enabled voice entries and per-model style names from ``voicelist.json``.

    Returns:
        A ``(voices, styledict)`` tuple.

        ``voices`` is a list of ``(name, model_path, voice_name, speakerid,
        image)`` tuples, one per entry whose ``enable`` flag is truthy, in the
        order the entries appear in the JSON file.

        ``styledict`` maps each ``model_path`` referenced by an enabled entry
        to the list of style names taken from ``hps.data.style2id`` in that
        model's ``config.json``.

    Raises:
        OSError: if ``voicelist.json`` cannot be opened.
        KeyError: if an entry has no ``enable`` key, or an enabled entry lacks
            ``model_path``, ``title``, ``speakerid`` or ``cover``.
    """
    logger.info("Loading voice data...")
    voices = []
    styledict = {}

    # Only the parse needs the file handle; release it before touching models.
    with open("voicelist.json", "r", encoding="utf-8") as f:
        voc_info = json.load(f)

    for name, info in voc_info.items():
        if not info['enable']:
            continue
        model_path = info['model_path']
        voice_name = info['title']
        speakerid = info['speakerid']
        image = info['cover']
        # Several voices share one model; read each model's config only once.
        if model_path not in styledict:
            # os.path.join picks the host's separator, so this resolves on
            # POSIX hosts as well as Windows (the previous hard-coded
            # backslashes only formed a valid path on Windows).
            conf = os.path.join("model_assets", model_path, "config.json")
            hps = utils.get_hparams_from_file(conf)
            s2id = hps.data.style2id
            # Store a concrete list rather than a live dict view so the value
            # can be handed directly to UI components as their choices.
            styledict[model_path] = list(s2id.keys())
        voices.append((name, model_path, voice_name, speakerid, image))
    return voices, styledict
295
+
296
+
297
+ initial_text = "Hello there! This is test audio of Lemonfoot S B V 2."
298
 
299
  initial_md = """
300
  # LemonfootSBV2 😊🍋
301
+ ### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
302
+ ### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
303
  This HuggingFace space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.
304
 
305
  Do no evil.
 
354
  sys.exit(1)
355
  initial_id = 0
356
  initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
357
+ print(initial_pth_files)
358
+
359
+ voicedata, styledict = load_voicedata()
360
+
361
+ #Gradio preload
362
+ text_input = gr.TextArea(label="Text", value=initial_text)
363
+ line_split = gr.Checkbox(label="Divide text seperately by line breaks", value=True)
364
+ split_interval = gr.Slider(
365
+ minimum=0.0,
366
+ maximum=2,
367
+ value=0.5,
368
+ step=0.1,
369
+ label="Length of division seperation time (in seconds)",
370
+ )
371
+ language = gr.Dropdown(choices=languages, value="EN", label="Language")
372
+ sdp_ratio = gr.Slider(
373
+ minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
374
+ )
375
+ noise_scale = gr.Slider(
376
+ minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
377
+ )
378
+ noise_scale_w = gr.Slider(
379
+ minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
380
+ )
381
+ length_scale = gr.Slider(
382
+ minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
383
+ )
384
+ use_style_text = gr.Checkbox(label="Use stylization text", value=False)
385
+ style_text = gr.Textbox(
386
+ label="Style text",
387
+ placeholder="Check the \"Use styleization text\" box to use this option!",
388
+ info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.",
389
+ visible=True,
390
+ )
391
+ style_text_weight = gr.Slider(
392
+ minimum=0,
393
+ maximum=1,
394
+ value=0.7,
395
+ step=0.1,
396
+ label="Text stylization strength",
397
+ visible=True,
398
+ )
399
+
400
 
401
  with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="LemonfootSBV2") as app:
402
  gr.Markdown(initial_md)
403
+
404
+ for (name, model_path, voice_name, speakerid, image) in voicedata:
405
+ with gr.TabItem(name):
406
+ mn = gr.Textbox(value=model_path, visible=False, interactive=False)
407
+ mp = gr.Textbox(value=f"model_assets\\{model_path}\\{model_path}.safetensors", visible=False, interactive=False)
408
+ spk = gr.Textbox(value=speakerid, visible=False, interactive=False)
409
  with gr.Row():
410
+ with gr.Column():
411
+ gr.Markdown(f"**{voice_name}**\n\nModel name: {model_path}")
412
+ gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False)
413
+ with gr.Column():
414
+ with gr.TabItem("Preset Styles"):
415
+ style = gr.Dropdown(
416
+ label="Current style (Neutral is an average style)",
417
+ choices=styledict[model_path],
418
+ value="Neutral",
419
+ )
420
+ with gr.TabItem("Use an audio file"):
421
+ ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
422
+ style_weight = gr.Slider(
423
+ minimum=0,
424
+ maximum=50,
425
+ value=5,
426
+ step=0.1,
427
+ label="Style strength",
428
  )
429
+ with gr.Column():
430
+ tts_button = gr.Button(
431
+ "Synthesize", variant="primary", interactive=True
432
+ )
433
+ text_output = gr.Textbox(label="Info")
434
+ audio_output = gr.Audio(label="Result")
435
+
436
+ tts_button.click(
437
+ tts_fn,
438
+ inputs=[
439
+ mn,
440
+ mp,
441
+ text_input,
442
+ language,
443
+ ref_audio_path,
444
+ sdp_ratio,
445
+ noise_scale,
446
+ noise_scale_w,
447
+ length_scale,
448
+ line_split,
449
+ split_interval,
450
+ style_text,
451
+ style_text_weight,
452
+ use_style_text,
453
+ style,
454
+ style_weight,
455
+ spk,
456
+ ],
457
+ outputs=[text_output, audio_output],
458
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
 
460
+ with gr.Row():
461
+ with gr.Column():
462
+ text_input.render()
463
+ line_split.render()
464
+ split_interval.render()
465
+ language.render()
466
+ with gr.Column():
467
+ sdp_ratio.render()
468
+ noise_scale.render()
469
+ noise_scale_w.render()
470
+ length_scale.render()
471
+ use_style_text.render()
472
+ style_text.render()
473
+ style_text_weight.render()
474
+
475
+ with gr.Accordion("Styling Guide", open=False):
476
+ gr.Markdown(style_md)
477
+
478
+ app.launch(allowed_paths=['/file/images/'])
images/ame.png ADDED
images/anya.png ADDED
images/bae.png ADDED
images/calli.png ADDED
images/fauna.png ADDED
images/gura.png ADDED
images/ina.png ADDED
images/iofi.png ADDED
images/irys.png ADDED
images/kronii.png ADDED
images/mumei.png ADDED
images/nerissa.png ADDED
images/sana.png ADDED
images/shiori.png ADDED
voicelist.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Calliope": {
3
+ "enable": true,
4
+ "model_path": "SBV2_HoloLow",
5
+ "title": "Mori Calliope",
6
+ "speakerid": "MoriCalliope",
7
+ "cover": "calli.png"
8
+ },
9
+ "Ina": {
10
+ "enable": false,
11
+ "model_path": "SBV2_HoloHi",
12
+ "title": "Ninomae Ina'nis",
13
+ "speakerid": "NinomaeInanis",
14
+ "cover": "ina.png"
15
+ },
16
+ "Gura": {
17
+ "enable": false,
18
+ "model_path": "SBV2_HoloHi",
19
+ "title": "Gawr Gura",
20
+ "speakerid": "GawrGura",
21
+ "cover": "gura.png"
22
+ },
23
+ "Ame": {
24
+ "enable": false,
25
+ "model_path": "SBV2_HoloHi",
26
+ "title": "Amelia Watson",
27
+ "speakerid": "AmeliaWatson",
28
+ "cover": "ame.png"
29
+ },
30
+ "IRyS": {
31
+ "enable": false,
32
+ "model_path": "SBV2_HoloHi",
33
+ "title": "IRyS",
34
+ "speakerid": "IRyS",
35
+ "cover": "irys.png"
36
+ },
37
+ "Sana": {
38
+ "enable": true,
39
+ "model_path": "SBV2_HoloAus",
40
+ "title": "Tsukumo Sana",
41
+ "speakerid": "TsukumoSana",
42
+ "cover": "sana.png"
43
+ },
44
+ "Fauna": {
45
+ "enable": false,
46
+ "model_path": "SBV2_HoloHi",
47
+ "title": "Ceres Fauna",
48
+ "speakerid": "CeresFauna",
49
+ "cover": "fauna.png"
50
+ },
51
+ "Kronii": {
52
+ "enable": true,
53
+ "model_path": "SBV2_HoloLow",
54
+ "title": "Ouro Kronii",
55
+ "speakerid": "OuroKronii",
56
+ "cover": "kronii.png"
57
+ },
58
+ "Mumei": {
59
+ "enable": false,
60
+ "model_path": "SBV2_HoloHi",
61
+ "title": "Nanashi Mumei",
62
+ "speakerid": "NanashiMumei",
63
+ "cover": "mumei.png"
64
+ },
65
+ "Baelz": {
66
+ "enable": true,
67
+ "model_path": "SBV2_HoloAus",
68
+ "title": "Hakos Baelz",
69
+ "speakerid": "HakosBaelz",
70
+ "cover": "bae.png"
71
+ },
72
+ "Shiori": {
73
+ "enable": false,
74
+ "model_path": "SBV2_HoloHi",
75
+ "title": "Shiori Novella",
76
+ "speakerid": "ShioriNovella",
77
+ "cover": "shiori.png"
78
+ },
79
+ "Nerissa": {
80
+ "enable": true,
81
+ "model_path": "SBV2_HoloLow",
82
+ "title": "Nerissa Ravencroft",
83
+ "speakerid": "NerissaRavencroft",
84
+ "cover": "nerissa.png"
85
+ },
86
+ "Iofi": {
87
+ "enable": true,
88
+ "model_path": "SBV2_HoloESL",
89
+ "title": "Airani Iofifteen",
90
+ "speakerid": "AiraniIofifteen",
91
+ "cover": "iofi.png"
92
+ },
93
+ "Anya": {
94
+ "enable": true,
95
+ "model_path": "SBV2_HoloESL",
96
+ "title": "Anya Melfissa",
97
+ "speakerid": "AnyaMelfissa",
98
+ "cover": "anya.png"
99
+ }
100
+ }