Hololive-Style-Bert-VITS2

Running

App Files Files Community

Kit-Lemonfoot committed on Jan 19, 2024

Commit

6de3522

verified ·

1 Parent(s): 2714e75

Upload 16 files

Browse files

Files changed (16) hide show

app.py +144 -140
images/ame.png +0 -0
images/anya.png +0 -0
images/bae.png +0 -0
images/calli.png +0 -0
images/fauna.png +0 -0
images/gura.png +0 -0
images/ina.png +0 -0
images/iofi.png +0 -0
images/irys.png +0 -0
images/kronii.png +0 -0
images/mumei.png +0 -0
images/nerissa.png +0 -0
images/sana.png +0 -0
images/shiori.png +0 -0
voicelist.json +100 -0

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import datetime
 import os
 import sys
 import warnings
 import gradio as gr
 import numpy as np
@@ -238,7 +239,8 @@ def tts_fn(
     if is_hf_spaces and len(text) > limit:
         return f"Too long! There is a character limit of {limit} characters.", (44100, None)
-    assert model_holder.current_model is not None
     if(model_holder.current_model.model_path != model_path):
         model_holder.load_model(model_name, model_path)
@@ -267,16 +269,37 @@ def tts_fn(
     end_time = datetime.datetime.now()
     duration = (end_time - start_time).total_seconds()
-    logger.info(f"Successful inference, took {duration}s | {speaker} | {sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale} | {text}")
     return f"Success, time: {duration} seconds.", (sr, audio)
-initial_text = "Hi there! How are you doing?"
 initial_md = """
 # LemonfootSBV2 😊🍋
-### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)  /  [Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
-### Based on code originally by  [fishaudio](https://github.com/fishaudio)  and  [litagin02](https://github.com/litagin02)
 This HuggingFace space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.
 Do no evil.
@@ -331,144 +354,125 @@ if __name__ == "__main__":
         sys.exit(1)
     initial_id = 0
     initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
     with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="LemonfootSBV2") as app:
         gr.Markdown(initial_md)
-        with gr.Row():
-            with gr.Column():
                 with gr.Row():
-                    with gr.Column(scale=3):
-                        model_name = gr.Dropdown(
-                            label="Available Models",
-                            choices=model_names,
-                            value=model_names[initial_id],
                         )
-                        model_path = gr.Dropdown(
-                            label="Model File",
-                            choices=initial_pth_files,
-                            value=initial_pth_files[0],
                         )
-                    refresh_button = gr.Button("Refresh", scale=1, visible=not is_hf_spaces)
-                    load_button = gr.Button("Load", scale=1, variant="primary")
-                text_input = gr.TextArea(label="Text", value=initial_text)
-                line_split = gr.Checkbox(label="Divide text seperately by line breaks", value=True)
-                split_interval = gr.Slider(
-                    minimum=0.0,
-                    maximum=2,
-                    value=0.5,
-                    step=0.1,
-                    label="Length of division seperation time (in seconds)",
-                )
-                language = gr.Dropdown(choices=languages, value="EN", label="Language")
-                speaker = gr.Dropdown(label="Speaker")
-                with gr.Accordion(label="Advanced Settings", open=False):
-                    sdp_ratio = gr.Slider(
-                        minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
-                    )
-                    noise_scale = gr.Slider(
-                        minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
-                    )
-                    noise_scale_w = gr.Slider(
-                        minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
-                    )
-                    length_scale = gr.Slider(
-                        minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
-                    )
-                    use_style_text = gr.Checkbox(label="Use stylization text", value=False)
-                    style_text = gr.Textbox(
-                        label="Style text",
-                        placeholder="Why are you ignoring me? You're unforgivable and disgusting! I hope you die.",
-                        info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.",
-                        visible=False,
-                    )
-                    style_text_weight = gr.Slider(
-                        minimum=0,
-                        maximum=1,
-                        value=0.7,
-                        step=0.1,
-                        label="Text stylization strength",
-                        visible=False,
-                    )
-                    use_style_text.change(
-                        lambda x: (gr.Textbox(visible=x), gr.Slider(visible=x)),
-                        inputs=[use_style_text],
-                        outputs=[style_text, style_text_weight],
-                    )
-            with gr.Column():
-                with gr.Accordion("Styling Guide", open=False):
-                    gr.Markdown(style_md)
-                style_mode = gr.Radio(
-                    ["Select from presets", "Use an audio file"],
-                    label="Style Specification",
-                    value="Select from presets",
-                )
-                style = gr.Dropdown(
-                    label="Current style (Neutral is an average style)",
-                    choices=["Please load a model first!"],
-                    value="Please load a model first!",
-                )
-                style_weight = gr.Slider(
-                    minimum=0,
-                    maximum=50,
-                    value=5,
-                    step=0.1,
-                    label="Style strength",
-                )
-                ref_audio_path = gr.Audio(label="Reference Audio", type="filepath", visible=False)
-                tts_button = gr.Button(
-                    "Synthesize (Please load a model!)", variant="primary", interactive=False
-                )
-                text_output = gr.Textbox(label="Info")
-                audio_output = gr.Audio(label="Result")
-        tts_button.click(
-            tts_fn,
-            inputs=[
-                model_name,
-                model_path,
-                text_input,
-                language,
-                ref_audio_path,
-                sdp_ratio,
-                noise_scale,
-                noise_scale_w,
-                length_scale,
-                line_split,
-                split_interval,
-                style_text,
-                style_text_weight,
-                use_style_text,
-                style,
-                style_weight,
-                speaker,
-            ],
-            outputs=[text_output, audio_output],
-        )
-        model_name.change(
-            model_holder.update_model_files_dropdown,
-            inputs=[model_name],
-            outputs=[model_path],
-        )
-        model_path.change(make_non_interactive, outputs=[tts_button])
-        refresh_button.click(
-            model_holder.update_model_names_dropdown,
-            outputs=[model_name, model_path, tts_button],
-        )
-        load_button.click(
-            model_holder.load_model,
-            inputs=[model_name, model_path],
-            outputs=[style, tts_button, speaker],
-        )
-        style_mode.change(
-            gr_util,
-            inputs=[style_mode],
-            outputs=[style, ref_audio_path],
-        )
-    app.launch(inbrowser=True)

 import os
 import sys
 import warnings
+import json
 import gradio as gr
 import numpy as np
     if is_hf_spaces and len(text) > limit:
         return f"Too long! There is a character limit of {limit} characters.", (44100, None)
+    if(not model_holder.current_model):
+        model_holder.load_model(model_name, model_path)
     if(model_holder.current_model.model_path != model_path):
         model_holder.load_model(model_name, model_path)
     end_time = datetime.datetime.now()
     duration = (end_time - start_time).total_seconds()
+    logger.info(f"Successful inference, took {duration}s | {speaker} | {sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale}/{emotion}/{emotion_weight} | {text}")
     return f"Success, time: {duration} seconds.", (sr, audio)
+def load_voicedata():
+    logger.info("Loading voice data...")
+    voices = []
+    styledict = {}
+    with open("voicelist.json", "r", encoding="utf-8") as f:
+        voc_info = json.load(f)
+    for name, info in voc_info.items():
+        if not info['enable']:
+            continue
+        model_path = info['model_path']
+        voice_name = info['title']
+        speakerid = info['speakerid']
+        image = info['cover']
+        if not model_path in styledict.keys():
+           conf=f"model_assets\\{model_path}\\config.json"
+           hps = utils.get_hparams_from_file(conf)
+           s2id = hps.data.style2id
+           styledict[model_path] = s2id.keys()
+        voices.append((name, model_path, voice_name, speakerid, image))
+    return voices, styledict
+initial_text = "Hello there! This is test audio of Lemonfoot S B V 2."
 initial_md = """
 # LemonfootSBV2 😊🍋
+### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
+### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
 This HuggingFace space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.
 Do no evil.
         sys.exit(1)
     initial_id = 0
     initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
+    print(initial_pth_files)
+    voicedata, styledict = load_voicedata()
+    #Gradio preload
+    text_input = gr.TextArea(label="Text", value=initial_text)
+    line_split = gr.Checkbox(label="Divide text seperately by line breaks", value=True)
+    split_interval = gr.Slider(
+        minimum=0.0,
+        maximum=2,
+        value=0.5,
+        step=0.1,
+        label="Length of division seperation time (in seconds)",
+    )
+    language = gr.Dropdown(choices=languages, value="EN", label="Language")
+    sdp_ratio = gr.Slider(
+        minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
+    )
+    noise_scale = gr.Slider(
+        minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
+    )
+    noise_scale_w = gr.Slider(
+        minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
+    )
+    length_scale = gr.Slider(
+        minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
+    )
+    use_style_text = gr.Checkbox(label="Use stylization text", value=False)
+    style_text = gr.Textbox(
+        label="Style text",
+        placeholder="Check the \"Use styleization text\" box to use this option!",
+        info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.",
+        visible=True,
+    )
+    style_text_weight = gr.Slider(
+        minimum=0,
+        maximum=1,
+        value=0.7,
+        step=0.1,
+        label="Text stylization strength",
+        visible=True,
+    )
     with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="LemonfootSBV2") as app:
         gr.Markdown(initial_md)
+        for (name, model_path, voice_name, speakerid, image) in voicedata:
+            with gr.TabItem(name):
+                mn = gr.Textbox(value=model_path, visible=False, interactive=False)
+                mp = gr.Textbox(value=f"model_assets\\{model_path}\\{model_path}.safetensors", visible=False, interactive=False)
+                spk = gr.Textbox(value=speakerid, visible=False, interactive=False)
                 with gr.Row():
+                    with gr.Column():
+                        gr.Markdown(f"**{voice_name}**\n\nModel name: {model_path}")
+                        gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False)
+                    with gr.Column():
+                        with gr.TabItem("Preset Styles"):
+                            style = gr.Dropdown(
+                                label="Current style (Neutral is an average style)",
+                                choices=styledict[model_path],
+                                value="Neutral",
+                            )
+                        with gr.TabItem("Use an audio file"):
+                            ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
+                        style_weight = gr.Slider(
+                            minimum=0,
+                            maximum=50,
+                            value=5,
+                            step=0.1,
+                            label="Style strength",
                         )
+                    with gr.Column():
+                        tts_button = gr.Button(
+                            "Synthesize", variant="primary", interactive=True
+                        )
+                        text_output = gr.Textbox(label="Info")
+                        audio_output = gr.Audio(label="Result")
+                        tts_button.click(
+                            tts_fn,
+                            inputs=[
+                                mn,
+                                mp,
+                                text_input,
+                                language,
+                                ref_audio_path,
+                                sdp_ratio,
+                                noise_scale,
+                                noise_scale_w,
+                                length_scale,
+                                line_split,
+                                split_interval,
+                                style_text,
+                                style_text_weight,
+                                use_style_text,
+                                style,
+                                style_weight,
+                                spk,
+                            ],
+                            outputs=[text_output, audio_output],
                         )
+        with gr.Row():
+            with gr.Column():
+                text_input.render()
+                line_split.render()
+                split_interval.render()
+                language.render()
+            with gr.Column():
+                sdp_ratio.render()
+                noise_scale.render()
+                noise_scale_w.render()
+                length_scale.render()
+                use_style_text.render()
+                style_text.render()
+                style_text_weight.render()
+        with gr.Accordion("Styling Guide", open=False):
+            gr.Markdown(style_md)
+    app.launch(allowed_paths=['/file/images/'])

images/ame.png ADDED Viewed

images/anya.png ADDED Viewed

images/bae.png ADDED Viewed

images/calli.png ADDED Viewed

images/fauna.png ADDED Viewed

images/gura.png ADDED Viewed

images/ina.png ADDED Viewed

images/iofi.png ADDED Viewed

images/irys.png ADDED Viewed

images/kronii.png ADDED Viewed

images/mumei.png ADDED Viewed

images/nerissa.png ADDED Viewed

images/sana.png ADDED Viewed

images/shiori.png ADDED Viewed

voicelist.json ADDED Viewed

	@@ -0,0 +1,100 @@

+{
+  "Calliope": {
+    "enable": true,
+    "model_path": "SBV2_HoloLow",
+    "title": "Mori Calliope",
+	"speakerid": "MoriCalliope",
+    "cover": "calli.png"
+  },
+  "Ina": {
+    "enable": false,
+    "model_path": "SBV2_HoloHi",
+    "title": "Ninomae Ina'nis",
+	"speakerid": "NinomaeInanis",
+    "cover": "ina.png"
+  },
+  "Gura": {
+    "enable": false,
+    "model_path": "SBV2_HoloHi",
+    "title": "Gawr Gura",
+	"speakerid": "GawrGura",
+    "cover": "gura.png"
+  },
+  "Ame": {
+    "enable": false,
+    "model_path": "SBV2_HoloHi",
+    "title": "Amelia Watson",
+	"speakerid": "AmeliaWatson",
+    "cover": "ame.png"
+  },
+  "IRyS": {
+    "enable": false,
+    "model_path": "SBV2_HoloHi",
+    "title": "IRyS",
+	"speakerid": "IRyS",
+    "cover": "irys.png"
+  },
+  "Sana": {
+    "enable": true,
+    "model_path": "SBV2_HoloAus",
+    "title": "Tsukumo Sana",
+	"speakerid": "TsukumoSana",
+    "cover": "sana.png"
+  },
+  "Fauna": {
+    "enable": false,
+    "model_path": "SBV2_HoloHi",
+    "title": "Ceres Fauna",
+	"speakerid": "CeresFauna",
+    "cover": "fauna.png"
+  },
+  "Kronii": {
+    "enable": true,
+    "model_path": "SBV2_HoloLow",
+    "title": "Ouro Kronii",
+	"speakerid": "OuroKronii",
+    "cover": "kronii.png"
+  },
+  "Mumei": {
+    "enable": false,
+    "model_path": "SBV2_HoloHi",
+    "title": "Nanashi Mumei",
+	"speakerid": "NanashiMumei",
+    "cover": "mumei.png"
+  },
+  "Baelz": {
+    "enable": true,
+    "model_path": "SBV2_HoloAus",
+    "title": "Hakos Baelz",
+	"speakerid": "HakosBaelz",
+    "cover": "bae.png"
+  },
+  "Shiori": {
+    "enable": false,
+    "model_path": "SBV2_HoloHi",
+    "title": "Shiori Novella",
+	"speakerid": "ShioriNovella",
+    "cover": "shiori.png"
+  },
+  "Nerissa": {
+    "enable": true,
+    "model_path": "SBV2_HoloLow",
+    "title": "Nerissa Ravencroft",
+	"speakerid": "NerissaRavencroft",
+    "cover": "nerissa.png"
+  },
+  "Iofi": {
+    "enable": true,
+    "model_path": "SBV2_HoloESL",
+    "title": "Airani Iofifteen",
+	"speakerid": "AiraniIofifteen",
+    "cover": "iofi.png"
+  },
+  "Anya": {
+    "enable": true,
+    "model_path": "SBV2_HoloESL",
+    "title": "Anya Melfissa",
+	"speakerid": "AnyaMelfissa",
+    "cover": "anya.png"
+  }
+}