print("Starting up. Please be patient...") import argparse import datetime import os import sys import warnings import json import gradio as gr import numpy as np import torch from gradio.processing_utils import convert_to_16_bit_wav import utils from config import config from infer import get_net_g, infer from tools.log import logger is_hf_spaces = os.getenv("SYSTEM") == "spaces" limit = 150 class Model: def __init__(self, model_path, config_path, style_vec_path, device): self.model_path = model_path self.config_path = config_path self.device = device self.style_vec_path = style_vec_path self.load() def load(self): self.hps = utils.get_hparams_from_file(self.config_path) self.spk2id = self.hps.data.spk2id self.num_styles = self.hps.data.num_styles if hasattr(self.hps.data, "style2id"): self.style2id = self.hps.data.style2id else: self.style2id = {str(i): i for i in range(self.num_styles)} self.style_vectors = np.load(self.style_vec_path) self.net_g = None def load_net_g(self): self.net_g = get_net_g( model_path=self.model_path, version=self.hps.version, device=self.device, hps=self.hps, ) def get_style_vector(self, style_id, weight=1.0): mean = self.style_vectors[0] style_vec = self.style_vectors[style_id] style_vec = mean + (style_vec - mean) * weight return style_vec def get_style_vector_from_audio(self, audio_path, weight=1.0): from style_gen import extract_style_vector xvec = extract_style_vector(audio_path) mean = self.style_vectors[0] xvec = mean + (xvec - mean) * weight return xvec def infer( self, text, language="JP", sid=0, reference_audio_path=None, sdp_ratio=0.2, noise=0.6, noisew=0.8, length=1.0, line_split=True, split_interval=0.2, style_text="", style_weight=0.7, use_style_text=False, style="0", emotion_weight=1.0, ): if reference_audio_path == "": reference_audio_path = None if style_text == "" or not use_style_text: style_text = None if self.net_g is None: self.load_net_g() if reference_audio_path is None: style_id = self.style2id[style] style_vector = self.get_style_vector(style_id, emotion_weight) else: style_vector = self.get_style_vector_from_audio( reference_audio_path, emotion_weight ) if not line_split: with torch.no_grad(): audio = infer( text=text, sdp_ratio=sdp_ratio, noise_scale=noise, noise_scale_w=noisew, length_scale=length, sid=sid, language=language, hps=self.hps, net_g=self.net_g, device=self.device, style_text=style_text, style_weight=style_weight, style_vec=style_vector, ) else: texts = text.split("\n") texts = [t for t in texts if t != ""] audios = [] with torch.no_grad(): for i, t in enumerate(texts): audios.append( infer( text=t, sdp_ratio=sdp_ratio, noise_scale=noise, noise_scale_w=noisew, length_scale=length, sid=sid, language=language, hps=self.hps, net_g=self.net_g, device=self.device, style_text=style_text, style_weight=style_weight, style_vec=style_vector, ) ) if i != len(texts) - 1: audios.append(np.zeros(int(44100 * split_interval))) audio = np.concatenate(audios) with warnings.catch_warnings(): warnings.simplefilter("ignore") audio = convert_to_16_bit_wav(audio) return (self.hps.data.sampling_rate, audio) class ModelHolder: def __init__(self, root_dir, device): self.root_dir = root_dir self.device = device self.model_files_dict = {} self.current_model = None self.model_names = [] self.models = [] self.refresh() def refresh(self): self.model_files_dict = {} self.model_names = [] self.current_model = None model_dirs = [ d for d in os.listdir(self.root_dir) if os.path.isdir(os.path.join(self.root_dir, d)) ] for model_name in model_dirs: 
            model_dir = os.path.join(self.root_dir, model_name)
            model_files = [
                os.path.join(model_dir, f)
                for f in os.listdir(model_dir)
                if f.endswith(".pth") or f.endswith(".pt") or f.endswith(".safetensors")
            ]
            if len(model_files) == 0:
                logger.info(
                    f"No model files found in {self.root_dir}/{model_name}, so skip it"
                )
                continue
            self.model_files_dict[model_name] = model_files
            self.model_names.append(model_name)

    def load_model(self, model_name, model_path):
        if model_name not in self.model_files_dict:
            raise Exception(f"Model name {model_name} does not exist")
        if model_path not in self.model_files_dict[model_name]:
            raise Exception(f"Model file {model_path} does not exist")
        self.current_model = Model(
            model_path=model_path,
            config_path=os.path.join(self.root_dir, model_name, "config.json"),
            style_vec_path=os.path.join(self.root_dir, model_name, "style_vectors.npy"),
            device=self.device,
        )
        styles = list(self.current_model.style2id.keys())
        speakers = list(self.current_model.spk2id.keys())
        return (
            gr.Dropdown(choices=styles, value=styles[0]),
            gr.update(interactive=True, value="Synthesize"),
            gr.Dropdown(choices=speakers, value=speakers[0]),
        )

    def update_model_files_dropdown(self, model_name):
        model_files = self.model_files_dict[model_name]
        return gr.Dropdown(choices=model_files, value=model_files[0])

    def update_model_names_dropdown(self):
        self.refresh()
        initial_model_name = self.model_names[0]
        initial_model_files = self.model_files_dict[initial_model_name]
        return (
            gr.Dropdown(choices=self.model_names, value=initial_model_name),
            gr.Dropdown(choices=initial_model_files, value=initial_model_files[0]),
            gr.update(interactive=False),  # For tts_button
        )


def tts_fn(
    model_name,
    model_path,
    text,
    language,
    reference_audio_path,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    line_split,
    split_interval,
    style_text,
    style_weight,
    use_style_text,
    emotion,
    emotion_weight,
    speaker,
):
    if not text:
        return "Please enter some text.", (44100, None)

    # logger.info(f"Start TTS with {language}:\n{text}")
    # logger.info(f"Model: {model_holder.current_model.model_path}")
    # logger.info(f"SDP: {sdp_ratio}, Noise: {noise_scale}, Noise_W: {noise_scale_w}, Length: {length_scale}")
    # logger.info(f"Style text enabled: {use_style_text}, Style text: {style_text}, Style weight: {style_weight}")
    # logger.info(f"Style: {emotion}, Style weight: {emotion_weight}")

    if is_hf_spaces and len(text) > limit:
        return (
            f"Too long! There is a character limit of {limit} characters.",
            (44100, None),
        )
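
    # Load the model lazily on first use, and reload it if a different checkpoint is selected.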
    if (
        model_holder.current_model is None
        or model_holder.current_model.model_path != model_path
    ):
        model_holder.load_model(model_name, model_path)

    speaker_id = model_holder.current_model.spk2id[speaker]

    start_time = datetime.datetime.now()
    sr, audio = model_holder.current_model.infer(
        text=text,
        language=language,
        sid=speaker_id,
        reference_audio_path=reference_audio_path,
        sdp_ratio=sdp_ratio,
        noise=noise_scale,
        noisew=noise_scale_w,
        length=length_scale,
        line_split=line_split,
        split_interval=split_interval,
        style_text=style_text,
        style_weight=style_weight,
        use_style_text=use_style_text,
        style=emotion,
        emotion_weight=emotion_weight,
    )
    end_time = datetime.datetime.now()
    duration = (end_time - start_time).total_seconds()
    logger.info(
        f"Successful inference, took {duration}s | {speaker} | "
        f"{sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale}/{emotion}/{emotion_weight} | {text}"
    )
    return f"Success, time: {duration} seconds.", (sr, audio)


def load_voicedata():
    logger.info("Loading voice data...")
    voices = []
    styledict = {}
    with open("voicelist.json", "r", encoding="utf-8") as f:
        voc_info = json.load(f)
    for name, info in voc_info.items():
        if not info['enable']:
            continue
        model_path = info['model_path']
        voice_name = info['title']
        speakerid = info['speakerid']
        image = info['cover']
        if model_path not in styledict:
            conf = f"model_assets/{model_path}/config.json"
            hps = utils.get_hparams_from_file(conf)
            s2id = hps.data.style2id
            styledict[model_path] = s2id.keys()
        voices.append((name, model_path, voice_name, speakerid, image))
    return voices, styledict


initial_text = "Hello there! This is test audio of Lemonfoot S B V 2."

initial_md = """
# LemonfootSBV2 😊🍋
### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)

This HuggingFace space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.

Do no evil.

**Note:** Most of my models are a *work in progress.* They may not sound fully correct.
"""

style_md = """
- You can control things like voice tone, emotion, and reading style through presets or through voice files.
- Neutral acts as an average across all speakers. Styling options act as an override to Neutral.
- Setting the intensity too high will likely break the output.
- The required intensity will depend on the speaker and the desired style.
- If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker.
"""
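
# UI helper callbacks: toggle the Synthesize button's state, and swap between
# the preset style selector and the reference-audio input.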
""" def make_interactive(): return gr.update(interactive=True, value="Synthesize") def make_non_interactive(): return gr.update(interactive=False, value="Synthesize (Please load a model!)") def gr_util(item): if item == "Select from presets": return (gr.update(visible=True), gr.Audio(visible=False, value=None)) else: return (gr.update(visible=False), gr.update(visible=True)) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU") parser.add_argument( "--dir", "-d", type=str, help="Model directory", default=config.out_dir ) args = parser.parse_args() model_dir = args.dir if args.cpu: device = "cpu" else: device = "cuda" if torch.cuda.is_available() else "cpu" model_holder = ModelHolder(model_dir, device) languages = ["EN", "JP", "ZH"] model_names = model_holder.model_names if len(model_names) == 0: logger.error(f"No models found. Please place the model in {model_dir}.") sys.exit(1) initial_id = 0 initial_pth_files = model_holder.model_files_dict[model_names[initial_id]] print(initial_pth_files) voicedata, styledict = load_voicedata() #Gradio preload text_input = gr.TextArea(label="Text", value=initial_text) line_split = gr.Checkbox(label="Divide text seperately by line breaks", value=True) split_interval = gr.Slider( minimum=0.0, maximum=2, value=0.5, step=0.1, label="Length of division seperation time (in seconds)", ) language = gr.Dropdown(choices=languages, value="EN", label="Language") sdp_ratio = gr.Slider( minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio" ) noise_scale = gr.Slider( minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise" ) noise_scale_w = gr.Slider( minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W" ) length_scale = gr.Slider( minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length" ) use_style_text = gr.Checkbox(label="Use stylization text", value=False) style_text = gr.Textbox( label="Style text", placeholder="Check the \"Use stylization text\" box to use this option!", info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.", visible=True, ) style_text_weight = gr.Slider( minimum=0, maximum=1, value=0.7, step=0.1, label="Text stylization strength", visible=True, ) with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="LemonfootSBV2") as app: gr.Markdown(initial_md) for (name, model_path, voice_name, speakerid, image) in voicedata: with gr.TabItem(name): mn = gr.Textbox(value=model_path, visible=False, interactive=False) mp = gr.Textbox(value=f"model_assets/{model_path}/{model_path}.safetensors", visible=False, interactive=False) spk = gr.Textbox(value=speakerid, visible=False, interactive=False) with gr.Row(): with gr.Column(): gr.Markdown(f"**{voice_name}**\n\nModel name: {model_path}") gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False, show_share_button=False) with gr.Column(): with gr.TabItem("Style using a preset"): style = gr.Dropdown( label="Current style (Neutral is an average style)", choices=styledict[model_path], value="Neutral", ) with gr.TabItem("Style using existing audio"): ref_audio_path = gr.Audio(label="Reference Audio", type="filepath") style_weight = gr.Slider( minimum=0, maximum=50, value=5, step=0.1, label="Style strength", ) with gr.Column(): tts_button = gr.Button( "Synthesize", variant="primary", interactive=True ) text_output = gr.Textbox(label="Info") audio_output = 
gr.Audio(label="Result") tts_button.click( tts_fn, inputs=[ mn, mp, text_input, language, ref_audio_path, sdp_ratio, noise_scale, noise_scale_w, length_scale, line_split, split_interval, style_text, style_text_weight, use_style_text, style, style_weight, spk, ], outputs=[text_output, audio_output], ) with gr.Row(): with gr.Column(): text_input.render() line_split.render() split_interval.render() language.render() with gr.Column(): sdp_ratio.render() noise_scale.render() noise_scale_w.render() length_scale.render() use_style_text.render() style_text.render() style_text_weight.render() with gr.Accordion("Styling Guide", open=False): gr.Markdown(style_md) app.launch(allowed_paths=['/file/images/'])