from typing import Optional, Union, Dict
from bark.mode_load import BarkModelLoader, SAMPLE_RATE
from vits.modules import commons
from vits.text import text_to_sequence
from torch import LongTensor
from speakers.common.registry import registry
from speakers.processors import BaseProcessor, ProcessorData
import os
import logging
import numpy as np
import nltk  # we'll use this to split into sentences
from nltk.tokenize import RegexpTokenizer

logger = logging.getLogger('bark_to_voice')


def set_bark_to_voice_logger(l):
    """Replace the module-level logger (e.g. to route logs into a host application)."""
    global logger
    logger = l


# A quarter second of silence, inserted between generated sentence chunks.
silence = np.zeros(int(0.25 * SAMPLE_RATE))


def get_text(text, hps):
    """Convert raw text into a VITS token-id tensor.

    Args:
        text: input text to clean and tokenize.
        hps: hyper-parameter namespace providing ``symbols``,
            ``data.text_cleaners`` and ``data.add_blank``.

    Returns:
        Tuple of (``LongTensor`` of token ids, cleaned text string).
    """
    text_norm, clean_text = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave blank (0) tokens between symbols, as VITS expects.
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm, clean_text


class BarkProcessorData(ProcessorData):
    """Request payload for Bark text-to-speech generation.

    :param text: text to synthesize
    :param speaker_history_prompt: speaker preset (``.npz`` history-prompt file)
    :param text_temp: text-generation temperature; the closer to 1.0, the more
        pronounced special prompt markers become
    :param waveform_temp: temperature used when decoding the hidden
        representation into audio
    """
    # Text to synthesize.
    text: str
    # Speaker preset (.npz history-prompt file name).
    speaker_history_prompt: str
    # Text-generation temperature.
    text_temp: float
    # Waveform decoding temperature.
    waveform_temp: float

    @property
    def type(self) -> str:
        """Type of the Message, used for serialization."""
        return "BARK"


@registry.register_processor("bark_to_voice")
class BarkToVoice(BaseProcessor):
    """Processor that turns text into speech audio using the Bark models."""

    def __init__(self, tokenizer_path: str, text_path: str, coarse_path: str, fine_path: str):
        super().__init__()
        self._load_bark_mode(tokenizer_path=tokenizer_path,
                             text_path=text_path,
                             coarse_path=coarse_path,
                             fine_path=fine_path)

    def __call__(self, data: BarkProcessorData):
        """Synthesize ``data.text`` and return a single numpy audio array.

        Long input is first normalized (newlines become sentence breaks) and
        split into sentence-like chunks on CJK/ASCII punctuation so each chunk
        fits a single Bark generation; chunks are joined with a quarter second
        of silence.

        Args:
            data: generation request (text, speaker preset, temperatures).

        Returns:
            Concatenated numpy audio array (sample rate ``SAMPLE_RATE``).
        """
        # Split into sentence-ish pieces to cope with long scripts.
        script = data.text.replace("\n", "。").strip()
        tokenizer = RegexpTokenizer(r'[^,。!?]+[,。!?]?')
        sentences = tokenizer.tokenize(script)
        pieces = []
        logger.info("sentences:%s", sentences)
        for sentence in sentences:
            audio_array = self._generate_audio(
                text=sentence,
                history_prompt_dir=registry.get_path('bark_library_root'),
                history_prompt=data.speaker_history_prompt,
                text_temp=data.text_temp,
                waveform_temp=data.waveform_temp)
            pieces += [audio_array, silence.copy()]
        return np.concatenate(pieces)

    @classmethod
    def from_config(cls, cfg=None):
        """Build a :class:`BarkToVoice` from a config mapping of model paths.

        Raises:
            RuntimeError: if ``cfg`` is ``None``.
        """
        if cfg is None:
            raise RuntimeError("from_config cfg is None.")
        tokenizer_path = cfg.get("tokenizer_path", "")
        text_model_path = cfg.get("text_model_path", "")
        coarse_model_path = cfg.get("coarse_model_path", "")
        fine_model_path = cfg.get("fine_model_path", "")
        # NOTE(review): Bark model paths are resolved under "vits_library_root",
        # not "bark_library_root" — confirm this is intentional.
        root = registry.get_path("vits_library_root")
        return cls(tokenizer_path=os.path.join(root, tokenizer_path),
                   text_path=os.path.join(root, text_model_path),
                   coarse_path=os.path.join(root, coarse_model_path),
                   fine_path=os.path.join(root, fine_model_path))

    def match(self, data: ProcessorData):
        """Return True when this processor can handle ``data`` (type contains "BARK")."""
        return "BARK" in data.type

    def _load_bark_mode(self, tokenizer_path: str, text_path: str,
                        coarse_path: str, fine_path: str):
        """Load the Bark tokenizer and the text/coarse/fine models onto the registry device."""
        logger.info('Bark model loading')
        self.bark_load = BarkModelLoader(tokenizer_path=tokenizer_path,
                                         text_path=text_path,
                                         coarse_path=coarse_path,
                                         fine_path=fine_path,
                                         device=registry.get("device"))
        logger.info('Models loaded bark')

    def _generate_audio(
            self,
            text: str,
            history_prompt: Optional[str] = None,
            history_prompt_dir: str = None,
            text_temp: float = 0.7,
            waveform_temp: float = 0.7,
            fine_temp: float = 0.5,
            silent: bool = False,
            output_full: bool = False):
        """Generate audio array from input text.

        Args:
            text: text to be turned into audio
            history_prompt: history choice for audio cloning
            history_prompt_dir: directory containing history-prompt presets
            text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            fine_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            silent: disable progress bar
            output_full: return full generation to be used as a history prompt

        Returns:
            numpy audio array at sample frequency 24khz (preceded by the full
            generation dict when ``output_full`` is True)
        """
        semantic_tokens = self._text_to_semantic(
            text,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=text_temp,
            silent=silent,
        )
        out = self._semantic_to_waveform(
            semantic_tokens,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=waveform_temp,
            fine_temp=fine_temp,
            silent=silent,
            output_full=output_full,
        )
        if output_full:
            # out is (full_generation, audio_arr) in this case.
            full_generation, audio_arr = out
            return full_generation, audio_arr
        return out

    def _text_to_semantic(
            self,
            text: str,
            history_prompt: Optional[str] = None,
            history_prompt_dir=None,
            temp: float = 0.7,
            silent: bool = False):
        """Generate semantic array from text.

        Args:
            text: text to be turned into audio
            history_prompt: history choice for audio cloning
            history_prompt_dir: directory containing history-prompt presets
            temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            silent: disable progress bar

        Returns:
            numpy semantic array to be fed into `semantic_to_waveform`
        """
        return self.bark_load.generate_text_semantic(
            text,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=temp,
            silent=silent,
            use_kv_caching=True,
        )

    def _semantic_to_waveform(
            self,
            semantic_tokens: np.ndarray,
            history_prompt: Optional[Union[Dict, str]] = None,
            history_prompt_dir: str = None,
            temp: float = 0.7,
            fine_temp: float = 0.5,
            silent: bool = False,
            output_full: bool = False):
        """Generate audio array from semantic input.

        Args:
            semantic_tokens: semantic token output from `text_to_semantic`
            history_prompt: history choice for audio cloning
            history_prompt_dir: directory containing history-prompt presets
            temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            fine_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            silent: disable progress bar
            output_full: return full generation to be used as a history prompt

        Returns:
            numpy audio array at sample frequency 24khz (preceded by the full
            generation dict when ``output_full`` is True)
        """
        coarse_tokens = self.bark_load.generate_coarse(
            semantic_tokens,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=temp,
            silent=silent,
            use_kv_caching=True,
        )
        fine_tokens = self.bark_load.generate_fine(
            coarse_tokens,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=fine_temp,
        )
        audio_arr = self.bark_load.codec_decode(fine_tokens)
        if output_full:
            full_generation = {
                "semantic_prompt": semantic_tokens,
                "coarse_prompt": coarse_tokens,
                "fine_prompt": fine_tokens,
            }
            return full_generation, audio_arr
        return audio_arr