# RVC-Speakers — speakers/processors/bark_to_voice.py
# (Hugging Face blob-viewer header removed; original commit 04ffec9,
#  "Add large files to Git LFS" by glide-the, file size 8.95 kB)
from typing import Optional, Union, Dict
from bark.mode_load import BarkModelLoader, SAMPLE_RATE
from vits.modules import commons
from vits.text import text_to_sequence
from torch import LongTensor
from speakers.common.registry import registry
from speakers.processors import BaseProcessor, ProcessorData
import os
import logging
import numpy as np
import nltk # we'll use this to split into sentences
from nltk.tokenize import RegexpTokenizer
# Module-level logger; host applications may swap it out via set_bark_to_voice_logger().
logger = logging.getLogger('bark_to_voice')
def set_bark_to_voice_logger(l: logging.Logger) -> None:
    """Replace the module-level logger (e.g. to route output into a host app's logging)."""
    global logger
    logger = l
# A quarter second of silence at Bark's output sample rate; inserted between synthesized sentences.
silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence
def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol ids plus the cleaned text.

    When ``hps.data.add_blank`` is set, a blank token (0) is interspersed
    between every pair of symbol ids, matching VITS preprocessing.
    """
    sequence, cleaned = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return LongTensor(sequence), cleaned
class BarkProcessorData(ProcessorData):
    """Input payload consumed by the ``bark_to_voice`` processor.

    Attributes:
        text: Text to synthesize.
        speaker_history_prompt: Name/path of the speaker preset ``.npz`` file.
        text_temp: Prompt special-token temperature; the closer to 1.0, the
            more prominent the prompt's special markers become.
        waveform_temp: Temperature used when decoding the hidden (latent)
            representation into audio.
    """
    # Text to synthesize.
    text: str
    # Speaker preset .npz history prompt.
    speaker_history_prompt: str
    # Prompt special-token temperature (toward 1.0 strengthens special markers).
    text_temp: float
    # Latent-to-audio decoding temperature.
    waveform_temp: float

    @property
    def type(self) -> str:
        """Type of the Message, used for serialization."""
        return "BARK"
@registry.register_processor("bark_to_voice")
class BarkToVoice(BaseProcessor):
    """Processor that turns text into speech audio using the Bark model stack.

    The input text is split into sentence-like chunks on CJK punctuation,
    each chunk is synthesized independently (Bark has a per-generation
    length limit), and the pieces are joined with short silences.
    """

    def __init__(self, tokenizer_path: str, text_path: str, coarse_path: str, fine_path: str):
        """Load the Bark models (tokenizer, text, coarse, fine) from the given paths."""
        super().__init__()
        self._load_bark_mode(tokenizer_path=tokenizer_path,
                             text_path=text_path,
                             coarse_path=coarse_path,
                             fine_path=fine_path)

    def __call__(
            self,
            data: BarkProcessorData
    ) -> np.ndarray:
        """Synthesize ``data.text`` and return the concatenated audio array.

        Raises:
            ValueError: if the text is empty after normalization — previously
                this surfaced as an opaque ``np.concatenate`` failure.
        """
        # Normalize line breaks to the CJK full stop, then split into
        # sentence-like chunks on CJK punctuation to fit Bark's limits.
        script = data.text.replace("\n", "。").strip()
        tokenizer = RegexpTokenizer(r'[^,。!?]+[,。!?]?')
        sentences = tokenizer.tokenize(script)
        logger.info("sentences: %s", sentences)
        if not sentences:
            raise ValueError("BarkToVoice received empty text; nothing to synthesize.")
        pieces = []
        for sentence in sentences:
            audio_array = self._generate_audio(text=sentence,
                                               history_prompt_dir=registry.get_path('bark_library_root'),
                                               history_prompt=data.speaker_history_prompt,
                                               text_temp=data.text_temp,
                                               waveform_temp=data.waveform_temp)
            # Separate consecutive sentences with a quarter second of silence.
            pieces += [audio_array, silence.copy()]
        return np.concatenate(pieces)

    @classmethod
    def from_config(cls, cfg=None):
        """Build a BarkToVoice from a config mapping of model-file names.

        All four model paths are resolved relative to the registered
        ``vits_library_root`` path.
        """
        if cfg is None:
            raise RuntimeError("from_config cfg is None.")
        root = registry.get_path("vits_library_root")
        return cls(tokenizer_path=os.path.join(root, cfg.get("tokenizer_path", "")),
                   text_path=os.path.join(root, cfg.get("text_model_path", "")),
                   coarse_path=os.path.join(root, cfg.get("coarse_model_path", "")),
                   fine_path=os.path.join(root, cfg.get("fine_model_path", "")))

    def match(self, data: ProcessorData) -> bool:
        """Return True when this processor should handle ``data``."""
        return "BARK" in data.type

    def _load_bark_mode(self, tokenizer_path: str, text_path: str, coarse_path: str, fine_path: str) -> None:
        """Instantiate the BarkModelLoader on the registry-configured device."""
        logger.info('Bark model loading')
        self.bark_load = BarkModelLoader(tokenizer_path=tokenizer_path,
                                         text_path=text_path,
                                         coarse_path=coarse_path,
                                         fine_path=fine_path,
                                         device=registry.get("device"))
        logger.info('Models loaded bark')

    def _generate_audio(
            self,
            text: str,
            history_prompt: Optional[str] = None,
            history_prompt_dir: Optional[str] = None,
            text_temp: float = 0.7,
            waveform_temp: float = 0.7,
            fine_temp: float = 0.5,
            silent: bool = False,
            output_full: bool = False):
        """Generate an audio array from input text.

        Args:
            text: text to be turned into audio
            history_prompt: history choice for audio cloning
            history_prompt_dir: directory containing the history-prompt presets
            text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            fine_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            silent: disable progress bar
            output_full: return full generation to be used as a history prompt

        Returns:
            numpy audio array at sample frequency 24khz, or a
            ``(full_generation, audio_arr)`` tuple when ``output_full`` is set.
        """
        semantic_tokens = self._text_to_semantic(
            text,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=text_temp,
            silent=silent,
        )
        # _semantic_to_waveform already returns (full_generation, audio_arr)
        # when output_full is set, so no unpack/repack is needed here.
        return self._semantic_to_waveform(
            semantic_tokens,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=waveform_temp,
            fine_temp=fine_temp,
            silent=silent,
            output_full=output_full,
        )

    def _text_to_semantic(
            self,
            text: str,
            history_prompt: Optional[str] = None,
            history_prompt_dir: Optional[str] = None,
            temp: float = 0.7,
            silent: bool = False,
    ):
        """Generate semantic array from text.

        Args:
            text: text to be turned into audio
            history_prompt: history choice for audio cloning
            history_prompt_dir: directory containing the history-prompt presets
            temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            silent: disable progress bar

        Returns:
            numpy semantic array to be fed into `semantic_to_waveform`
        """
        x_semantic = self.bark_load.generate_text_semantic(
            text,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=temp,
            silent=silent,
            use_kv_caching=True
        )
        return x_semantic

    def _semantic_to_waveform(
            self,
            semantic_tokens: np.ndarray,
            history_prompt: Optional[Union[Dict, str]] = None,
            history_prompt_dir: Optional[str] = None,
            temp: float = 0.7,
            fine_temp: float = 0.5,
            silent: bool = False,
            output_full: bool = False,
    ):
        """Generate audio array from semantic input.

        Args:
            semantic_tokens: semantic token output from `text_to_semantic`
            history_prompt: history choice for audio cloning
            history_prompt_dir: directory containing the history-prompt presets
            temp: coarse generation temperature (1.0 more diverse, 0.0 more conservative)
            fine_temp: fine generation temperature (1.0 more diverse, 0.0 more conservative)
            silent: disable progress bar
            output_full: return full generation to be used as a history prompt

        Returns:
            numpy audio array at sample frequency 24khz, or a
            ``(full_generation, audio_arr)`` tuple when ``output_full`` is set.
        """
        coarse_tokens = self.bark_load.generate_coarse(
            semantic_tokens,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=temp,
            silent=silent,
            use_kv_caching=True
        )
        fine_tokens = self.bark_load.generate_fine(
            coarse_tokens,
            history_prompt=history_prompt,
            history_prompt_dir=history_prompt_dir,
            temp=fine_temp,
        )
        audio_arr = self.bark_load.codec_decode(fine_tokens)
        if output_full:
            # Bundle every intermediate token stream so the result can be
            # reused as a history prompt for voice continuation/cloning.
            full_generation = {
                "semantic_prompt": semantic_tokens,
                "coarse_prompt": coarse_tokens,
                "fine_prompt": fine_tokens,
            }
            return full_generation, audio_arr
        return audio_arr