Spaces:
Runtime error
Runtime error
File size: 5,542 Bytes
5c60553 79309e0 5c60553 45bafcb 5c60553 79309e0 5c60553 79309e0 5c60553 79309e0 5c60553 79309e0 5c60553 45bafcb 5c60553 79309e0 5c60553 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import re
import torch
import requests
import torchaudio
import numpy as np
# from src.reduce_noise import smooth_and_reduce_noise, model_remove_noise, model, df_state
import io
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from pydub import AudioSegment
import re
from uroman import uroman
# from src.pynote_speaker_embedding import create_speaker_embedding
from src.speechbrain_speaker_embedding import create_speaker_embedding
from datasets import load_dataset
# Pre-computed speaker x-vector embeddings (SpeechBrain) for Vietnamese
# speakers, pinned to a fixed revision for reproducibility.
# NOTE(review): download_mode="force_redownload" re-fetches the dataset on
# every import — presumably deliberate for this Space, but worth confirming.
dataset = load_dataset("truong-xuan-linh/vi-xvector-speechbrain",
                       download_mode="force_redownload",
                       verification_mode="no_checks",
                       cache_dir="temp/",
                       revision="5ea5e4345258333cbc6d1dd2544f6c658e66a634")
dataset = dataset["train"].to_list()
# Index the embeddings by speaker_id for O(1) lookup in Model.inference.
dataset_dict = {}
for rc in dataset:
    dataset_dict[rc["speaker_id"]] = rc["embedding"]
# HiFi-GAN vocoder shared by every Model instance (spectrogram -> waveform).
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
def remove_special_characters(sentence):
    """Replace disallowed characters in *sentence* for TTS input.

    Keeps ASCII letters, whitespace, commas, periods, and the Latin
    extended range U+00C0-U+1EF9 (covers Vietnamese diacritics).

    NOTE(review): every disallowed character is replaced with " ,"
    (space + comma), not a plain space — presumably intentional, to
    insert a pause at stripped symbols; confirm with the author.
    """
    disallowed = r'[^a-zA-Z\s,.\u00C0-\u1EF9]'
    return re.sub(disallowed, ' ,', sentence)
from scipy.signal import butter, lfilter
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Args:
        lowcut: Low cutoff frequency in Hz.
        highcut: High cutoff frequency in Hz.
        fs: Sampling rate in Hz.
        order: Filter order (default 5).

    Returns:
        Tuple (b, a) of numerator/denominator filter coefficients.
    """
    nyquist = 0.5 * fs
    normalized_band = [lowcut / nyquist, highcut / nyquist]
    return butter(order, normalized_band, btype='band')
def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter *data* with a Butterworth design.

    Args:
        data: Input signal (array-like).
        lowcut: Low cutoff frequency in Hz.
        highcut: High cutoff frequency in Hz.
        fs: Sampling rate in Hz.
        order: Filter order (default 5).

    Returns:
        The filtered signal (same length as *data*), via a causal
        zero-initial-state `lfilter` pass.
    """
    nyquist = 0.5 * fs
    coeff_b, coeff_a = butter(order, [lowcut / nyquist, highcut / nyquist],
                              btype='band')
    return lfilter(coeff_b, coeff_a, data)
def korean_splitter(string):
    """Return every maximal run of Hangul syllables found in *string*."""
    return re.findall('[가-힣]+', string)
def uroman_normalization(string):
    """Romanize any Hangul runs in *string*; other text is left untouched.

    Each maximal Hangul run found by `korean_splitter` is replaced in
    place with its `uroman` romanization.
    """
    for hangul_run in korean_splitter(string):
        string = string.replace(hangul_run, uroman(hangul_run))
    return string
class Model():
    """Vietnamese SpeechT5 text-to-speech wrapper.

    Loads a SpeechT5 checkpoint with its processor and resolves a speaker
    x-vector embedding either from a reference WAV at ``speaker_url`` or,
    for "voiceclone" checkpoints, from the module-level ``dataset_dict``
    lookup at inference time.
    """

    def __init__(self, model_name, speaker_url=""):
        """Load model/processor and prepare the speaker embedding.

        Args:
            model_name: Hugging Face checkpoint id for SpeechT5ForTextToSpeech.
            speaker_url: Optional URL of a reference WAV; when given, its
                x-vector embedding is extracted once and reused for all calls.
        """
        self.model_name = model_name
        self.processor = SpeechT5Processor.from_pretrained(model_name)
        self.model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
        self.model.eval()
        self.speaker_url = speaker_url
        if speaker_url:
            print("download speaker_url")
            # NOTE(review): no timeout — a hung server blocks construction;
            # consider requests.get(speaker_url, timeout=...).
            response = requests.get(speaker_url)
            audio_stream = io.BytesIO(response.content)
            # Normalize the reference audio to mono / 16 kHz / 16-bit PCM
            # before computing the x-vector embedding.
            audio_segment = AudioSegment.from_file(audio_stream, format="wav")
            audio_segment = audio_segment.set_channels(1)
            audio_segment = audio_segment.set_frame_rate(16000)
            audio_segment = audio_segment.set_sample_width(2)
            wavform, _ = torchaudio.load(audio_segment.export())
            self.speaker_embeddings = create_speaker_embedding(wavform)[0]
        else:
            self.speaker_embeddings = None
        # These checkpoints use a null (all-zero) speaker vector regardless
        # of any reference audio.
        if model_name in ("truong-xuan-linh/speecht5-vietnamese-commonvoice",
                          "truong-xuan-linh/speecht5-irmvivoice"):
            self.speaker_embeddings = torch.zeros((1, 512))

    def inference(self, text, speaker_id=None):
        """Synthesize *text* and return the waveform as a 1-D numpy array.

        Text is romanized (Hangul), stripped of special characters, split
        into clauses on ``; . ! ? \\n``, and each clause is synthesized
        separately; the clause waveforms are concatenated.

        Args:
            text: Input text to synthesize.
            speaker_id: Key into the module-level ``dataset_dict`` x-vector
                table; used only by "voiceclone" models when no speaker_url
                was provided at construction.

        Returns:
            numpy.ndarray of audio samples.

        Raises:
            ValueError: if no clause produced audio (e.g. empty input),
                since ``np.concatenate`` rejects an empty list.
        """
        if "voiceclone" in self.model_name:
            if not self.speaker_url:
                self.speaker_embeddings = torch.tensor(dataset_dict[speaker_id])
        with torch.no_grad():
            full_speech = []
            separators = r";|\.|!|\?|\n"
            text = uroman_normalization(text)
            text = remove_special_characters(text)
            # "▁" is SpeechT5's word-boundary marker; spaces must be mapped
            # to it before tokenization.
            text = text.replace(" ", "▁")
            for split_text in re.split(separators, text):
                if split_text != "▁":
                    split_text = split_text.lower() + "▁"
                    print(split_text)
                    inputs = self.processor.tokenizer(text=split_text,
                                                      return_tensors="pt")
                    speech = self.model.generate_speech(
                        inputs["input_ids"],
                        threshold=0.5,
                        speaker_embeddings=self.speaker_embeddings,
                        vocoder=vocoder)
                    full_speech.append(speech.numpy())
            return np.concatenate(full_speech)

    @staticmethod
    def moving_average(data, window_size):
        """Smooth *data* with a same-length box filter of *window_size*."""
        return np.convolve(data, np.ones(window_size) / window_size,
                           mode='same')
# woman: VIVOSSPK26, VIVOSSPK02, VIVOSSPK40
# man: VIVOSSPK28, VIVOSSPK36, VIVOSDEV09, VIVOSSPK33, VIVOSSPK23
|