"""Inference endpoint handler: Tibetan text-to-speech via SpeechT5.

Pipeline: Tibetan Unicode -> Wylie transliteration -> character cleanup ->
digit spell-out -> SpeechT5 + HiFi-GAN synthesis -> noise reduction.
"""

import re
from typing import Any, Dict, Tuple

import librosa  # noqa: F401  (kept: imported by the original file)
import noisereduce as nr
import numpy as np
import pyewts
import torch
from num2tib.core import convert
from num2tib.core import convert2text  # noqa: F401  (kept: imported by the original file)
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Wylie <-> Tibetan Unicode transliteration helper (stateless converter).
converter = pyewts.pyewts()

# Characters the TTS front-end cannot handle, mapped to pronounceable
# equivalents. NOTE(review): the ('_', '_') pair is a no-op in the original
# source; kept byte-for-byte for parity.
replacements = [
    ('_', '_'),
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
    ('╚', ''),
    ('╗', ''),
]

# Known speaker-embedding files, keyed by a human-readable voice name.
# assumes female_2.npy sits in the working directory — TODO confirm deployment layout
speaker_embeddings = {
    "Lhasa(female)": "female_2.npy",
}


def replace_numbers_with_convert(sentence: str, wylie: bool = True) -> str:
    """Replace every integer/decimal number in *sentence* with its
    spelled-out Tibetan form (Wylie transliteration by default) via num2tib.
    """
    pattern = r'\d+(\.\d+)?'

    def replace(match: "re.Match") -> str:
        return convert(match.group(), wylie)

    return re.sub(pattern, replace, sentence)


def cleanup_text(inputs: str) -> str:
    """Apply the module-level ``replacements`` table to *inputs*, left to
    right, and return the cleaned string."""
    for src, dst in replacements:
        inputs = inputs.replace(src, dst)
    return inputs


class EndpointHandler():
    """Hugging Face custom inference handler wrapping the TTS pipeline."""

    def __init__(self, path: str = ""):
        # Load processor, acoustic model and vocoder once at startup.
        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        # FIX (robustness): fall back to CPU when CUDA is unavailable instead
        # of crashing on the original hard-coded 'cuda'.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.vocoder.to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Tuple[int, np.ndarray]:
        """Synthesize speech for the text carried in *data*.

        Args:
            data: request payload; the text is read from ``data["inputs"]``
                (falling back to *data* itself when it is already a string).

        Returns:
            ``(sample_rate, waveform)`` — 16 kHz mono audio as a numpy array;
            an empty int16 array is returned for blank input.
        """
        # FIX: the original body referenced an undefined name `text`
        # (NameError on every request); extract it from the payload first.
        text = data.get("inputs", data) if isinstance(data, dict) else data
        if not isinstance(text, str) or len(text.strip()) == 0:
            return (16000, np.zeros(0).astype(np.int16))

        # Normalise: Tibetan Unicode -> Wylie, strip odd characters,
        # spell out digits.
        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        inputs = self.processor(text=text, return_tensors="pt")
        # Truncate to the model's maximum supported text length.
        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

        speaker_embedding = torch.tensor(np.load(speaker_embeddings["Lhasa(female)"]))

        # FIX: the original passed `vocoder=vocoder.to('cuda')` where the
        # bare name `vocoder` is undefined (NameError); use the instance
        # attribute loaded in __init__.
        speech = self.model.generate_speech(
            input_ids.to(self.device),
            speaker_embedding.to(self.device),
            vocoder=self.vocoder,
        )
        # Light denoising pass on the generated waveform (16 kHz).
        speech = nr.reduce_noise(y=speech.to('cpu'), sr=16000)
        return (16000, speech)