"""Inference-endpoint handler that turns Tibetan text into speech with SpeechT5.

The incoming text is transliterated to Wylie, cleaned, has its numbers spelled
out, and is then synthesized, denoised and returned as a base64-encoded WAV.
"""

import base64
import io
import os
import re
import tempfile
from typing import Dict, Any, Union

import noisereduce as nr
import numpy as np
import pyewts
import requests
import soundfile as sf
import torch
from num2tib.core import convert
from num2tib.core import convert2text
from pydub import AudioSegment
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Sample rate (Hz) used for noise reduction, WAV encoding and the response.
SAMPLE_RATE = 16000

# Matches integers and simple decimals such as "12" or "3.14".
_NUMBER_PATTERN = re.compile(r'\d+(\.\d+)?')

converter = pyewts.pyewts()


def download_file(url, destination, timeout=60):
    """Download ``url`` and store the response body at ``destination``.

    Args:
        url: Address of the file to fetch.
        destination: Local path the payload is written to (overwritten).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved in place of the requested file.
        requests.RequestException: On connection problems or timeouts.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(destination, 'wb') as file:
            # Stream in chunks so the payload is never held in memory at once.
            for chunk in response.iter_content(chunk_size=1 << 16):
                file.write(chunk)


# The speaker embedding referenced in ``speaker_embeddings`` is fetched at
# import time; EndpointHandler loads it from the working directory.
download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')


def replace_numbers_with_convert(sentence, wylie=True):
    """Return ``sentence`` with every number replaced by ``convert``'s output.

    Args:
        sentence: Text that may contain integers or decimals.
        wylie: Forwarded unchanged as the second argument of ``convert``.
    """
    def replace(match):
        return convert(match.group(), wylie)

    return _NUMBER_PATTERN.sub(replace, sentence)


speaker_embeddings = {
    "Lhasa(female)": "female_2.npy",
}

# (source, destination) substitutions applied in order by ``cleanup_text``.
# NOTE: the first pair maps '_' to itself and is therefore a no-op.
replacements = [
    ('_', '_'),
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
    ('╚', ''),
    ('╗', ''),
]


def cleanup_text(inputs):
    """Apply every substitution in ``replacements`` to ``inputs``, in order."""
    for src, dst in replacements:
        inputs = inputs.replace(src, dst)
    return inputs


class EndpointHandler:
    """Callable handler that loads the TTS models once and serves requests."""

    def __init__(self, path=""):
        """Load processor, acoustic model, vocoder and speaker embedding.

        Args:
            path: Accepted for interface compatibility; not used.
        """
        # Prefer the GPU but stay usable on CPU-only machines.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model.to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.vocoder.to(self.device)

        # Load the speaker embedding once instead of on every request.
        embedding = np.load(speaker_embeddings['Lhasa(female)'])
        self.speaker_embedding = torch.tensor(embedding).to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
        """Synthesize speech for the text found under ``data["inputs"]``.

        Args:
            data: Request payload. The ``"inputs"`` key is popped from it and
                must hold the text to speak.

        Returns:
            A dict with ``"sample_rate"``, ``"audio_base64"`` (a base64-encoded
            24-bit PCM WAV file), ``"model"`` and ``"model_version"``. Blank
            input yields the same structure with a zero-length waveform.

        Raises:
            TypeError: If the payload does not provide the text as a string.
        """
        text = data.pop("inputs", data)
        if not isinstance(text, str):
            raise TypeError('"inputs" must be a string containing the text to synthesize')

        if text.strip():
            speech = self._synthesize(text)
        else:
            # Nothing to say: respond with valid but empty audio so the
            # response shape does not depend on the input.
            speech = np.zeros(0, dtype=np.float32)

        return {
            "sample_rate": SAMPLE_RATE,
            "audio_base64": self._encode_wav(speech),  # Base64-encoded audio data
            "model": "openpecha/speecht5-tts-01",
            "model_version": "1"
        }

    def _synthesize(self, text):
        """Normalize ``text`` and return the denoised waveform as an array."""
        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        inputs = self.processor(text=text, return_tensors="pt")
        # Truncate to the longest sequence the model is configured for.
        input_ids = inputs["input_ids"][..., :self.model.config.max_text_positions]

        with torch.no_grad():
            speech = self.model.generate_speech(
                input_ids.to(self.device),
                self.speaker_embedding,
                vocoder=self.vocoder,
            )
        return nr.reduce_noise(y=speech.cpu().numpy(), sr=SAMPLE_RATE)

    @staticmethod
    def _encode_wav(speech):
        """Encode ``speech`` as a 24-bit PCM WAV file and return it as base64."""
        # Encoding in memory avoids a temporary file that could be leaked when
        # writing or reading fails.
        buffer = io.BytesIO()
        sf.write(buffer, speech, SAMPLE_RATE, subtype='PCM_24', format='WAV')
        return base64.b64encode(buffer.getvalue()).decode("utf-8")