TenzinGayche
/

whisper-small-3

Automatic Speech Recognition

Transformers

PyTorch

whisper

Inference Endpoints

Model card Files Files and versions Community

TenzinGayche committed on Sep 28, 2023

Commit

a6157cf

•

1 Parent(s): 31ba694

Update handler.py

Browse files

Files changed (1) hide show

handler.py +79 -21

handler.py CHANGED Viewed

@@ -1,36 +1,94 @@
-from typing import  Dict
-from transformers.pipelines.audio_utils import ffmpeg_read
 import torch
 import pyewts
-from transformers import pipeline
 converter = pyewts.pyewts()
-SAMPLE_RATE = 16000
 class EndpointHandler():
-    """Inference endpoint handler that transcribes audio with the whisper-small-3 pipeline."""
     def __init__(self, path=""):
         # load the model
-        self.pipe = pipeline(model="TenzinGayche/whisper-small-3",chunk_length_s=30,device='cuda')
-    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
-        """Transcribe the audio carried by the request and return the text.
-
         Args:
-            data (:obj:`dict`):
-                request payload; the audio file bytes are taken from the
-                "inputs" key, or from ``data`` itself when that key is absent
-        Return:
-            A :obj:`dict` with a single "text" key holding the pipeline's
-            transcription after it has been passed through ``converter.toUnicode``.
         """
         # process input
-        inputs = data.pop("inputs", data)
-        # decode the audio bytes into a numpy array at SAMPLE_RATE
-        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
-        # run inference pipeline (the array is passed directly; a round-trip
-        # through a torch tensor would hand the pipeline the same data)
-        text = self.pipe(audio_nparray)["text"]
-        # postprocess the prediction
-        result = converter.toUnicode(text)
-        return {"text": result}

+from typing import  Dict, Any,Union
+import librosa
+import numpy as np
 import torch
 import pyewts
+import noisereduce as nr
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from num2tib.core import convert
+from num2tib.core import convert2text
+import base64
+import re
+import requests
 converter = pyewts.pyewts()
+def download_file(url, destination, timeout=60):
+    """Download *url* and write the response body to *destination*.
+
+    Args:
+        url: address fetched with an HTTP GET.
+        destination: local path, opened in binary write mode.
+        timeout: value passed to ``requests.get`` as its timeout so a
+            stalled server cannot hang the caller indefinitely.
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status, so
+            an error page is never saved in place of the real file.
+    """
+    response = requests.get(url, timeout=timeout)
+    # Fail before touching the destination file on 4xx/5xx responses.
+    response.raise_for_status()
+    with open(destination, 'wb') as file:
+        file.write(response.content)
+# Runs at import time: fetches the speaker-embedding file that EndpointHandler
+# later reads from the local path 'female_2.npy' with np.load.
+download_file('https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy', 'female_2.npy')
+def replace_numbers_with_convert(sentence, wylie=True):
+    """Substitute every number in *sentence* with the output of ``convert``.
+
+    A number is a run of digits with an optional fractional part
+    (e.g. ``12`` or ``3.5``). Each matched string is passed to
+    ``convert`` together with the *wylie* flag, and the returned text
+    takes the place of the match.
+
+    Args:
+        sentence: text to scan for numbers.
+        wylie: forwarded unchanged as the second argument of ``convert``
+            (presumably selects Wylie output - confirm against num2tib).
+
+    Returns:
+        The sentence with all matched numbers replaced.
+    """
+    number_re = re.compile(r'\d+(\.\d+)?')
+    return number_re.sub(lambda found: convert(found.group(), wylie), sentence)
+def cleanup_text(inputs):
+    """Apply each (source, target) pair of ``replacements`` to *inputs*.
+
+    The pairs are applied one after another in list order, each as a plain
+    substring replacement over the result of the previous one.
+
+    Args:
+        inputs: text to clean.
+
+    Returns:
+        The text after all replacements have been applied.
+    """
+    cleaned = inputs
+    for pair in replacements:
+        old, new = pair
+        cleaned = cleaned.replace(old, new)
+    return cleaned
+# Maps a speaker label to the local .npy file that holds that speaker's embedding.
+speaker_embeddings = {
+    "Lhasa(female)": "female_2.npy",
+}
+# (source, target) substring pairs that cleanup_text applies in list order.
+# NOTE(review): ('_', '_') replaces a character with itself and so has no
+# effect - confirm what the intended source or target was.
+replacements = [
+    ('_', '_'),
+    ('*', 'v'),
+    ('`', ';'),
+    ('~', ','),
+    ('+', ','),
+    ('\\', ';'),
+    ('|', ';'),
+    ('╚',''),
+    ('╗','')
+]
 class EndpointHandler():
+    """Inference endpoint handler that synthesizes speech from text with SpeechT5."""
+    # Sample rate reported to the client and passed to the noise reducer.
+    SAMPLE_RATE = 16000
     def __init__(self, path=""):
         # load the model
+        # Fall back to CPU so the handler still loads on hosts without a GPU.
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
+        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
+        self.model.to(self.device)
+        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+        # Move the vocoder once here instead of on every request.
+        self.vocoder.to(self.device)
+        # The speaker embedding never changes between requests, so read the
+        # .npy file a single time rather than on every call.
+        speaker_embedding = np.load(speaker_embeddings['Lhasa(female)'])
+        self.speaker_embedding = torch.tensor(speaker_embedding).to(self.device)
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
+        """Synthesize speech for the text carried by the request.
+
+        The text is converted with ``converter.toWylie``, cleaned with
+        ``cleanup_text``, has its numbers rewritten by
+        ``replace_numbers_with_convert``, is tokenized, truncated to the
+        model's ``max_text_positions`` and fed to ``generate_speech``. The
+        resulting waveform is passed through ``nr.reduce_noise``.
+
         Args:
+            data (Dict[str, Any]): request payload; the text is taken from
+                the "inputs" key, or from ``data`` itself when that key is
+                absent.
+        Returns:
+            Dict[str, Union[int, str]]: "sample_rate" (always 16000) and
+            "audio", the base64-encoded raw bytes of the noise-reduced
+            waveform array. "audio" is an empty string when the input text
+            is blank. NOTE(review): the sample dtype is whatever
+            ``nr.reduce_noise`` returns (presumably float32) - confirm
+            before decoding on the client.
         """
+        text = data.pop("inputs",data)
         # process input
+        if not text.strip():
+            # Nothing to synthesize: answer with the same dict shape as the
+            # normal path, carrying zero bytes of audio.
+            return {
+                "sample_rate": self.SAMPLE_RATE,
+                "audio": "",
+            }
+        text = converter.toWylie(text)
+        text = cleanup_text(text)
+        text = replace_numbers_with_convert(text)
+        inputs = self.processor(text=text, return_tensors="pt")
+        # limit input length
+        input_ids = inputs["input_ids"]
+        input_ids = input_ids[..., :self.model.config.max_text_positions]
+        speech = self.model.generate_speech(input_ids.to(self.device), self.speaker_embedding, vocoder=self.vocoder)
+        # Hand the noise reducer a plain numpy array rather than a torch tensor.
+        speech = nr.reduce_noise(y=speech.detach().cpu().numpy(), sr=self.SAMPLE_RATE)
+        return {
+            "sample_rate": self.SAMPLE_RATE,
+            # tobytes() is the supported replacement for the deprecated tostring().
+            "audio": base64.b64encode(speech.tobytes()).decode("utf-8"),
+        }