TenzinGayche committed
Commit 4629510
1 Parent(s): 0dc87ee

Create handle.py

Files changed (1)
  1. handle.py +78 -0
handle.py ADDED
@@ -0,0 +1,78 @@
+ from typing import Any, Dict, Tuple
+ import librosa
+ import numpy as np
+ import torch
+ import pyewts
+ import noisereduce as nr
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from num2tib.core import convert, convert2text
+ import re
+
+ converter = pyewts.pyewts()
+
+ def replace_numbers_with_convert(sentence, wylie=True):
+     # Replace every number (integer or decimal) with its spelled-out Tibetan form.
+     pattern = r'\d+(\.\d+)?'
+     def replace(match):
+         return convert(match.group(), wylie)
+     result = re.sub(pattern, replace, sentence)
+     return result
+
+ def cleanup_text(inputs):
+     # Apply the character substitutions defined in `replacements` below.
+     for src, dst in replacements:
+         inputs = inputs.replace(src, dst)
+     return inputs
+
+ # Pre-computed speaker embedding files, keyed by voice name.
+ speaker_embeddings = {
+     "Lhasa(female)": "female_2.npy",
+ }
+
+ # Substitutions applied to the Wylie transliteration before tokenization.
+ replacements = [
+     ('_', '_'),
+     ('*', 'v'),
+     ('`', ';'),
+     ('~', ','),
+     ('+', ','),
+     ('\\', ';'),
+     ('|', ';'),
+     ('╚', ''),
+     ('╗', ''),
+ ]
+
+
+ class EndpointHandler():
+     def __init__(self, path=""):
+         # Load the fine-tuned SpeechT5 model, its processor, and the HiFi-GAN vocoder.
+         self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
+         self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
+         self.model.to('cuda')
+         self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+         self.vocoder.to('cuda')
+
+     def __call__(self, data: Dict[str, Any]) -> Tuple[int, np.ndarray]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 the request payload; the Tibetan input text is expected under the "inputs" key
+         Return:
+             A tuple of (sampling_rate, waveform) holding the synthesized speech
+         """
+         # process the input text: Tibetan Unicode -> Wylie, cleanup, spell out numbers
+         text = data.get("inputs", "")
+         if len(text.strip()) == 0:
+             return (16000, np.zeros(0).astype(np.int16))
+         text = converter.toWylie(text)
+         text = cleanup_text(text)
+         text = replace_numbers_with_convert(text)
+         inputs = self.processor(text=text, return_tensors="pt")
+         # limit input length to the model's maximum text positions
+         input_ids = inputs["input_ids"]
+         input_ids = input_ids[..., :self.model.config.max_text_positions]
+         speaker_embedding = np.load(speaker_embeddings['Lhasa(female)'])
+         speaker_embedding = torch.tensor(speaker_embedding)
+         speech = self.model.generate_speech(input_ids.to('cuda'), speaker_embedding.to('cuda'), vocoder=self.vocoder)
+         speech = nr.reduce_noise(y=speech.cpu().numpy(), sr=16000)
+         return (16000, speech)
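
For context, a minimal local sketch of how this handler could be exercised, assuming a CUDA-capable machine, the female_2.npy speaker embedding in the working directory, and scipy installed for writing the output; the Tibetan sample sentence and the "sample.wav" filename are illustrative assumptions only:

from handle import EndpointHandler
from scipy.io import wavfile

handler = EndpointHandler()

# The payload follows the Inference Endpoints convention: text under "inputs".
sampling_rate, waveform = handler({"inputs": "བཀྲ་ཤིས་བདེ་ལེགས།"})

# Write the synthesized waveform to disk for a quick listen.
wavfile.write("sample.wav", sampling_rate, waveform)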