Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,13 +12,13 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5Hif
|
|
12 |
import requests
|
13 |
from requests.exceptions import Timeout
|
14 |
|
15 |
-
checkpoint = "
|
16 |
processor = SpeechT5Processor.from_pretrained(checkpoint)
|
17 |
-
model = SpeechT5ForTextToSpeech.from_pretrained(
|
18 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
19 |
|
20 |
speaker_embeddings = {
|
21 |
-
"BDL": "cmu_us_bdl_arctic-wav-
|
22 |
}
|
23 |
|
24 |
def translate_text(text):
|
@@ -69,95 +69,11 @@ def process_text(text: str) -> str:
|
|
69 |
processed_text = ' '.join(words)
|
70 |
return processed_text
|
71 |
|
72 |
-
replacements = [
|
73 |
-
("՚", "?"),
|
74 |
-
('՛', ""),
|
75 |
-
('՝', ""),
|
76 |
-
("«", "\""),
|
77 |
-
("»", "\""),
|
78 |
-
("՞", "?"),
|
79 |
-
("ա", "a"),
|
80 |
-
("բ", "b"),
|
81 |
-
("գ", "g"),
|
82 |
-
("դ", "d"),
|
83 |
-
("զ", "z"),
|
84 |
-
("է", "e"),
|
85 |
-
("ը", "e'"),
|
86 |
-
("թ", "t'"),
|
87 |
-
("ժ", "jh"),
|
88 |
-
("ի", "i"),
|
89 |
-
("լ", "l"),
|
90 |
-
("խ", "kh"),
|
91 |
-
("ծ", "ts"),
|
92 |
-
("կ", "k"),
|
93 |
-
("հ", "h"),
|
94 |
-
("ձ", "dz"),
|
95 |
-
("ղ", "gh"),
|
96 |
-
("ճ", "ch"),
|
97 |
-
("մ", "m"),
|
98 |
-
("յ", "y"),
|
99 |
-
("ն", "n"),
|
100 |
-
("շ", "sh"),
|
101 |
-
("չ", "ch'"),
|
102 |
-
("պ", "p"),
|
103 |
-
("ջ", "j"),
|
104 |
-
("ռ", "r"),
|
105 |
-
("ս", "s"),
|
106 |
-
("վ", "v"),
|
107 |
-
("տ", "t"),
|
108 |
-
("ր", "r"),
|
109 |
-
("ց", "ts'"),
|
110 |
-
("ւ", ""),
|
111 |
-
("փ", "p'"),
|
112 |
-
("ք", "k'"),
|
113 |
-
("և", "yev"),
|
114 |
-
("օ", "o"),
|
115 |
-
("ֆ", "f"),
|
116 |
-
('։', "."),
|
117 |
-
('–', "-"),
|
118 |
-
('†', "e'"),
|
119 |
-
]
|
120 |
-
|
121 |
-
|
122 |
-
def cleanup_text(text):
|
123 |
-
|
124 |
-
translator = str.maketrans("", "", string.punctuation)
|
125 |
-
|
126 |
-
text = text.translate(translator).lower()
|
127 |
-
text = text.lower()
|
128 |
-
|
129 |
-
normalized_text = text
|
130 |
-
|
131 |
-
normalized_text = normalized_text.replace("ու", "u")
|
132 |
-
normalized_text = normalized_text.replace("եւ", "yev")
|
133 |
-
normalized_text = normalized_text.replace("եվ", "yev")
|
134 |
-
|
135 |
-
# Handle 'ո' at the beginning of a word
|
136 |
-
normalized_text = normalized_text.replace(" ո", " vo")
|
137 |
-
|
138 |
-
# Handle 'ո' in the middle of a word
|
139 |
-
normalized_text = normalized_text.replace("ո", "o")
|
140 |
-
|
141 |
-
# Handle 'ե' at the beginning of a word
|
142 |
-
normalized_text = normalized_text.replace(" ե", " ye")
|
143 |
-
|
144 |
-
# Handle 'ե' in the middle of a word
|
145 |
-
normalized_text = normalized_text.replace("ե", "e")
|
146 |
-
|
147 |
-
# Apply other replacements
|
148 |
-
for src, dst in replacements:
|
149 |
-
normalized_text = normalized_text.replace(src, dst)
|
150 |
-
|
151 |
-
inputs = normalized_text
|
152 |
-
return inputs
|
153 |
-
|
154 |
def predict(text, speaker):
|
155 |
if len(text.strip()) == 0:
|
156 |
return (16000, np.zeros(0).astype(np.int16))
|
157 |
|
158 |
text = process_text(text)
|
159 |
-
|
160 |
-
text = cleanup_text(text)
|
161 |
|
162 |
inputs = processor(text=text, return_tensors="pt")
|
163 |
|
|
|
12 |
import requests
|
13 |
from requests.exceptions import Timeout
|
14 |
|
15 |
+
checkpoint = "Edmon02/TTS_NB_2"
|
16 |
processor = SpeechT5Processor.from_pretrained(checkpoint)
|
17 |
+
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
|
18 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
19 |
|
20 |
speaker_embeddings = {
|
21 |
+
"BDL": "cmu_us_bdl_arctic-wav-arctic_a0004.npy",
|
22 |
}
|
23 |
|
24 |
def translate_text(text):
|
|
|
69 |
processed_text = ' '.join(words)
|
70 |
return processed_text
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
def predict(text, speaker):
|
73 |
if len(text.strip()) == 0:
|
74 |
return (16000, np.zeros(0).astype(np.int16))
|
75 |
|
76 |
text = process_text(text)
|
|
|
|
|
77 |
|
78 |
inputs = processor(text=text, return_tensors="pt")
|
79 |
|