Spaces:
Running
on
Zero
Running
on
Zero
Upload 2 files
Browse files
app.py
CHANGED
@@ -96,19 +96,24 @@ def point_num(num):
|
|
96 |
a, b = num.group().split('.')
|
97 |
return ' point '.join([a, ' '.join(b)])
|
98 |
|
99 |
-
def
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
|
102 |
text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
|
103 |
text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
|
104 |
text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
|
105 |
text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
|
106 |
text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
|
107 |
-
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
|
108 |
-
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
|
109 |
-
text = re.sub(r'[^\S \n]', ' ', text)
|
110 |
-
text = re.sub(r' +', ' ', text)
|
111 |
-
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
|
112 |
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
|
113 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
|
114 |
text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
|
@@ -119,7 +124,7 @@ def normalize(text):
|
|
119 |
text = re.sub(r"(?<=X')S\b", 's', text)
|
120 |
text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
|
121 |
text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
|
122 |
-
return
|
123 |
|
124 |
phonemizers = dict(
|
125 |
a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
|
@@ -178,7 +183,7 @@ def resolve_voices(voice, warn=True):
|
|
178 |
def phonemize(text, voice, norm=True):
|
179 |
lang = resolve_voices(voice)[0][0]
|
180 |
if norm:
|
181 |
-
text =
|
182 |
ps = phonemizers[lang].phonemize([text])
|
183 |
ps = ps[0] if ps else ''
|
184 |
# TODO: Custom phonemization rules?
|
@@ -438,9 +443,10 @@ def recursive_split(text, voice):
|
|
438 |
return recursive_split(a, voice) + recursive_split(b, voice)
|
439 |
|
440 |
def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
|
|
|
441 |
if skip_square_brackets:
|
442 |
text = re.sub(r'\[.*?\]', '', text)
|
443 |
-
texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}',
|
444 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
445 |
return [(i, *row) for i, row in enumerate(segments)]
|
446 |
|
|
|
96 |
a, b = num.group().split('.')
|
97 |
return ' point '.join([a, ' '.join(b)])
|
98 |
|
99 |
+
def normalize_text(text, lang):
|
100 |
+
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
|
101 |
+
text = text.replace('«', chr(8220)).replace('»', chr(8221))
|
102 |
+
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
|
103 |
+
text = parens_to_angles(text)
|
104 |
+
for a, b in zip('、。！，：；？', ',.!,:;?'):
|
105 |
+
text = text.replace(a, b+' ')
|
106 |
+
text = re.sub(r'[^\S \n]', ' ', text)
|
107 |
+
text = re.sub(r' +', ' ', text)
|
108 |
+
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
|
109 |
+
if lang == 'j':
|
110 |
+
return text.strip()
|
111 |
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
|
112 |
text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
|
113 |
text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
|
114 |
text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
|
115 |
text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
|
116 |
text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
|
|
|
|
|
|
|
|
|
|
|
117 |
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
|
118 |
text = re.sub(r'(?<=\d),(?=\d)', '', text)
|
119 |
text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
|
|
|
124 |
text = re.sub(r"(?<=X')S\b", 's', text)
|
125 |
text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
|
126 |
text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
|
127 |
+
return text.strip()
|
128 |
|
129 |
phonemizers = dict(
|
130 |
a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
|
|
|
183 |
def phonemize(text, voice, norm=True):
|
184 |
lang = resolve_voices(voice)[0][0]
|
185 |
if norm:
|
186 |
+
text = normalize_text(text, lang)
|
187 |
ps = phonemizers[lang].phonemize([text])
|
188 |
ps = ps[0] if ps else ''
|
189 |
# TODO: Custom phonemization rules?
|
|
|
443 |
return recursive_split(a, voice) + recursive_split(b, voice)
|
444 |
|
445 |
def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
|
446 |
+
lang = resolve_voices(voice)[0][0]
|
447 |
if skip_square_brackets:
|
448 |
text = re.sub(r'\[.*?\]', '', text)
|
449 |
+
texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}', normalize_text(text, lang))] if newline_split > 0 else [normalize_text(text, lang)]
|
450 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
451 |
return [(i, *row) for i, row in enumerate(segments)]
|
452 |
|
katsu.py
CHANGED
@@ -231,6 +231,7 @@ HEPBURN.update({
|
|
231 |
'『': '"',
|
232 |
'』': '"',
|
233 |
'：': ':',
|
|
|
234 |
'（': '(',
|
235 |
'）': ')',
|
236 |
'《': '(',
|
|
|
231 |
'『': '"',
|
232 |
'』': '"',
|
233 |
'：': ':',
|
234 |
+
'；': ';',
|
235 |
'（': '(',
|
236 |
'）': ')',
|
237 |
'《': '(',
|