hexgrad committed on
Commit
e673bfc
1 Parent(s): 9386491

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +16 -10
  2. katsu.py +1 -0
app.py CHANGED
@@ -96,19 +96,24 @@ def point_num(num):
96
  a, b = num.group().split('.')
97
  return ' point '.join([a, ' '.join(b)])
98
 
99
- def normalize(text):
100
- # TODO: Custom text normalization rules?
 
 
 
 
 
 
 
 
 
 
101
  text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
102
  text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
103
  text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
104
  text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
105
  text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
106
  text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
107
- text = text.replace(chr(8216), "'").replace(chr(8217), "'")
108
- text = text.replace(chr(8220), '"').replace(chr(8221), '"')
109
- text = re.sub(r'[^\S \n]', ' ', text)
110
- text = re.sub(r' +', ' ', text)
111
- text = re.sub(r'(?<=\n) +(?=\n)', '', text)
112
  text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
113
  text = re.sub(r'(?<=\d),(?=\d)', '', text)
114
  text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
@@ -119,7 +124,7 @@ def normalize(text):
119
  text = re.sub(r"(?<=X')S\b", 's', text)
120
  text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
121
  text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
122
- return parens_to_angles(text).strip()
123
 
124
  phonemizers = dict(
125
  a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
@@ -178,7 +183,7 @@ def resolve_voices(voice, warn=True):
178
  def phonemize(text, voice, norm=True):
179
  lang = resolve_voices(voice)[0][0]
180
  if norm:
181
- text = normalize(text)
182
  ps = phonemizers[lang].phonemize([text])
183
  ps = ps[0] if ps else ''
184
  # TODO: Custom phonemization rules?
@@ -438,9 +443,10 @@ def recursive_split(text, voice):
438
  return recursive_split(a, voice) + recursive_split(b, voice)
439
 
440
  def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
 
441
  if skip_square_brackets:
442
  text = re.sub(r'\[.*?\]', '', text)
443
- texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}', normalize(text))] if newline_split > 0 else [normalize(text)]
444
  segments = [row for t in texts for row in recursive_split(t, voice)]
445
  return [(i, *row) for i, row in enumerate(segments)]
446
 
 
96
  a, b = num.group().split('.')
97
  return ' point '.join([a, ' '.join(b)])
98
 
99
+ def normalize_text(text, lang):
100
+ text = text.replace(chr(8216), "'").replace(chr(8217), "'")
101
+ text = text.replace('«', chr(8220)).replace('»', chr(8221))
102
+ text = text.replace(chr(8220), '"').replace(chr(8221), '"')
103
+ text = parens_to_angles(text)
104
+ for a, b in zip('、。!,:;?', ',.!,:;?'):
105
+ text = text.replace(a, b+' ')
106
+ text = re.sub(r'[^\S \n]', ' ', text)
107
+ text = re.sub(r' +', ' ', text)
108
+ text = re.sub(r'(?<=\n) +(?=\n)', '', text)
109
+ if lang == 'j':
110
+ return text.strip()
111
  text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
112
  text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
113
  text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
114
  text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
115
  text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
116
  text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
 
 
 
 
 
117
  text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
118
  text = re.sub(r'(?<=\d),(?=\d)', '', text)
119
  text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
 
124
  text = re.sub(r"(?<=X')S\b", 's', text)
125
  text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
126
  text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
127
+ return text.strip()
128
 
129
  phonemizers = dict(
130
  a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
 
183
  def phonemize(text, voice, norm=True):
184
  lang = resolve_voices(voice)[0][0]
185
  if norm:
186
+ text = normalize_text(text, lang)
187
  ps = phonemizers[lang].phonemize([text])
188
  ps = ps[0] if ps else ''
189
  # TODO: Custom phonemization rules?
 
443
  return recursive_split(a, voice) + recursive_split(b, voice)
444
 
445
  def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
446
+ lang = resolve_voices(voice)[0][0]
447
  if skip_square_brackets:
448
  text = re.sub(r'\[.*?\]', '', text)
449
+ texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}', normalize_text(text, lang))] if newline_split > 0 else [normalize_text(text, lang)]
450
  segments = [row for t in texts for row in recursive_split(t, voice)]
451
  return [(i, *row) for i, row in enumerate(segments)]
452
 
katsu.py CHANGED
@@ -231,6 +231,7 @@ HEPBURN.update({
231
  '『': '"',
232
  '』': '"',
233
  ':': ':',
 
234
  '(': '(',
235
  ')': ')',
236
  '《': '(',
 
231
  '『': '"',
232
  '』': '"',
233
  ':': ':',
234
+ ';': ';',
235
  '(': '(',
236
  ')': ')',
237
  '《': '(',