|
import re |
|
|
|
def japanese_cleaners(text):
    """Romanize *text* and guarantee the result ends with a terminator.

    The input is passed through ``japanese_to_romaji_with_accent``. A ``'.'``
    is appended when the converted string is empty or when its final
    character is an ASCII letter; otherwise the converted string is returned
    unchanged.
    """
    # Function-scope import: the converter module is only resolved when this
    # cleaner is actually called.
    from text.japanese import japanese_to_romaji_with_accent

    romaji = japanese_to_romaji_with_accent(text)

    # Non-empty and already ending in something other than an ASCII letter:
    # nothing to add.
    if romaji and not re.match('[A-Za-z]', romaji[-1]):
        return romaji

    return romaji + '.'
|
|
|
|
|
def japanese_cleaners2(text):
    """Run ``japanese_cleaners`` with extra symbol normalization around it.

    Before cleaning, ``'・・・'`` becomes ``'…'`` and any remaining ``'・'``
    becomes a space. After cleaning, ``'ts'`` is rewritten to ``'ʦ'``,
    ``'...'`` to ``'…'``, round/square/curly brackets are removed, and ``'*'``
    is turned into a space.
    """
    # The triple middle dot must be handled first, otherwise the single-dot
    # rule would consume it as three separate spaces.
    prepared = text.replace('・・・', '…')
    prepared = prepared.replace('・', ' ')

    cleaned = japanese_cleaners(prepared)

    # Applied strictly top to bottom, one full pass per entry.
    substitutions = (
        ('ts', 'ʦ'),
        ('...', '…'),
        ('(', ''),
        (')', ''),
        ('[', ''),
        (']', ''),
        ('*', ' '),
        ('{', ''),
        ('}', ''),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    return cleaned
|
|
|
|
|
def ko2kata(text):
    """Return *text* unchanged.

    NOTE(review): the name suggests a Korean-to-katakana conversion, but the
    body is an identity pass-through -- presumably a placeholder; confirm.
    """
    return text
|
|
|
def en2kata(text):
    """Return *text* unchanged.

    NOTE(review): the name suggests an English-to-katakana conversion, but the
    body is an identity pass-through -- presumably a placeholder; confirm.
    """
    return text
|
|
|
|
|
|
|
def jke_cleaners(text):
    """Clean text containing ``[JA]``, ``[KO]`` and ``[EN]`` tagged segments.

    Each ``[XX]...[XX]`` segment is replaced by its inner text followed by a
    single space. Korean segments are passed through ``ko2kata`` and English
    segments through ``en2kata``; Japanese segments are used as-is. The
    result is then run through ``japanese_cleaners2``, its last character is
    dropped, and a ``'.'`` is appended unless the remaining text already ends
    with one of ``. , ! ? - … ~``.

    An input that leaves nothing after the last character is dropped yields
    ``'.'`` instead of raising ``IndexError``.
    """
    # (pattern, converter) per language; None means "keep inner text as-is".
    tag_rules = (
        (r'\[JA\].*?\[JA\]', None),
        (r'\[KO\].*?\[KO\]', ko2kata),
        (r'\[EN\].*?\[EN\]', en2kata),
    )

    # Scan for every tag kind on the ORIGINAL input before touching it, so
    # that unwrapping one language cannot change what matches for another.
    found = [(re.findall(pattern, text), convert)
             for pattern, convert in tag_rules]

    for segments, convert in found:
        for segment in segments:
            # Both the opening and the closing tag are 4 characters long.
            inner = segment[4:-4]
            if convert is not None:
                inner = convert(inner)
            # Replace only the first occurrence: one replacement per match.
            text = text.replace(segment, inner + ' ', 1)

    text = japanese_cleaners2(text)

    # NOTE(review): presumably this drops a trailing character introduced by
    # the steps above (e.g. the space added after the last segment) --
    # confirm against the converter's output.
    text = text[:-1]

    # Guard the empty case: indexing text[-1] on '' would raise IndexError.
    if not text or re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
|
|
|
|
|
|