Spaces:
Running
on
L4
Running
on
L4
File size: 1,604 Bytes
0a3525d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import itertools
import re
# Inclusive (first, last) code-point ranges of the characters that are kept
# for each language tag.
LANGUAGE_UNICODE_RANGE_MAP = {
    "ZH": [(0x4E00, 0x9FFF)],
    "JP": [(0x4E00, 0x9FFF), (0x3040, 0x309F), (0x30A0, 0x30FF), (0x31F0, 0x31FF)],
    "EN": [(0x0000, 0x007F)],
}

# Punctuation normalization table: symbol found in the input -> replacement.
# Every non-ASCII symbol is spelled as a \u escape (with its Unicode name in
# the trailing comment) so the table stays correct even if the file is
# re-saved or displayed with the wrong encoding; mis-decoded literals make
# distinct keys collapse into duplicates and stop matching real input.
SYMBOLS_MAPPING = {
    "\uff1a": ",",  # FULLWIDTH COLON
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    "\uff0c": ",",  # FULLWIDTH COMMA
    "\u3002": ".",  # IDEOGRAPHIC FULL STOP
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    "\n": ".",
    "\u00b7": ",",  # MIDDLE DOT
    "\u3001": ",",  # IDEOGRAPHIC COMMA
    "...": "\u2026",  # three ASCII dots -> HORIZONTAL ELLIPSIS
    "$": ".",
    "\u201c": "'",  # LEFT DOUBLE QUOTATION MARK
    "\u201d": "'",  # RIGHT DOUBLE QUOTATION MARK
    "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
    "\uff08": "'",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": "'",  # FULLWIDTH RIGHT PARENTHESIS
    "(": "'",
    ")": "'",
    "\u300a": "'",  # LEFT DOUBLE ANGLE BRACKET
    "\u300b": "'",  # RIGHT DOUBLE ANGLE BRACKET
    "\u3010": "'",  # LEFT BLACK LENTICULAR BRACKET
    "\u3011": "'",  # RIGHT BLACK LENTICULAR BRACKET
    "[": "'",
    "]": "'",
    "\u2014": "-",  # EM DASH
    "\uff5e": "-",  # FULLWIDTH TILDE
    "~": "-",
    "\u30fb": "-",  # KATAKANA MIDDLE DOT
    "\u300c": "'",  # LEFT CORNER BRACKET
    "\u300d": "'",  # RIGHT CORNER BRACKET
    ";": ",",
    ":": ",",
}

# Single alternation that matches any key of SYMBOLS_MAPPING. Longer keys are
# listed first because regex alternation takes the first alternative that
# matches, not the longest one; sorted() is stable, so keys of equal length
# keep their table order.
REPLACE_SYMBOL_REGEX = re.compile(
    "|".join(
        re.escape(symbol)
        for symbol in sorted(SYMBOLS_MAPPING, key=len, reverse=True)
    )
)

# Flat list of every (first, last) pair from LANGUAGE_UNICODE_RANGE_MAP.
# Despite the name, the values are Unicode code points, not UTF-8 bytes.
# A pair shared by several languages appears more than once, which is
# harmless inside the character class built below.
ALL_KNOWN_UTF8_RANGE = list(
    itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
)

# Negated character class: matches any single character that falls outside
# all of the known ranges, so substituting "" removes such characters.
REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
    "[^"
    + "".join(
        f"{re.escape(chr(start))}-{re.escape(chr(end))}"
        for start, end in ALL_KNOWN_UTF8_RANGE
    )
    + "]"
)
def clean_text(text):
    """Normalize punctuation in *text* and drop characters outside the known ranges.

    Steps, in order:

    1. Strip leading and trailing whitespace.
    2. Rewrite ``<p:...>`` tags to ``<PPP...PPP>`` so the tag's colon is not
       replaced in step 3 (``":"`` is a key of ``SYMBOLS_MAPPING``).
    3. Replace every symbol matched by ``REPLACE_SYMBOL_REGEX`` with its
       ``SYMBOLS_MAPPING`` value.
    4. Delete every character matched by ``REMOVE_UNKNOWN_SYMBOL_REGEX``.
    5. Turn the ``<PPP...PPP>`` placeholders back into ``<p:...>`` tags.

    The text inside a tag is not exempt from steps 3 and 4; only the
    ``p:`` marker itself is protected.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """

    def _mapped_symbol(match):
        # Look up the replacement for whichever symbol was matched.
        return SYMBOLS_MAPPING[match.group()]

    # Shield the tag marker behind a placeholder that contains no mapped symbol.
    shielded = re.sub(r"<p:(.*?)>", r"<PPP\1PPP>", text.strip())

    normalized = REPLACE_SYMBOL_REGEX.sub(_mapped_symbol, shielded)

    # NOTE(review): a replacement value that itself lies outside the known
    # ranges (e.g. the ellipsis substituted for "...") is removed here, so
    # such symbols end up deleted rather than normalized — confirm intended.
    filtered = REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", normalized)

    # Restore the original tag syntax.
    return re.sub(r"<PPP(.*?)PPP>", r"<p:\1>", filtered)
|