horiyouta's picture
202406210928
fd6a905
import re
import inflect
# Single shared inflect engine; the number-expansion helpers in this module
# call its number_to_words() to spell digits out as English words.
__INFLECT = inflect.engine()
# Digit runs that contain thousands separators, e.g. "1,234,567".
__COMMA_NUMBER_PATTERN = re.compile(r"([0-9][0-9\,]+[0-9])")
# Decimal numbers with digits on both sides of the point, e.g. "3.14".
__DECIMAL_NUMBER_PATTERN = re.compile(r"([0-9]+\.[0-9]+)")
# Pound-sterling amounts. The POUND SIGN is spelled as a \u00a3 escape so the
# pattern cannot be corrupted when the file is re-encoded (the literal had
# previously turned into mojibake and no longer matched real input).
__POUNDS_PATTERN = re.compile(r"\u00a3([0-9\,]*[0-9]+)")
# Dollar amounts; the captured group may contain "." and ",".
__DOLLARS_PATTERN = re.compile(r"\$([0-9\.\,]*[0-9]+)")
# Digits immediately followed by an English ordinal suffix, e.g. "21st".
__ORDINAL_PATTERN = re.compile(r"[0-9]+(st|nd|rd|th)")
# Any remaining run of digits.
__NUMBER_PATTERN = re.compile(r"[0-9]+")
def normalize_text(text: str) -> str:
    """Normalize *text* in three passes and return the result.

    1. Numeric expressions are rewritten by ``__normalize_numbers``.
    2. Punctuation is folded by ``replace_punctuation``.
    3. A space is inserted after ``,`` ``;`` ``.`` ``?`` or ``!`` whenever
       the mark is immediately followed by a word character.
    """
    with_numbers_expanded = __normalize_numbers(text)
    with_punctuation_folded = replace_punctuation(with_numbers_expanded)
    # Runs last so that marks produced by the punctuation pass are spaced too.
    return re.sub(r"([,;.\?\!])([\w])", r"\1 \2", with_punctuation_folded)
# Punctuation folding table used by replace_punctuation().
#
# Every non-ASCII key is written as a \u escape (with its Unicode name in the
# trailing comment) so the table cannot be damaged by a wrong file encoding;
# several keys had previously degraded into mojibake and matched nothing.
_PUNCTUATION_MAP = {
    # Clause separators -> ","
    ":": ",",
    "\uff1a": ",",  # FULLWIDTH COLON
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    "\uff0c": ",",  # FULLWIDTH COMMA
    "\u3001": ",",  # IDEOGRAPHIC COMMA
    "\u00b7": ",",  # MIDDLE DOT
    "\u30fb": ",",  # KATAKANA MIDDLE DOT
    # Sentence terminators
    "\u3002": ".",  # IDEOGRAPHIC FULL STOP
    "\uff0e": ".",  # FULLWIDTH FULL STOP
    "\n": ".",
    "$": ".",
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    # Ellipses -> "..."
    "\u2026": "...",  # HORIZONTAL ELLIPSIS
    "\u00b7\u00b7\u00b7": "...",  # three MIDDLE DOTs
    "\u30fb\u30fb\u30fb": "...",  # three KATAKANA MIDDLE DOTs
    # Quotation marks and brackets -> "'"
    "\u201c": "'",  # LEFT DOUBLE QUOTATION MARK
    "\u201d": "'",  # RIGHT DOUBLE QUOTATION MARK
    '"': "'",
    "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
    "(": "'",
    ")": "'",
    "\uff08": "'",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": "'",  # FULLWIDTH RIGHT PARENTHESIS
    "\u300a": "'",  # LEFT DOUBLE ANGLE BRACKET
    "\u300b": "'",  # RIGHT DOUBLE ANGLE BRACKET
    "\u3010": "'",  # LEFT BLACK LENTICULAR BRACKET
    "\u3011": "'",  # RIGHT BLACK LENTICULAR BRACKET
    "[": "'",
    "]": "'",
    "\u300c": "'",  # LEFT CORNER BRACKET
    "\u300d": "'",  # RIGHT CORNER BRACKET
    # Dashes and tildes -> "-"
    "\u2014": "-",  # EM DASH
    "\u2212": "-",  # MINUS SIGN
    "~": "-",
    "\uff5e": "-",  # FULLWIDTH TILDE
}

# Compiled once at import time. Longer keys are placed first so a three-dot
# sequence is matched as an ellipsis before its single-dot key can match.
_PUNCTUATION_PATTERN = re.compile(
    "|".join(
        re.escape(key)
        for key in sorted(_PUNCTUATION_MAP, key=len, reverse=True)
    )
)


def replace_punctuation(text: str) -> str:
    """Fold punctuation variants in *text* down to a small ASCII set.

    Each occurrence of a key of ``_PUNCTUATION_MAP`` is replaced by its
    mapped value (one of ``,`` ``.`` ``!`` ``?`` ``...`` ``'`` ``-``); all
    other characters are returned unchanged.

    Args:
        text: The string to process. An empty string is returned as is.

    Returns:
        A new string with the replacements applied.
    """
    return _PUNCTUATION_PATTERN.sub(
        lambda hit: _PUNCTUATION_MAP[hit.group()], text
    )
def __normalize_numbers(text: str) -> str:
    """Apply the module's number substitutions to *text*, in a fixed order.

    Each step runs over the output of the previous one:
    thousands separators are stripped, pound amounts get a " pounds"
    suffix, then dollar amounts, decimals, ordinals and finally any
    remaining digit runs are handed to their expansion helpers.
    """
    # (pattern, replacement) pairs; the generic digit pattern must stay last
    # because it matches every digit run the earlier steps left behind.
    substitutions = (
        (__COMMA_NUMBER_PATTERN, __remove_commas),
        (__POUNDS_PATTERN, r"\1 pounds"),
        (__DOLLARS_PATTERN, __expand_dollars),
        (__DECIMAL_NUMBER_PATTERN, __expand_decimal_point),
        (__ORDINAL_PATTERN, __expand_ordinal),
        (__NUMBER_PATTERN, __expand_number),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
def __expand_dollars(m: re.Match[str]) -> str:
    """Rewrite a matched dollar amount as "<n> dollars, <n> cents".

    Args:
        m: A match whose group 1 is the amount without the "$" sign
            (digits, optionally with "." and ",").

    Returns:
        The amount with unit words attached. The numbers stay as digits.
        Zero parts are omitted, and an amount of zero yields
        "zero dollars". An amount with more than one "." is returned
        unchanged with " dollars" appended.
    """
    amount = m.group(1)
    # Commas carry no value here and would make int() raise ValueError.
    parts = amount.replace(",", "").split(".")
    if len(parts) > 2:
        return amount + " dollars"  # Unexpected format

    dollars = int(parts[0]) if parts[0] else 0
    fraction = parts[1] if len(parts) > 1 else ""
    if len(fraction) == 1:
        # A single fractional digit is tenths of a dollar: "$1.5" is 50 cents.
        fraction += "0"
    cents = int(fraction) if fraction else 0

    dollar_unit = "dollar" if dollars == 1 else "dollars"
    cent_unit = "cent" if cents == 1 else "cents"
    if dollars and cents:
        return f"{dollars} {dollar_unit}, {cents} {cent_unit}"
    if dollars:
        return f"{dollars} {dollar_unit}"
    if cents:
        return f"{cents} {cent_unit}"
    return "zero dollars"
def __remove_commas(m: re.Match[str]) -> str:
    """Return group 1 of *m* with every comma removed."""
    matched_number = m.group(1)
    # Splitting on "," and re-joining drops the separators and nothing else.
    return "".join(matched_number.split(","))
def __expand_ordinal(m: re.Match[str]) -> str:
    """Spell out the whole matched text of *m* using the inflect engine.

    The full match (group 0) is passed to ``number_to_words`` unchanged.
    """
    ordinal_text = m.group(0)
    spoken = __INFLECT.number_to_words(ordinal_text)  # type: ignore
    return spoken
def __expand_number(m: re.Match[str]) -> str:
    """Spell out the integer matched by *m* as English words.

    Values strictly between 1000 and 3000 get special readings
    (presumably so that years read naturally):

    * 2000 becomes "two thousand";
    * 2001-2009 become "two thousand " plus the words for the last
      two digits;
    * exact multiples of 100 become "<words for hundreds> hundred";
    * anything else is read in two-digit groups with "oh" for zero.

    Every other value goes to ``number_to_words`` with ``andword=""``.
    """
    num = int(m.group(0))

    # Outside the special range: plain cardinal reading.
    if not (1000 < num < 3000):
        return __INFLECT.number_to_words(num, andword="")  # type: ignore

    if num == 2000:
        return "two thousand"
    if 2000 < num < 2010:
        tail = __INFLECT.number_to_words(num % 100)  # type: ignore
        return "two thousand " + tail
    if num % 100 == 0:
        hundreds = __INFLECT.number_to_words(num // 100)  # type: ignore
        return hundreds + " hundred"

    grouped = __INFLECT.number_to_words(
        num, andword="", zero="oh", group=2  # type: ignore
    )
    # The grouped reading comes back comma-separated; join with spaces instead.
    return grouped.replace(", ", " ")  # type: ignore
def __expand_decimal_point(m: re.Match[str]) -> str:
    """Rewrite a matched decimal as "<whole> point <d> <d> ...".

    The fractional digits are separated by spaces so that each one is
    spelled out individually by the later digit-expansion pass. Left as
    a single run, the fraction would be parsed as one integer and lose
    its leading zeros ("0.05" would read as "zero point five").

    Args:
        m: A match whose group 1 is the decimal number, e.g. "3.14".

    Returns:
        The rewritten text, e.g. "3 point 1 4". If group 1 has no ".",
        it is returned unchanged.
    """
    whole, point, fraction = m.group(1).partition(".")
    if not point:
        return whole
    return f"{whole} point {' '.join(fraction)}"