import re
import unicodedata
from typing import Iterable


# Non-ASCII letters that NFKD normalization does not decompose,
# mapped to ASCII transliterations by hand.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}


PORTUGUESE_ACCENTED_CHARACTERS = [
    "ç",
    "á",
    "é",
    "í",
    "ó",
    "ú",
    "â",
    "ê",
    "ô",
    "ã",
    "õ",
    "à",
    "ò",
    "è",
    "ì",
    "ù",
]

PORTUGUESE_DIACRITICS = ['̧', '̂', '̀', '̃', '́']
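# Note: PORTUGUESE_DIACRITICS holds the combining marks (cedilla, circumflex, grave,
# tilde, acute) that the accented characters above decompose into under NFKD, so
# passing it as `keep` to remove_symbols_and_diacritics() preserves Portuguese accents.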


def remove_symbols_and_diacritics(s: str, keep: Iterable[str] = "") -> str:
    """
    Replace markers, symbols, and punctuation with a space, and drop diacritics
    (Unicode category 'Mn' plus some manual mappings), keeping any characters
    listed in `keep`.
    """
    return "".join(
        c
        if c in keep
        else ADDITIONAL_DIACRITICS[c]
        if c in ADDITIONAL_DIACRITICS
        else ""
        if unicodedata.category(c) == "Mn"
        else " "
        if unicodedata.category(c)[0] in "MSP"
        else c
        for c in unicodedata.normalize("NFKD", s)
    )
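
# Illustrative example (the sample string is made up for demonstration): with the
# Portuguese combining marks kept,
#   remove_symbols_and_diacritics("João, tudo bem?", keep=PORTUGUESE_DIACRITICS)
# preserves the tilde of "João" (in decomposed NFKD form) and turns "," and "?" into
# spaces; without `keep`, the tilde would be dropped as well.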


class PortugueseTextNormalizer:
    def __init__(self):
        # hesitation/filler tokens to drop entirely
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh)\b"
        # common Portuguese abbreviations expanded to their full forms
        self.replacers = {
            r"\bsr\b": "senhor ",
            r"\bsra\b": "senhora ",
            r"\bsto\b": "santo ",
            r"\bsta\b": "santa ",
            r"\bdr\b": "doutor ",
            r"\bdra\b": "doutora ",
            r"\bprof\b": "professor ",
            r"\bcap\b": "capitão ",
        }

    def __call__(self, s: str):
        s = s.lower()

        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parentheses
        s = re.sub(self.ignore_patterns, "", s)  # remove filler words

        # expand abbreviations
        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        # remove commas and periods between digits (e.g. thousands separators)
        s = re.sub(r"(\d),(\d)", r"\1\2", s)
        s = re.sub(r"(\d)\.(\d)", r"\1\2", s)

        # strip symbols and diacritics, keeping Portuguese combining marks
        s = remove_symbols_and_diacritics(s, keep=PORTUGUESE_DIACRITICS)

        # collapse runs of whitespace into a single space
        s = re.sub(r"\s+", " ", s)

        return s.lower()
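

if __name__ == "__main__":
    # Minimal usage sketch; the sample sentence below is made up for illustration
    # and is not part of any dataset or test suite.
    normalizer = PortugueseTextNormalizer()
    print(normalizer("Olá, Sr. Silva! [ruído] Está tudo bem? (risos) São 1.500,75 reais."))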