porttagger / preprocessing.py
felmateos's picture
Fixed contraction on preprocessing.py
0db0140
raw
history blame
No virus
3.83 kB
import re
# Maps a regular-expression pattern for a Portuguese contraction to the
# replacement template holding its expanded form.
#
# Pattern anatomy:
#   (?<![\w.])  the match must not be preceded by a word character or a period
#   (?![$\w])   the match must not be followed by a word character or a "$"
#   (s)?        optional plural suffix, re-inserted into the template through
#               the \g<1> group reference
#
# Entries are applied one after another in insertion order, so the order of
# the keys is part of the behavior.
contractions = {
    r"(?<![\w.])no(s)?(?![$\w])": r"em o\g<1>",
    r"(?<![\w.])na(s)?(?![$\w])": r"em a\g<1>",
    r"(?<![\w.])da(s)?(?![$\w])": r"de a\g<1>",
    r"(?<![\w.])do(s)?(?![$\w])": r"de o\g<1>",
    r"(?<![\w.])ao(s)?(?![$\w])": r"a o\g<1>",
    r"(?<![\w.])à(s)?(?![$\w])": r"a a\g<1>",
    r"(?<![\w.])pela(s)?(?![$\w])": r"por a\g<1>",
    r"(?<![\w.])pelo(s)?(?![$\w])": r"por o\g<1>",
    r"(?<![\w.])nesta(s)?(?![$\w])": r"em esta\g<1>",
    r"(?<![\w.])neste(s)?(?![$\w])": r"em este\g<1>",
    r"(?<![\w.])nessa(s)?(?![$\w])": r"em essa\g<1>",
    r"(?<![\w.])nesse(s)?(?![$\w])": r"em esse\g<1>",
    r"(?<![\w.])num(?![$\w])": r"em um",
    r"(?<![\w.])nuns(?![$\w])": r"em uns",
    r"(?<![\w.])numa(s)?(?![$\w])": r"em uma\g<1>",
    r"(?<![\w.])nisso(?![$\w])": r"em isso",
    r"(?<![\w.])naquele(s)?(?![$\w])": r"em aquele\g<1>",
    r"(?<![\w.])naquela(s)?(?![$\w])": r"em aquela\g<1>",
    r"(?<![\w.])naquilo(?![$\w])": r"em aquilo",
    r"(?<![\w.])duma(s)?(?![$\w])": r"de uma\g<1>",
    r"(?<![\w.])daqui(?![$\w])": r"de aqui",
    r"(?<![\w.])dali(?![$\w])": r"de ali",
    r"(?<![\w.])daquele(s)?(?![$\w])": r"de aquele\g<1>",
    r"(?<![\w.])daquela(s)?(?![$\w])": r"de aquela\g<1>",
    r"(?<![\w.])deste(s)?(?![$\w])": r"de este\g<1>",
    r"(?<![\w.])desta(s)?(?![$\w])": r"de esta\g<1>",
    r"(?<![\w.])desse(s)?(?![$\w])": r"de esse\g<1>",
    r"(?<![\w.])dessa(s)?(?![$\w])": r"de essa\g<1>",
    r"(?<![\w.])daí(?![$\w])": r"de aí",
    r"(?<![\w.])dum(?![$\w])": r"de um",
    r"(?<![\w.])donde(?![$\w])": r"de onde",
    r"(?<![\w.])disto(?![$\w])": r"de isto",
    r"(?<![\w.])disso(?![$\w])": r"de isso",
    r"(?<![\w.])daquilo(?![$\w])": r"de aquilo",
    r"(?<![\w.])dela(s)?(?![$\w])": r"de ela\g<1>",
    r"(?<![\w.])dele(s)?(?![$\w])": r"de ele\g<1>",
    r"(?<![\w.])nisto(?![$\w])": r"em isto",
    r"(?<![\w.])nele(s)?(?![$\w])": r"em ele\g<1>",
    r"(?<![\w.])nela(s)?(?![$\w])": r"em ela\g<1>",
    # The optional apostrophe also covers the spellings "d'ele" / "d'ela".
    r"(?<![\w.])d'?ele(s)?(?![$\w])": r"de ele\g<1>",
    r"(?<![\w.])d'?ela(s)?(?![$\w])": r"de ela\g<1>",
    r"(?<![\w.])noutro(s)?(?![$\w])": r"em outro\g<1>",
    r"(?<![\w.])aonde(?![$\w])": r"a onde",
    r"(?<![\w.])àquela(s)?(?![$\w])": r"a aquela\g<1>",
    r"(?<![\w.])àquele(s)?(?![$\w])": r"a aquele\g<1>",
    r"(?<![\w.])àquilo(?![$\w])": r"a aquilo",
    r"(?<![\w.])contigo(?![$\w])": r"com ti",
    r"(?<![\w.])né(?![$\w])": r"não é",
    r"(?<![\w.])comigo(?![$\w])": r"com mim",
    r"(?<![\w.])conosco(?![$\w])": r"com nós",
    r"(?<![\w.])consigo(?![$\w])": r"com si",
    r"(?<![\w.])pra(?![$\w])": r"para a",
    r"(?<![\w.])pro(?![$\w])": r"para o",
}
def replace_keep_case(word, replacement, text):
    """
    Replace every match of a pattern, mirroring the letter case of the match.

    The pattern is matched case-insensitively. The replacement is then
    lower-cased, capitalized or upper-cased to follow the casing of the
    text it replaces.

    Parameters
    ----------
    word: str
        Regular-expression pattern to be replaced.
    replacement: str
        Replacement template; group references are expanded against each
        match before the case is adjusted.
    text: str
        Text to be processed.

    Returns
    -------
    str:
        Processed string.
    """
    def func(match):
        g = match.group()
        repl = match.expand(replacement)
        if g.islower():
            return repl.lower()
        # istitle() is checked before isupper() so that a single uppercase
        # letter (e.g. "À") is capitalized instead of fully upper-cased.
        if g.istitle():
            return repl.capitalize()
        if g.isupper():
            return repl.upper()
        # Matches such as "D'ele" are not title-case for str.istitle()
        # (a lowercase letter follows the apostrophe), yet they still start
        # with a capital that must survive the replacement.
        if g[:1].isupper():
            return repl.capitalize()
        return repl

    return re.sub(word, func, text, flags=re.I)
def expand_contractions(text: str) -> str:
    """
    Expand the contractions in a text into their base forms.

    Each pattern registered in ``contractions`` is applied in turn, in the
    dictionary's iteration order, to the result of the previous replacement.

    Parameters
    ----------
    text: str
        Text that may contain contractions.

    Returns
    -------
    str:
        Text with expanded contractions.
    """
    expanded = text
    for pattern, expansion in contractions.items():
        expanded = replace_keep_case(pattern, expansion, expanded)
    return expanded