|
|
|
import re |
|
import ftfy |
|
from legalnlp.mask_functions import * |
|
|
|
|
|
def clean_bert(text):
    """
    Repairs bad Unicode in a text and flattens it onto a single line

    Parameters
    -----------
    text: str
        A piece of text

    Returns
    -----------
    str
        The text returned by ``ftfy.fix_text``, with every newline replaced
        by a space and each run of spaces collapsed into a single space
    """
    fixed = ftfy.fix_text(text)
    # Splitting on "\n" and re-joining with a space swaps each newline for a space.
    single_line = " ".join(fixed.split("\n"))
    return re.sub(' +', ' ', single_line)
|
|
|
|
|
def clean(text, lower=True, return_masked=False):
    """
    Cleans a text by masking general patterns (url, email, OAB, date,
    process, value, number), stripping acronym periods, spacing out
    punctuation and normalizing a few Portuguese-specific constructs

    Parameters
    -----------
    text: str
        A piece of text
    lower: bool
        Whether to lowercase text (Default: True)
    return_masked: bool
        If return_masked == False, the function outputs a clean text.
        Otherwise, it returns a dictionary containing the clean text and the
        information extracted by RegEx (Default: False)

    Returns
    -----------
    dict or str
        When ``return_masked`` is False, the cleaned text. Otherwise a
        dictionary with the cleaned text under ``'txt'`` and, under ``'url'``,
        ``'email'``, ``'oab'``, ``'data'``, ``'processo'``, ``'valor'`` and
        ``'numero'``, the second value returned by the matching ``mask_*``
        helper
    """
    # General masking. NOTE(review): the helpers presumably leave "[url]" /
    # "[email]" placeholders in the text (they are repaired further below) --
    # confirm against mask_functions.
    txt, urls = mask_url(text)
    txt, emails = mask_email(txt)

    # Drop the period that follows an uppercase letter (e.g. "S.A." -> "SA").
    txt = re.sub(r"([A-Z])\.", r"\1", txt)
    if lower:
        txt = txt.lower()
    # "s/a" and "s.a" (any case) become the standalone token " sa ".
    txt = re.sub(r"s[\/\.]a", " sa ", txt, flags=re.I)

    # Double every spaced dash so that, after the punctuation padding below,
    # it is not mistaken for a word-internal hyphen and glued back.
    txt = txt.replace(" - - ", " - ")
    txt = txt.replace(" - ", " - - ")

    # Surround every non-word character with spaces, then flatten newlines/tabs.
    txt = re.sub(r"(\W)", r" \1 ", txt)
    txt = txt.replace("\n", " ")
    txt = txt.replace("\t", " ")

    # Restore the parenthesized gender/plural suffixes split by the padding.
    for suffix in ("s", "a", "as", "o", "os"):
        txt = txt.replace("( " + suffix + " )", "(" + suffix + ")")

    # Re-join numbers split by the padding: "-" and "." between digits are
    # removed, "," between digits is kept without the surrounding spaces.
    txt = re.sub(r"(?<=\d) [-\.] (?=\d)", '', txt)
    txt = re.sub(r"(?<=\d) , (?=\d)", ',', txt)

    # Restore the mask placeholders split by the padding.
    txt = txt.replace("[ email ]", "[email]")
    txt = txt.replace("[ url ]", "[url]")

    # Re-join hyphenated words, then collapse runs of spaces.
    txt = re.sub(r"(\w) - (\w)", r"\1-\2", txt)
    txt = re.sub(' +', ' ', txt)

    # Domain-specific masking on the normalized text.
    txt, oabs = mask_oab(txt)
    txt, datas = mask_data(txt)
    txt, processos = mask_processo(txt)

    txt, valores = mask_valor(txt)
    txt, numeros = mask_numero(txt)

    txt = re.sub(' +', ' ', txt)
    txt = txt.strip()

    if not return_masked:
        return txt

    return {
        'txt': txt,
        'url': urls,
        'email': emails,
        'oab': oabs,
        'data': datas,
        'processo': processos,
        'valor': valores,
        'numero': numeros,
    }