demo_DiagTrast / utils.py
Stremie's picture
First try testing model
3be88b5
raw
history blame
1.16 kB
import re
import nltk
from nltk.corpus import stopwords
# Translation table mapping each acute-accented Spanish vowel (lower and
# upper case) to its unaccented counterpart.
_TABLA_SIN_ACENTOS = str.maketrans("áéíóúÁÉÍÓÚ", "aeiouAEIOU")


def eliminar_acento(s):
    """Return *s* with acute-accented vowels replaced by plain vowels.

    Only á, é, í, ó, ú and their upper-case forms are mapped; every other
    character (for example ñ or ü) is returned unchanged.
    """
    return s.translate(_TABLA_SIN_ACENTOS)
# Cached Spanish stop-word set; filled on first use by _stopwords_es().
_STOPWORDS_ES = None


def _stopwords_es():
    """Return the Spanish NLTK stop words as a frozenset, loading them once."""
    global _STOPWORDS_ES
    if _STOPWORDS_ES is None:
        try:
            palabras = stopwords.words('spanish')
        except LookupError:
            # The corpus is not installed locally: download it once and retry,
            # instead of hitting the download machinery on every call.
            nltk.download('stopwords')
            palabras = stopwords.words('spanish')
        _STOPWORDS_ES = frozenset(palabras)
    return _STOPWORDS_ES


def eliminar_patrones_stopwords(text):
    """Remove Spanish stop words from *text*.

    The text is split on whitespace, every token that appears in NLTK's
    Spanish stop-word list is dropped, and the remaining tokens are joined
    back with single spaces. The comparison is exact (case-sensitive), so
    callers are expected to normalise the text beforehand.

    Args:
        text: The string to filter.

    Returns:
        The filtered string, with tokens separated by one space and no
        leading or trailing whitespace.
    """
    lstopwords = _stopwords_es()
    return ' '.join(word for word in text.split() if word not in lstopwords)
def eliminar_espacios_blancos(texto):
    """Strip punctuation-like characters from *texto*.

    Despite its name, this function leaves whitespace untouched. It applies
    three substitutions in order:

    1. remove every ':' and '_';
    2. collapse the gendered ending "o/a" to "o";
    3. remove every remaining character that is neither a word character
       nor whitespace.
    """
    sustituciones = (
        (r"\:|\_", ''),
        (r"o\/a", 'o'),
        (r'[^\w\s]', ''),
    )
    for patron, reemplazo in sustituciones:
        texto = re.sub(patron, reemplazo, texto)
    return texto
def clean_text(original):
    """Normalise raw text through the module's cleaning pipeline.

    Steps, in order:

    1. dots that sit between word characters become spaces, and any other
       dot is removed;
    2. acute accents are stripped (``eliminar_acento``);
    3. punctuation is removed (``eliminar_espacios_blancos``);
    4. runs of spaces are collapsed to a single space;
    5. the text is lower-cased;
    6. Spanish stop words are dropped (``eliminar_patrones_stopwords``).

    Args:
        original: The raw input string.

    Returns:
        The cleaned string.
    """
    # Replace dots inside a word run (e.g. "a.b") with spaces so the parts
    # stay separate tokens; the next line drops every remaining dot.
    texto = re.sub(r'\w+(?:\.+\w+)*', lambda x: x.group(0).replace('.', ' '), original)
    texto = re.sub(r'\.', '', texto)
    texto = eliminar_acento(texto)
    texto = eliminar_espacios_blancos(texto)
    texto = re.sub(r" +", ' ', texto)
    texto = texto.lower()
    # NOTE(review): stop words are matched after accents were stripped, so
    # accented entries in the stop-word list presumably no longer match —
    # confirm whether that is intended.
    return eliminar_patrones_stopwords(texto)