|
import re |
|
import nltk |
|
from nltk.corpus import stopwords |
|
|
|
def eliminar_acento(s):
    """Return *s* with acute-accented vowels replaced by plain vowels.

    Only the five vowels ``á é í ó ú`` and their uppercase forms are
    mapped; every other character (including ``ñ`` and ``ü``) is left
    untouched.

    Args:
        s: The string to process.

    Returns:
        A new string with the accented vowels substituted.
    """
    vowel_pairs = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
    )
    # Build one translation table covering both lowercase and uppercase
    # variants, then apply it in a single pass over the string.
    table = {}
    for accented, plain in vowel_pairs:
        table[ord(accented)] = plain
        table[ord(accented.upper())] = plain.upper()
    return s.translate(table)
|
|
|
def eliminar_patrones_stopwords(text):
    """Remove Spanish stop words from a whitespace-separated text.

    Tokens are obtained with ``str.split()`` and compared verbatim
    (case-sensitively) against NLTK's ``'spanish'`` stop-word list.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    try:
        spanish_words = stopwords.words('spanish')
    except LookupError:
        # Only invoke the downloader when the corpus is actually missing,
        # rather than on every call.
        nltk.download('stopwords')
        spanish_words = stopwords.words('spanish')
    lstopwords = set(spanish_words)

    return ' '.join(word for word in text.split() if word not in lstopwords)
|
|
|
def eliminar_espacios_blancos(texto):
    """Strip punctuation-like characters from *texto*.

    Applies three substitutions in order:

    1. Colons and underscores are deleted.
    2. The sequence ``o/a`` is reduced to ``o``.
    3. Every character that is neither a word character nor whitespace
       is deleted.

    Despite the function's name, whitespace itself is left untouched.

    Args:
        texto: The string to process.

    Returns:
        The string after all three substitutions.
    """
    # Order matters: "o/a" must be collapsed before the final pass removes
    # the slash, and "_" counts as a word character so it needs its own rule.
    substitutions = (
        (r"\:|\_", ''),
        (r"o\/a", 'o'),
        (r'[^\w\s]', ''),
    )
    for pattern, replacement in substitutions:
        texto = re.sub(pattern, replacement, texto)
    return texto
|
|
|
def clean_text(original):
    """Normalize raw text for downstream processing.

    The pipeline, in order:

    1. Dots inside runs of word characters (``a.b``) become spaces; all
       remaining dots are removed.
    2. Accented vowels are replaced via ``eliminar_acento``.
    3. Punctuation is removed via ``eliminar_espacios_blancos``.
    4. Runs of spaces are collapsed and the text is lowercased.
    5. Spanish stop words are removed via ``eliminar_patrones_stopwords``.
    6. Runs of spaces are collapsed once more.

    NOTE(review): accents are stripped before stop-word filtering, so any
    accented entries in the stop-word list can no longer match; confirm
    this ordering is intended.

    Args:
        original: The raw input string.

    Returns:
        The cleaned, lowercased text.
    """
    # Split dotted tokens ("a.b.c" -> "a b c") before dropping stray dots,
    # so the parts are not glued together.
    texto = re.sub(
        r'\w+(?:\.+\w+)*',
        lambda match: match.group(0).replace('.', ' '),
        original,
    )
    texto = re.sub(r'\.', '', texto)

    texto = eliminar_acento(texto)
    texto = eliminar_espacios_blancos(texto)
    texto = re.sub(r" +", ' ', texto)

    texto = texto.lower()
    texto = eliminar_patrones_stopwords(texto)
    # Previously this result was stored in the unused ``original`` variable;
    # apply it to the value that is actually returned.
    texto = re.sub(r" +", ' ', texto)
    return texto