File size: 1,163 Bytes
3be88b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import re
import nltk
from nltk.corpus import stopwords
def eliminar_acento(s):
    """Return *s* with acute-accented vowels replaced by their plain vowels.

    Only á, é, í, ó, ú and their uppercase forms are mapped; every other
    character (for example ñ or ü) is returned unchanged.
    """
    minusculas = {"á": "a", "é": "e", "í": "i", "ó": "o", "ú": "u"}
    # Derive the uppercase mappings from the lowercase ones.
    mayusculas = {
        acentuada.upper(): simple.upper()
        for acentuada, simple in minusculas.items()
    }
    tabla = str.maketrans({**minusculas, **mayusculas})
    return s.translate(tabla)
# Lazily populated cache so the stopword corpus is loaded once per process
# instead of on every call.
_SPANISH_STOPWORDS = None


def _spanish_stopwords():
    """Return the NLTK Spanish stopword list as a frozenset, loading it once."""
    global _SPANISH_STOPWORDS
    if _SPANISH_STOPWORDS is None:
        try:
            palabras = stopwords.words('spanish')
        except LookupError:
            # Corpus is not available locally: fetch it, then retry.
            nltk.download('stopwords')
            palabras = stopwords.words('spanish')
        _SPANISH_STOPWORDS = frozenset(palabras)
    return _SPANISH_STOPWORDS


def eliminar_patrones_stopwords(text):
    """Remove Spanish stopwords from *text*.

    The text is stripped, split on whitespace, filtered against the NLTK
    Spanish stopword list, and re-joined with single spaces. Matching is
    exact and case-sensitive: a token is dropped only if it appears verbatim
    in the stopword list.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by a single space (empty string if none).
    """
    lstopwords = _spanish_stopwords()
    return ' '.join(
        word for word in text.strip().split() if word not in lstopwords
    )
def eliminar_espacios_blancos(texto):
    """Strip punctuation-like characters from *texto*.

    Three regex substitutions are applied in order:
      1. remove every ':' and '_';
      2. collapse the gendered suffix 'o/a' to 'o';
      3. remove any remaining character that is neither a word character
         nor whitespace.

    Despite the name, whitespace itself is left untouched.
    """
    sustituciones = (
        (r"\:|\_", ''),
        (r"o\/a", 'o'),
        (r'[^\w\s]', ''),
    )
    # Order matters: step 1 can bring "o" and "/a" together before step 2,
    # and step 3 would delete the "/" that step 2 needs.
    for patron, reemplazo in sustituciones:
        texto = re.sub(patron, reemplazo, texto)
    return texto
def clean_text(original):
    """Normalize *original* into lowercase, punctuation-free, stopword-free text.

    Steps, in order:
      1. dots embedded in a run of word characters become spaces
         ("a.b" -> "a b"); all remaining dots are deleted;
      2. acute accents on vowels are removed (``eliminar_acento``);
      3. ':', '_', the 'o/a' suffix and other non-word, non-space
         characters are removed (``eliminar_espacios_blancos``);
      4. runs of spaces are collapsed and the text is lowercased;
      5. Spanish stopwords are dropped (``eliminar_patrones_stopwords``).

    Args:
        original: Raw input string.

    Returns:
        The cleaned string.
    """
    # Turn dots that glue words/numbers together into spaces so the pieces
    # stay separate tokens, then drop every dot that is left.
    sin_puntos = re.sub(
        r'\w+(?:\.+\w+)*',
        lambda m: m.group(0).replace('.', ' '),
        original,
    )
    sin_puntos = re.sub(r'\.', '', sin_puntos)

    texto = eliminar_acento(sin_puntos)
    texto = eliminar_espacios_blancos(texto)
    texto = re.sub(r" +", ' ', texto)
    texto = texto.lower()

    # NOTE(review): accents are stripped before stopword filtering. If the
    # stopword list contains accented entries (e.g. "también"), their
    # unaccented forms will not match and will survive -- confirm whether
    # that is intended before changing the order.
    return eliminar_patrones_stopwords(texto)