efeperro committed on
Commit
5edc75a
1 Parent(s): a55b9b1

Update functions_preprocess.py

Browse files
Files changed (1) hide show
  1. functions_preprocess.py +7 -0
functions_preprocess.py CHANGED
@@ -147,3 +147,10 @@ def build_vocab(data_iter):
147
  vocab.set_default_index(vocab["<unk>"])
148
  return vocab, tokenizer
149
 
 
 
 
 
 
 
 
 
147
  vocab.set_default_index(vocab["<unk>"])
148
  return vocab, tokenizer
149
 
150
def clean_text(text):
    """Return a lowercased, stripped-down, lemmatized version of *text*.

    Steps, in order:
      1. lowercase the whole string;
      2. delete every run of digits;
      3. delete every character that is neither a word character nor
         whitespace (note that ``\\w`` keeps underscores);
      4. split on whitespace, discard tokens present in ``stop_words`` and
         pass each remaining token through ``lemmatizer.lemmatize``;
      5. join the surviving tokens with single spaces.

    ``re``, ``lemmatizer`` and ``stop_words`` are looked up in the enclosing
    module; they are not defined in this function.
    NOTE(review): stop-word filtering runs after punctuation removal, so the
    entries of ``stop_words`` are compared against punctuation-free tokens --
    confirm that matches how ``stop_words`` is built.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punct = re.sub(r'[^\w\s]', '', without_digits)

    kept_tokens = []
    for token in without_punct.split():
        # Stop words are dropped before lemmatization, exactly as the
        # membership test is applied to the raw token.
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)
156
+