Sa-m committed on
Commit
c90d42e
1 Parent(s): 2439eed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -2
app.py CHANGED
@@ -74,9 +74,11 @@ stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This
74
 
75
  def clean_text(text):
76
  '''
77
- Function which returns clean text
78
  '''
79
  text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-asciicharacters
 
 
80
  text = re.sub(r"\n", " ", text)
81
  text = re.sub(r"\n\n", " ", text)
82
  text = re.sub(r"\t", " ", text)
@@ -84,7 +86,7 @@ def clean_text(text):
84
  text = text.strip(" ")
85
  text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single
86
 
87
- text = [word for word in text.split() if word not in STOPWORDS]
88
  text = ' '.join(text)
89
  return text
90
 
 
74
 
75
  def clean_text(text):
76
  '''
77
+ The function which returns clean text
78
  '''
79
  text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-asciicharacters
80
+ text=unidecode.unidecode(text)# diacritics remove
81
+ text=contractions.fix(text) # contraction fix
82
  text = re.sub(r"\n", " ", text)
83
  text = re.sub(r"\n\n", " ", text)
84
  text = re.sub(r"\t", " ", text)
 
86
  text = text.strip(" ")
87
  text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single
88
 
89
+ text = [word for word in text.split() if word not in stop_words]
90
  text = ' '.join(text)
91
  return text
92