Soufianesejjari committed on
Commit
3d1c35c
·
1 Parent(s): 4f3e60d

add stopword

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. preprocessor.py +22 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -3,7 +3,7 @@ import preprocessor
3
  import helper
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
- import pandas as pd
7
 
8
  st.set_option('deprecation.showPyplotGlobalUse', False)
9
 
 
3
  import helper
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
+ import pandas as pd
7
 
8
  st.set_option('deprecation.showPyplotGlobalUse', False)
9
 
preprocessor.py CHANGED
@@ -1,5 +1,9 @@
1
  import re
2
  import pandas as pd
 
 
 
 
3
 
4
  def preprocess(data):
5
  pattern = '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'
@@ -28,6 +32,24 @@ def preprocess(data):
28
  df['message'] = messages
29
  df.drop(columns=['user_message'], inplace=True)
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  df['only_date'] = df['date'].dt.date
32
  df['year'] = df['date'].dt.year
33
  df['month_num'] = df['date'].dt.month
 
1
  import re
2
  import pandas as pd
3
+ from nltk.corpus import stopwords
4
+ import nltk
5
+
6
+ nltk.download('stopwords')
7
 
8
  def preprocess(data):
9
  pattern = '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'
 
32
  df['message'] = messages
33
  df.drop(columns=['user_message'], inplace=True)
34
 
35
+ # Additional preprocessing
36
+ stop_words_en = set(stopwords.words('english'))
37
+ stop_words_fr = set(stopwords.words('french'))
38
+ combined_stop_words = stop_words_en.union(stop_words_fr)
39
+
40
+ def clean_message(message):
41
+ # Remove messages containing '<Media ...>'
42
+ if '<Media' in message:
43
+ return ''
44
+ # Remove words with less than 3 characters
45
+ words = [word for word in message.split() if len(word) >= 3]
46
+ # Remove stopwords
47
+ words = [word for word in words if word.lower() not in combined_stop_words]
48
+ return ' '.join(words)
49
+
50
+ df['message'] = df['message'].apply(clean_message)
51
+ df = df[df['message'] != ''] # Remove empty messages
52
+
53
  df['only_date'] = df['date'].dt.date
54
  df['year'] = df['date'].dt.year
55
  df['month_num'] = df['date'].dt.month
requirements.txt CHANGED
@@ -4,4 +4,5 @@ seaborn
4
  urlextract
5
  wordcloud
6
  pandas
7
- emoji
 
 
4
  urlextract
5
  wordcloud
6
  pandas
7
+ emoji
8
+ nltk