Commit · 3d1c35c
1 Parent(s): 4f3e60d
add stopword
Browse files
- app.py +1 -1
- preprocessor.py +22 -0
- requirements.txt +2 -1
app.py
CHANGED
@@ -3,7 +3,7 @@ import preprocessor
|
|
3 |
import helper
|
4 |
import matplotlib.pyplot as plt
|
5 |
import seaborn as sns
|
6 |
-
import pandas as pd
|
7 |
|
8 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
9 |
|
|
|
3 |
import helper
|
4 |
import matplotlib.pyplot as plt
|
5 |
import seaborn as sns
|
6 |
+
import pandas as pd
|
7 |
|
8 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
9 |
|
preprocessor.py
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
import re
|
2 |
import pandas as pd
|
|
|
|
|
|
|
|
|
3 |
|
4 |
def preprocess(data):
|
5 |
pattern = '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'
|
@@ -28,6 +32,24 @@ def preprocess(data):
|
|
28 |
df['message'] = messages
|
29 |
df.drop(columns=['user_message'], inplace=True)
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
df['only_date'] = df['date'].dt.date
|
32 |
df['year'] = df['date'].dt.year
|
33 |
df['month_num'] = df['date'].dt.month
|
|
|
1 |
import re
|
2 |
import pandas as pd
|
3 |
+
from nltk.corpus import stopwords
|
4 |
+
import nltk
|
5 |
+
|
6 |
+
nltk.download('stopwords')
|
7 |
|
8 |
def preprocess(data):
|
9 |
pattern = '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'
|
|
|
32 |
df['message'] = messages
|
33 |
df.drop(columns=['user_message'], inplace=True)
|
34 |
|
35 |
+
# Additional preprocessing
|
36 |
+
stop_words_en = set(stopwords.words('english'))
|
37 |
+
stop_words_fr = set(stopwords.words('french'))
|
38 |
+
combined_stop_words = stop_words_en.union(stop_words_fr)
|
39 |
+
|
40 |
+
def clean_message(message):
|
41 |
+
# Remove messages containing '<Media ...>'
|
42 |
+
if '<Media' in message:
|
43 |
+
return ''
|
44 |
+
# Remove words with less than 3 characters
|
45 |
+
words = [word for word in message.split() if len(word) >= 3]
|
46 |
+
# Remove stopwords
|
47 |
+
words = [word for word in words if word.lower() not in combined_stop_words]
|
48 |
+
return ' '.join(words)
|
49 |
+
|
50 |
+
df['message'] = df['message'].apply(clean_message)
|
51 |
+
df = df[df['message'] != ''] # Remove empty messages
|
52 |
+
|
53 |
df['only_date'] = df['date'].dt.date
|
54 |
df['year'] = df['date'].dt.year
|
55 |
df['month_num'] = df['date'].dt.month
|
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ seaborn
|
|
4 |
urlextract
|
5 |
wordcloud
|
6 |
pandas
|
7 |
-
emoji
|
|
|
|
4 |
urlextract
|
5 |
wordcloud
|
6 |
pandas
|
7 |
+
emoji
|
8 |
+
nltk
|