# Spam Classifer

### Classification of SPAM or HAM using standard classifers

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import string
import string

# import countvectorizer
from sklearn.feature_extraction.text import CountVectorizer

# import train_test_split
from sklearn.model_selection import train_test_split

# import a bunch of classifiers and compare their accuracy with the train and test data
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

# import accuracy score
from sklearn.metrics import accuracy_score

# import confusion matrix
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('spam.tsv', sep='\t', names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!


In [5]:
# check info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5567 entries, 0 to 5566
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5567 non-null   object
 1   message  5567 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
# add a new column to the dataset to count the length of the message
df['length'] = df['message'].apply(len)
df.head(10)

Unnamed: 0,label,message,length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!,36
5,ham,As per your request 'Melle Melle (Oru Minnamin...,160
6,spam,WINNER!! As a valued network customer you have...,157
7,spam,Had your mobile 11 months or more? U R entitle...,154
8,ham,I'm gonna be home soon and i don't want to tal...,109
9,spam,"SIX chances to win CASH! From 100 to 20,000 po...",136


In [7]:
# check the mean length of message of ham and spam messages
df.groupby('label').mean()

Unnamed: 0_level_0,length
label,Unnamed: 1_level_1
ham,71.442854
spam,138.659517


Seems like spams are geenrally longer than ham. So, we can use the length of the message as a feature.

In [8]:
features = ['spam', 'ham']

# check the description of the messages for each feature
for i in features:
    print(i)
    print(df[df['label'] == i]['length'].describe())
    print()

spam
count    746.000000
mean     138.659517
std       28.891361
min       13.000000
25%      133.000000
50%      149.000000
75%      157.000000
max      223.000000
Name: length, dtype: float64

ham
count    4821.000000
mean       71.442854
std        58.373866
min         2.000000
25%        33.000000
50%        52.000000
75%        93.000000
max       910.000000
Name: length, dtype: float64



## Data Preprocessing

We will first remove the punctuations from the messages and then tokenize them. We will then remove the stopwords and then stem the words. We will then create a bag of words model.

In [9]:
# write a function to remove punctuations from meassages
def remove_punctuation(text):
    no_punct = [char for char in text if char not in string.punctuation]
    no_punct = ''.join(no_punct)
    return no_punct

In [10]:
# apply the function to the message column
df['message'] = df['message'].apply(remove_punctuation)

In [11]:
# after removing punctuations, check the length of the message and also description of the message
df['length'] = df['message'].apply(len)

In [12]:
# check the description of the labels
for i in features:
    print(i)
    print(df[df['label'] == i]['length'].describe())
    print()


spam
count    746.000000
mean     132.950402
std       27.847503
min       12.000000
25%      127.000000
50%      143.000000
75%      151.000000
max      207.000000
Name: length, dtype: float64

ham
count    4821.000000
mean       67.506741
std        55.333532
min         1.000000
25%        31.000000
50%        50.000000
75%        88.000000
max       888.000000
Name: length, dtype: float64



We need to convert the messages into a vector format. We will use the CountVectorizer class from the sklearn library. We will pass stop_words='english' to remove the stopwords.

Setting stop_words='english' will exclude a predefined list of English language words that are considered to be stop words. Stop words are words that are commonly used in natural language but typically do not carry significant meaning or context. Examples of stop words in English include "the", "and", "a", "in", and "of". By excluding these words from the token count, the resulting matrix will be less sparse and more meaningful, as the focus will be on the words that carry more significance and context.

In [13]:
CV = CountVectorizer(stop_words='english')

In [14]:
# assign the contents of each 'message' to X and 'label' to y
X = df['message'].values
y = df['label'].values

In [15]:
# split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit the conutervectorizer transformer to the training data
X_train_CV = CV.fit_transform(X_train)

# fit the countvectorizer transformer to the testing data
X_test_CV = CV.transform(X_test)


In [16]:
# create a list of classifiers
classifiers = [
    MultinomialNB(),
    LogisticRegression(),
    RandomForestClassifier(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ExtraTreesClassifier(),
    SGDClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    MLPClassifier(),
]


# create a dataframe to store the accuracy of each classifier
df_acc = pd.DataFrame(columns=['classifier', 'accuracy'])

# create a dataframe to store the confusion matrix of each classifier
df_cm = pd.DataFrame(columns=['classifier', 'confusion_matrix'])

# create a function to train the model and store the accuracy, confusion matrix, classification report, f1 score, precision score, recall score, roc_auc score, jaccard score and log loss in the dataframe

def train_model(classifier):
    clf = classifier
    clf.fit(X_train_CV, y_train)
    y_pred = clf.predict(X_test_CV)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print('Accuracy of ' + str(classifier) + ' is: ' + str(acc))
    print('Confusion Matrix of ' + str(classifier) + ' is: ' + str(cm))

# train the model
for classifier in classifiers:
    train_model(classifier)


Accuracy of MultinomialNB() is: 0.9892280071813285
Confusion Matrix of MultinomialNB() is: [[963   6]
 [  6 139]]
Accuracy of LogisticRegression() is: 0.9829443447037702
Confusion Matrix of LogisticRegression() is: [[969   0]
 [ 19 126]]
Accuracy of RandomForestClassifier() is: 0.9820466786355476
Confusion Matrix of RandomForestClassifier() is: [[969   0]
 [ 20 125]]
Accuracy of SVC() is: 0.981149012567325
Confusion Matrix of SVC() is: [[968   1]
 [ 20 125]]
Accuracy of KNeighborsClassifier() is: 0.9156193895870736
Confusion Matrix of KNeighborsClassifier() is: [[969   0]
 [ 94  51]]
Accuracy of DecisionTreeClassifier() is: 0.9685816876122083
Confusion Matrix of DecisionTreeClassifier() is: [[956  13]
 [ 22 123]]
Accuracy of AdaBoostClassifier() is: 0.9631956912028725
Confusion Matrix of AdaBoostClassifier() is: [[959  10]
 [ 31 114]]
Accuracy of GradientBoostingClassifier() is: 0.9631956912028725
Confusion Matrix of GradientBoostingClassifier() is: [[967   2]
 [ 39 106]]
Accuracy of E

We choose the MultinomialNB since it is the one with highest accuracy.

In [17]:
# create an instance of the classifier
NB = MultinomialNB()

# fit the classifier to the training data
NB.fit(X_train_CV, y_train)

In [18]:
# test the accuracy with test data
y_pred = NB.predict(X_test_CV)

# check the accuracy
accuracy_score(y_test, y_pred)*100

98.92280071813285

In [72]:
# write a function that will accept user input and will predict if it is a spam or ham. Based on the prediction it will save the result and message in a csv file
def predict_message(message):
    message = remove_punctuation(message)
    tmessage = CV.transform([message])
    prediction = NB.predict(tmessage)
    print(prediction)
    if prediction == 'spam':
        df = pd.read_csv('results/results.tsv', sep='\t', names = ['label', 'message'])
        df = df.append({'label': 'spam', 'message': message}, ignore_index = True)
        df.to_csv('results/results.tsv', index=False)
    else:
        df = pd.read_csv('results/results.tsv', sep='\t', names = ['label', 'message'])
        df = df.append({'label': 'ham', 'message': message}, ignore_index=True)
        df.to_csv('results/results.tsv', index=False)
    return prediction

In [73]:
predict_message('Good Morning Madam! Have a Nice Day!!')

['ham']


  df = df.append({'label': 'ham', 'message': message}, ignore_index=True)


array(['ham'], dtype='<U4')

In [2]:
!pip install "modin[all]"

Defaulting to user installation because normal site-packages is not writeable
Collecting modin[all]
  Downloading modin-0.19.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m56.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Collecting fsspec
  Downloading fsspec-2023.3.0-py3-none-any.whl (145 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.4/145.4 kB[0m [31m93.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pandas==1.5.3
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m413.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting distributed>=2.22.0
  Downloading distributed-2023.3.2-py3-none-any.whl (956 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m956.9/956.9 kB[0m [31m106.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m