In [4]:
# Importing Libraries : 
import pandas as pd
import pickle

In [5]:
# Load the labelled messages from the CSV file and preview the first rows.
SPAM_CSV_PATH = "data/spam.csv"
dataset = pd.read_csv(SPAM_CSV_PATH)
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Dimensions of the loaded frame as (rows, columns).
dataset.shape

(5572, 2)

In [7]:
# Column labels available in the loaded frame.
dataset.columns

Index(['Category', 'Message'], dtype='object')

In [8]:
# Check whether there are NULL values in the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
dataset.info()

# Per-column count of missing values (shown as the cell's result).
dataset.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


Category    0
Message     0
dtype: int64

In [9]:
# Count the distinct values in each column. Category is expected to hold only
# two labels (spam / ham); note this reports the count, not the labels themselves.
dataset.nunique()

Category       2
Message     5157
dtype: int64

In [10]:
# Encode the Category labels as integers: 1 for "spam", 0 for anything else.
# The comparison is vectorized over the whole column instead of looping in
# Python; casting the boolean mask to int64 yields the same 0/1 values.
dataset["Spam"] = (dataset["Category"] == "spam").astype("int64")
dataset.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
# Features are the raw message texts; the target is the 0/1 Spam column.
X, y = dataset["Message"], dataset["Spam"]

### Train-Test Split :

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [13]:
# Shapes of the four split outputs, in order: train features, test features,
# train labels, test labels.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4457,), (1115,), (4457,), (1115,))

In [14]:
# Importing CountVectorizer, which converts the text into a matrix of token counts :
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Importing Different classifiers to compare :
# (the two alternatives below are left commented out; only MultinomialNB is imported)
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB       # ✔️✔️ Works well with this type of problem, i.e. when the data is discrete.

In [16]:
# Creating a pipeline :

from sklearn.pipeline import Pipeline

# Two stages run in order: token-count vectorization of the raw text, then a
# multinomial Naive Bayes classifier on the resulting counts.
pipeline_steps = [
    ("vectorizer", CountVectorizer()),
    ("nb", MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [17]:
# Fitting Data :
# Train every pipeline stage on the training messages and their labels.

clf.fit(X_train, y_train)

In [18]:
# Accuracy check :
# Score the fitted pipeline on the held-out test messages and labels.
clf.score(X_test, y_test)

0.97847533632287

### *TESTING :*

In [19]:
# Hand-written messages used as a quick sanity check of the fitted pipeline.
msg = [
    "Thanks for your subscription to Ringtone - 'Shila ki jawaani', your mobile will be charged RS.5/month Please confirm by replying YES or NO. If you reply NO you will not be charged",
    "Oops, I'll let you know when my roommate's done",
    "hello, i am akshat, are you free today?",
    "free free free, get free coins, just download this xyz app (100 RS. Instant Cash)",
    "subscribe to get unlimited benefits",
    " i want some money, can you plz send me? ",
]

# True Values : 1 0 0 1 1 0
# i.e. - Spam, Ham, Ham, Spam, Spam, Ham

# Predict a 0/1 label per message and print one verdict line for each, in order.
# The thumbs-up emoji in the "ham" verdict was stored mis-encoded (mojibake);
# it is restored to the intended character here.
y_pred = clf.predict(msg)
for label in y_pred:
    print("Good to go 👍" if label == 0 else "Spam!")

Spam!
Good to go üëç
Good to go üëç
Spam!
Spam!
Good to go üëç


#### *Saving the model using `Pickle` :*

In [22]:
# with open("models/spam-clf.pkl", "wb") as f:
#     pickle.dump(clf, f)