Python CA 4
Name : Subham Patra
REG NO : 12215814
# SMS SPAM DETECTION
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('spam.csv',encoding='latin1')
df.sample(5)
df.shape
## Data Cleaning
df.info()
# drop last 3 columns
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)
df
# rename columns
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.head()
# encode target as binary (ham/spam -> 0/1)
from sklearn.preprocessing import LabelEncoder as LE
encoder=LE()
df['target']=encoder.fit_transform(df['target'])
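# quick check of the encoding: LabelEncoder sorts classes alphabetically,
# so 'ham' maps to 0 and 'spam' maps to 1
print(encoder.classes_)   # ['ham' 'spam']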
df.head()
# null values
df.isnull().sum()
#check duplicates
df.duplicated().sum()
#drop duplicates
df=df.drop_duplicates(keep='first')
df.duplicated().sum()
df.shape
# EDA
df['target'].value_counts()
import matplotlib.pyplot as plt
plt.pie(df['target'].value_counts(),labels=['ham','spam'],autopct="%0.2f")
plt.show()
# add new feature columns (number of characters, words and sentences) for deeper analysis
!pip install nltk
import nltk
nltk.download('punkt')
df['num_chars']=df['text'].apply(len)
df.sample(3)
# number of words and sentences per message
df['num_words']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))
df['num_sentences']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))
df.head()
df[['num_chars','num_words','num_sentences']].describe()
# ham messages
df[df['target']==0][['num_chars','num_words','num_sentences']].describe()
# spam messages
df[df['target']==1][['num_chars','num_words','num_sentences']].describe()
import seaborn as sns
sns.histplot(df[df['target']==0]['num_chars'])
sns.histplot(df[df['target']==1]['num_chars'],color='red')
sns.histplot(df[df['target']==0]['num_words'])
sns.histplot(df[df['target']==1]['num_words'],color='red')
sns.pairplot(df,hue='target')
# correlate only the numeric columns; the raw text columns would break df.corr()
sns.heatmap(df.corr(numeric_only=True),annot=True)
# Data Preprocessing
### Lower case
### Tokenization
### Removing special characters
### Removing stop words and punctuation
### Stemming
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# keep a separate name so the stopwords module is not shadowed
stop_words = stopwords.words('english')
import string
puncs = string.punctuation
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    y = []
    # keep only alphanumeric tokens (drops special characters)
    for i in text:
        if i.isalnum():
            y.append(i)
    text = y[:]
    y.clear()
    # remove stop words and punctuation
    for i in text:
        if i not in stop_words + list(puncs):
            y.append(i)
    text = y[:]
    y.clear()
    # stem each remaining token
    for i in text:
        y.append(ps.stem(i))
    return " ".join(y)
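# a quick sanity check of the cleaning pipeline on an illustrative sentence:
# lowercased, tokenized, special characters/stop words/punctuation removed, stemmed
print(transform_text("Did you like my presentation on Machine Learning?"))
# expected output along the lines of: 'like present machin learn'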
df['transformed_text']=df['text'].apply(transform_text)
df.sample(5)
# !pip install wordcloud
from wordcloud import WordCloud
wc=WordCloud(width=500,height=500,min_font_size=10,background_color='white')
spam_wc=wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))
# plt.figure(figsize=(15,6))
plt.imshow(spam_wc)
ham_wc=wc.generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))
# plt.figure(figsize=(15,6))
plt.imshow(ham_wc)
spam_words = []
for msg in df[df['target']==1]['transformed_text'].tolist():
    for word in msg.split():
        spam_words.append(word)
len(spam_words)
from collections import Counter
most_common_spam = pd.DataFrame(Counter(spam_words).most_common(30))
plt.bar(most_common_spam[0], most_common_spam[1])
plt.xticks(rotation='vertical')
plt.show()
ham_words = []
for msg in df[df['target']==0]['transformed_text'].tolist():
    for word in msg.split():
        ham_words.append(word)
len(ham_words)
most_common_ham = pd.DataFrame(Counter(ham_words).most_common(30))
plt.bar(most_common_ham[0], most_common_ham[1])
plt.xticks(rotation='vertical')
plt.show()
# MODEL BUILDING -> starting with Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['transformed_text']).toarray()
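# optional peek at the learned vocabulary (get_feature_names_out requires sklearn >= 1.0);
# its size is capped at 3000 by max_features above
print(len(tfidf.get_feature_names_out()))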
X.shape
y = df['target'].values
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))
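# note: the classes are imbalanced (roughly 87% ham), so precision on the spam class is
# the headline metric here rather than accuracy: a false positive sends a genuine message
# to the spam folder; with TF-IDF features MultinomialNB tends to stand out on precision,
# which is why it is carried forward below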
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
!pip install xgboost
from xgboost import XGBClassifier
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)
def train_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy, precision
train_classifier(svc,X_train,y_train,X_test,y_test)
clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
}
# evaluate every classifier; the score lists and performance_df are used by later cells
accuracy_scores = []
precision_scores = []
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print("For ", name)
    print("Accuracy - ", current_accuracy)
    print("Precision - ", current_precision)
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)
performance_df
performance_df1 = pd.melt(performance_df, id_vars="Algorithm")
performance_df1
sns.catplot(x='Algorithm', y='value', hue='variable', data=performance_df1, kind='bar', height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()
# model improvement
# 1. change the max_features parameter of TfIdf, re-run the vectorizer and the loop above,
#    then snapshot the scores under new column names and merge for comparison
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)
new_df = performance_df.merge(temp_df,on='Algorithm')
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores}).sort_values('Precision_scaling',ascending=False)
new_df_scaled = new_df.merge(temp_df,on='Algorithm')
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_num_chars':accuracy_scores,'Precision_num_chars':precision_scores}).sort_values('Precision_num_chars',ascending=False)
# new_df_scaled.merge(temp_df,on='Algorithm')
# Voting Classifier
svc = SVC(kernel='sigmoid', gamma=1.0,probability=True)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
from sklearn.ensemble import VotingClassifier
voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)],voting='soft')
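# soft voting averages the predict_proba outputs of the three models, which is why the
# SVC above had to be recreated with probability=True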
voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))
# Applying stacking
estimators=[('svm', svc), ('nb', mnb), ('et', etc)]
final_estimator=RandomForestClassifier()
# from sklearn.ensemble import StackingClassifier
# clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
# clf.fit(X_train,y_train)
# y_pred = clf.predict(X_test)
# print("Accuracy",accuracy_score(y_test,y_pred))
# print("Precision",precision_score(y_test,y_pred))
import pickle
# mnb was re-instantiated for the voting classifier above, so refit it before saving
mnb.fit(X_train,y_train)
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))
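# a minimal inference sketch (the file names match the dumps above; the sample message is
# illustrative): reload the saved vectorizer and model, apply the same text cleaning,
# and classify a new SMS
vectorizer = pickle.load(open('vectorizer.pkl','rb'))
model = pickle.load(open('model.pkl','rb'))
sms = "Congratulations! You have won a free prize. Call now!"
vec = vectorizer.transform([transform_text(sms)]).toarray()
print('spam' if model.predict(vec)[0] == 1 else 'ham')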