12/5/2020 TwitterHate_NLP.ipynb - Colaboratory
Importing Packages
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
True
Loading Twitter Dataset
# Read the labelled tweets CSV, report how many rows it has, and preview
# the first few records.
sentiment_data = pd.read_csv(
    '/content/TwitterHate.csv'
)
row_count = len(sentiment_data)
print(row_count)
sentiment_data.head()
31962
id label tweet
0 1 0 @user when a father is dysfunctional and is s...
1 2 0 @user @user thanks for #lyft credit i can't us...
2 3 0 bihday your majesty
3 4 0 #model i love u take with u all the time in ...
4 5 0 factsguide: society now #motivation
https://colab.research.google.com/drive/1kZR_kENRbH_zbQ4BKUJ3zuhFOaoL_2ce#scrollTo=LsXwJKKw910w&printMode=true 1/8
12/5/2020 TwitterHate_NLP.ipynb - Colaboratory
# Count the rows per class label. The counts show an imbalanced dataset.
label_column = sentiment_data['label']
label_column.value_counts()
0 29720
1 2242
Name: label, dtype: int64
# from imblearn.over_sampling import RandomOverSampler
/usr/local/lib/python3.6/dist-packages/sklearn/externals/six.py:31: FutureWarning: The m
"(https://pypi.org/project/six/).", FutureWarning)
/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning:
warnings.warn(message, FutureWarning)
Cleaning Text using Regex
def textcleanup(data):
    """Clean and tokenize every tweet in ``data['tweet']``.

    For each tweet the function drops non-ASCII characters, applies a fixed
    sequence of regex removals (dotted tokens such as URLs, ``#`` and
    apostrophes, ``@handle`` mentions, a leading ``RT`` marker, ``+``-prefixed
    alphanumeric runs, and stray ``amp`` words), lower-cases and strips the
    text, tokenizes it with ``TweetTokenizer`` and finally discards English
    stop words and single-character tokens.

    Args:
        data: Object whose ``'tweet'`` item iterates over tweet strings
            (the caller passes a pandas DataFrame).

    Returns:
        tuple: ``(tweet_list, word_list, stop_words)`` where ``tweet_list``
        holds one token list per tweet, ``word_list`` is all tokens
        flattened in order, and ``stop_words`` is the stop-word set used.
    """
    tokenizer = TweetTokenizer()
    stop_words = set(stopwords.words('english'))

    # Compiled once, applied in this order to every tweet. All patterns are
    # raw strings so that regex escapes such as \b reach the regex engine
    # instead of being interpreted by the Python string literal.
    removal_patterns = [
        # Any space-delimited token with a dot inside it (URLs). This is
        # broad: it also drops tokens like "a.b".
        re.compile(r'[^ ]+\.[^ ]+'),
        # Hash signs and apostrophes.
        re.compile(r"[#']"),
        # User handles.
        re.compile(r'@\w+'),
        # Leading retweet marker. The previous class ``[RT]`` removed a
        # single leading "R" or "T" from any tweet instead.
        re.compile(r'^\s*RT\b'),
        # '+'-prefixed alphanumeric runs ending in digits (the original
        # comment was cut off at "Remove redu"; exact intent unconfirmed).
        re.compile(r"\W+\+[A-Za-z0-9]+\d+\D|\+[A-Za-z0-9]+\d+\D+\w"),
        # Stand-alone "amp" words. Previously written as a non-raw string,
        # where \b meant a backspace character, so it never matched.
        re.compile(r'\b[a]+[m]+[p]\b'),
    ]

    tweet_list = []
    word_list = []
    for tweet in data['tweet']:
        # Drop every character that cannot be represented in ASCII.
        tweet = tweet.encode('ascii', 'ignore').decode('ascii')
        for pattern in removal_patterns:
            tweet = pattern.sub('', tweet)
        tokens = tokenizer.tokenize(tweet.lower().strip())
        tokens = [
            token for token in tokens
            if token not in stop_words and len(token) > 1
        ]
        tweet_list.append(tokens)
        word_list.extend(tokens)
    return tweet_list, word_list, stop_words
# Run the cleanup once and unpack its three results: per-tweet token lists,
# the flat token list, and the stop-word set that was applied.
cleanup_result = textcleanup(sentiment_data)
cleantext, wordlist, stop_words = cleanup_result
Getting 10 most common terms after cleaning the text
https://colab.research.google.com/drive/1kZR_kENRbH_zbQ4BKUJ3zuhFOaoL_2ce#scrollTo=LsXwJKKw910w&printMode=true 2/8
12/5/2020 TwitterHate_NLP.ipynb - Colaboratory
# Tally how often each cleaned token occurs, then show the ten most frequent.
word_count = Counter()
word_count.update(wordlist)
word_count.most_common(10)
[('love', 2725),
('day', 2247),
('happy', 1673),
('im', 1155),
('time', 1115),
('life', 1114),
('like', 1089),
('today', 993),
('new', 989),
('positive', 934)]
# Build the word cloud from the cleaned tokens joined into plain text.
# Passing str(wordlist) would hand WordCloud the list's repr (brackets,
# quotes and commas) rather than the words themselves.
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stop_words,
    min_font_size=10,
).generate(' '.join(wordlist))

# plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
https://colab.research.google.com/drive/1kZR_kENRbH_zbQ4BKUJ3zuhFOaoL_2ce#scrollTo=LsXwJKKw910w&printMode=true 3/8
12/5/2020 TwitterHate_NLP.ipynb - Colaboratory
Joining the tokens back to form strings.
# Turn each tweet's token list back into a single string, in the same order
# as the cleaned tweets, then show the first one.
detokenizer = TreebankWordDetokenizer()
clean_sentiments = [detokenizer.detokenize(tokens) for tokens in cleantext]
clean_sentiments[0]
'father dysfunctional selfish drags kids dysfunction run'
# Pair each original label with its cleaned tweet text in a new DataFrame
# and preview the first rows.
newframe = dict(
    labels=sentiment_data['label'],
    clean_sentiments=clean_sentiments,
)
sentiments_frame = pd.DataFrame(data=newframe)
sentiments_frame.head()
labels clean_sentiments
0 0 father dysfunctional selfish drags kids dysfun...
1 0 thanks lyft credit cant use cause dont offer w...
2 0 bihday majesty
3 0 model love take time ur
4 0 factsguide society motivation
Using TF-IDF values for the terms as a feature to get into a
vector space model
# TF-IDF vectorizer for the cleaned tweets. Terms appearing in more than
# half of the documents (max_df) or in fewer than 10 documents (min_df) are
# dropped, and the vocabulary is capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=10,
    strip_accents='unicode',
    max_features=5000,
)
# NOTE(review): the vectorizer is fitted on the full corpus here, i.e. before
# any train/test split, so vocabulary and IDF weights see the rows that are
# later used for evaluation.
tfidf_data = tfidf_vectorizer.fit_transform(sentiments_frame['clean_sentiments'])
Splitting Data into train, test and Creating Model
# Splitting Data
# The tail of this call was cut off in the exported notebook.
# test_size=0.2 reproduces the 6393-row test split (of 31962 rows) shown in
# the classification report for the test data.
# NOTE(review): the original random_state (if any) is not recoverable; the
# value below is a placeholder for reproducibility -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_data,
    sentiments_frame['labels'],
    test_size=0.2,
    random_state=42,
)

# Creating Model
# Baseline logistic regression with default settings; accuracy is printed
# for the training split first, then the test split.
model = LogisticRegression()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score)
print(test_score)
0.9557276389377762
0.9510402002189895
# Generating and Plotting Confusion Matrix
# Rows are the true labels of the test split, columns the model's predictions.
cf_matrix = confusion_matrix(y_test, model.predict(X_test))
plt.figure(figsize=(7, 5))
# fmt='d' prints the cell counts as plain integers; the default format
# abbreviates large counts into scientific notation.
sns.heatmap(cf_matrix, annot=True, fmt='d')
<matplotlib.axes._subplots.AxesSubplot at 0x7f2f86c36320>
# Classification Report for Test Data
# Per-class precision, recall and F1 of the baseline model on the test split.
test_predictions = model.predict(X_test)
test_report = classification_report(y_test, test_predictions)
print(test_report)
https://colab.research.google.com/drive/1kZR_kENRbH_zbQ4BKUJ3zuhFOaoL_2ce#scrollTo=LsXwJKKw910w&printMode=true 5/8
12/5/2020 TwitterHate_NLP.ipynb - Colaboratory
precision recall f1-score support
0 0.95 1.00 0.97 5937
1 0.90 0.35 0.51 456
accuracy 0.95 6393
macro avg 0.93 0.68 0.74 6393
weighted avg 0.95 0.95 0.94 6393
# Classification Report for Train Data
# Per-class precision, recall and F1 of the baseline model on the training split.
train_predictions = model.predict(X_train)
train_report = classification_report(y_train, train_predictions)
print(train_report)
precision recall f1-score support
0 0.96 1.00 0.98 23783
1 0.94 0.39 0.55 1786
accuracy 0.96 25569
macro avg 0.95 0.69 0.76 25569
weighted avg 0.96 0.96 0.95 25569
Using Grid Search and Stratified K-Fold for Hyperparameter Tuning
# Hyperparameter grid for the logistic regression.
# 'auto' is not an accepted class_weight value (only None, 'balanced' or a
# dict are), so the grid compares unweighted (None) against 'balanced'.
parameters = [{'penalty': ['l1', 'l2'],
               'C': [1, 10, 100, 1000],
               'class_weight': [None, 'balanced']}]
# solver='liblinear' supports both the 'l1' and 'l2' penalties in the grid;
# the default lbfgs solver rejects 'l1', which makes those fits fail.
# The search is scored on recall. With the default cv setting and a
# classifier, scikit-learn uses stratified folds.
# NOTE(review): the original call was cut off after the scoring argument in
# the exported notebook; any further arguments (e.g. cv) are unknown -- confirm.
grid_sr = GridSearchCV(
    LogisticRegression(class_weight="balanced", solver='liblinear'),
    parameters,
    scoring='recall',
)
grid_sr.fit(X_train, y_train)
grid_sr.best_params_
{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
# Evaluate the tuned configuration with stratified 4-fold cross-validation.
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
# enumerate the splits and summarize the distributions
for train_ix, test_ix in kfold.split(tfidf_data, sentiments_frame['labels']):
    # select rows (positional indexing, since the splitter yields positions)
    train_X, test_X = tfidf_data[train_ix], tfidf_data[test_ix]
    train_y = sentiments_frame['labels'].iloc[train_ix]
    test_y = sentiments_frame['labels'].iloc[test_ix]
    model_test = LogisticRegression(C=1, class_weight='balanced', penalty='l2')
    # Fit on this fold's own training rows. Fitting on X_train/y_train from
    # the earlier hold-out split would train on rows that belong to this
    # fold's test part and would ignore the fold's training part.
    model_test.fit(train_X, train_y)
    train_score1 = model_test.score(train_X, train_y)
    test_score1 = model_test.score(test_X, test_y)
    print('Train Score', train_score1)
    print('Test Score', test_score1)
    # Stop at the first fold whose test accuracy exceeds its train accuracy;
    # model_test, test_X/test_y and train_X/train_y then refer to that fold.
    if test_score1 > train_score1:
        break
Train Score 0.9289141045429894
Test Score 0.9300463020898511
# Per-class metrics of the cross-validated model on the selected fold's test rows.
fold_test_predictions = model_test.predict(test_X)
print(classification_report(test_y, fold_test_predictions))
precision recall f1-score support
0 0.99 0.93 0.96 7430
1 0.50 0.91 0.65 561
accuracy 0.93 7991
macro avg 0.75 0.92 0.80 7991
weighted avg 0.96 0.93 0.94 7991
# Per-class metrics of the cross-validated model on the selected fold's training rows.
fold_train_predictions = model_test.predict(train_X)
print(classification_report(train_y, fold_train_predictions))
precision recall f1-score support
0 0.99 0.93 0.96 22290
1 0.50 0.93 0.65 1681
accuracy 0.93 23971
macro avg 0.75 0.93 0.80 23971
weighted avg 0.96 0.93 0.94 23971
Best Parameters : (C= 1, class_weight= 'balanced', penalty= 'l2')
Recall on the test set for the toxic comments : 91%
F1 score on the test set for the toxic comments : 65%
https://colab.research.google.com/drive/1kZR_kENRbH_zbQ4BKUJ3zuhFOaoL_2ce#scrollTo=LsXwJKKw910w&printMode=true 7/8
12/5/2020 TwitterHate_NLP.ipynb - Colaboratory
https://colab.research.google.com/drive/1kZR_kENRbH_zbQ4BKUJ3zuhFOaoL_2ce#scrollTo=LsXwJKKw910w&printMode=true 8/8