sentiment-analysis-text-classification
March 24, 2024
[24]: import pandas as pd
import numpy as np
import nltk
import re  # regex
# splitting the data into training and testing
from sklearn.model_selection import train_test_split
# model
from sklearn.naive_bayes import MultinomialNB
# stemming
from nltk.stem import PorterStemmer
# stopwords
from nltk.corpus import stopwords
# count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
# tokenizers
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
# evaluation metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# visualizations
import seaborn as sns
import matplotlib.pyplot as plt
[25]: #Loading the Dataset
data = pd.read_csv('Feedback.csv')
[26]: data.head()
[26]: Text Sentiment
0 I love spending time with my family. Positive
1 This movie is absolutely terrible. Negative
2 The food at that restaurant was amazing. Positive
3 I had a horrible experience at the dentist. Negative
4 The weather today is perfect. Positive
[27]: # row and column count
data.shape
[27]: (20, 2)
[28]: # count of the negative and positive sentiments
data['Sentiment'].value_counts()
[28]: Positive 10
Negative 10
Name: Sentiment, dtype: int64
[29]: # assign the count vectorizer to a variable
countvectorizer = CountVectorizer()
# get the document-term matrix
DTM = pd.DataFrame(countvectorizer.fit_transform(data["Text"]).toarray(),
                   columns=countvectorizer.get_feature_names_out())
DTM
[29]: absolutely amazing and at awful bad being best book breathtaking \
0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0
2 0 1 0 1 0 0 0 0 0 0
3 0 0 0 1 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0
5 0 0 0 1 1 0 0 0 0 0
6 0 0 0 0 0 0 0 0 1 0
7 0 0 0 0 0 0 0 0 0 0
8 0 1 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 1 0 0 0
10 0 0 0 0 0 0 0 1 0 0
11 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 1
13 0 0 0 1 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 1 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0
17 0 0 1 0 0 0 0 0 0 0
18 0 0 0 1 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0
… too top traffic ve view was wasn waste weather with
0 … 0 0 0 0 0 0 0 0 0 1
1 … 0 0 0 0 0 0 0 0 0 0
2 … 0 0 0 0 0 1 0 0 0 0
3 … 0 0 0 0 0 0 0 0 0 0
4 … 0 0 0 0 0 0 0 0 1 0
5 … 0 0 0 0 0 1 0 0 0 0
6 … 0 0 0 0 0 0 0 0 0 0
7 … 0 0 0 0 0 0 0 0 0 1
8 … 0 0 0 0 0 1 0 0 0 0
9 … 0 0 1 0 0 0 0 0 0 0
10 … 0 0 0 1 0 0 0 0 0 0
11 … 0 0 0 0 0 0 0 0 0 1
12 … 0 1 0 0 1 1 0 0 0 0
13 … 0 0 0 0 0 1 0 0 0 0
14 … 0 0 0 0 0 0 0 0 0 0
15 … 1 0 1 0 0 0 1 0 0 0
16 … 0 0 0 0 0 0 0 0 0 1
17 … 0 0 0 0 0 1 0 1 0 0
18 … 0 0 0 0 0 0 0 0 0 0
19 … 0 0 0 0 0 0 0 0 0 0
[20 rows x 76 columns]
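To make the document-term matrix concrete, here is a minimal sketch of what CountVectorizer does on a two-sentence toy corpus (the sentences are hypothetical, not rows of Feedback.csv):
[ ]: # toy example: each row is a document, each column counts one vocabulary word
toy = ["the food was amazing", "the food was terrible"]
cv = CountVectorizer()
counts = cv.fit_transform(toy)        # sparse document-term matrix
print(cv.get_feature_names_out())     # ['amazing' 'food' 'terrible' 'the' 'was']
print(counts.toarray())               # [[1 1 0 1 1]
                                      #  [0 1 1 1 1]]
The 20 x 76 matrix above is built the same way, one column per unique token across all 20 feedback texts.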
[32]: DTM['Sentiment']=data['Sentiment']
DTM
[32]: absolutely amazing and at awful bad being best book breathtaking \
0 0 0 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0
2 0 1 0 1 0 0 0 0 0 0
3 0 0 0 1 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0
5 0 0 0 1 1 0 0 0 0 0
6 0 0 0 0 0 0 0 0 1 0
7 0 0 0 0 0 0 0 0 0 0
8 0 1 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 1 0 0 0
10 0 0 0 0 0 0 0 1 0 0
11 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 1
13 0 0 0 1 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 1 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0
17 0 0 1 0 0 0 0 0 0 0
18 0 0 0 1 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0
… top traffic ve view was wasn waste weather with Sentiment
0 … 0 0 0 0 0 0 0 0 1 Positive
1 … 0 0 0 0 0 0 0 0 0 Negative
2 … 0 0 0 0 1 0 0 0 0 Positive
3 … 0 0 0 0 0 0 0 0 0 Negative
4 … 0 0 0 0 0 0 0 1 0 Positive
5 … 0 0 0 0 1 0 0 0 0 Negative
6 … 0 0 0 0 0 0 0 0 0 Positive
7 … 0 0 0 0 0 0 0 0 1 Negative
8 … 0 0 0 0 1 0 0 0 0 Positive
9 … 0 1 0 0 0 0 0 0 0 Negative
10 … 0 0 1 0 0 0 0 0 0 Positive
11 … 0 0 0 0 0 0 0 0 1 Negative
12 … 1 0 0 1 1 0 0 0 0 Positive
13 … 0 0 0 0 1 0 0 0 0 Negative
14 … 0 0 0 0 0 0 0 0 0 Positive
15 … 0 1 0 0 0 1 0 0 0 Positive
16 … 0 0 0 0 0 0 0 0 1 Negative
17 … 0 0 0 0 1 0 1 0 0 Negative
18 … 0 0 0 0 0 0 0 0 0 Positive
19 … 0 0 0 0 0 0 0 0 0 Negative
[20 rows x 77 columns]
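Note that this labelled matrix is shown for inspection only; the model below is trained on the preprocessed text_counts matrix built by the next cell, not on DTM.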
[8]: # text preprocessing function
def preprocess_text(text):
    # convert text to lowercase
    text = text.apply(lambda x: x.lower())
    # tokenize text
    text = text.apply(lambda x: nltk.word_tokenize(x))
    # remove stop words
    stop_words = set(stopwords.words('english'))
    text = text.apply(lambda x: [word for word in x if word not in stop_words])
    # stem each token
    stemmer = PorterStemmer()
    text = text.apply(lambda x: [stemmer.stem(word) for word in x])
    # combine tokens back into a single string
    text = text.apply(lambda x: ' '.join(x))
    # remove non-alphanumeric characters using regex
    text = text.apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
    # vectorize the text using CountVectorizer
    countvectorizer = CountVectorizer()
    X = countvectorizer.fit_transform(text)
    # return the vectorized text and the vocabulary
    return X, countvectorizer.vocabulary_
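To see what each step does, the sketch below traces one sentence from the dataset through the same pipeline; the per-step results are what NLTK's tokenizer, the English stopword list, and the Porter stemmer produce:
[ ]: sentence = "The food at that restaurant was amazing."
tokens = nltk.word_tokenize(sentence.lower())
# ['the', 'food', 'at', 'that', 'restaurant', 'was', 'amazing', '.']
tokens = [w for w in tokens if w not in set(stopwords.words('english'))]
# ['food', 'restaurant', 'amazing', '.']
stems = [PorterStemmer().stem(w) for w in tokens]
# ['food', 'restaur', 'amaz', '.']  -- the regex step then strips the '.'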
[9]: # summarize the encoded texts into a sparse matrix
text_counts = preprocess_text(data["Text"])[0]
[10]: # convert the sparse matrix to a dense array and check its shape
text_counts.toarray()
text_counts.toarray().shape
[10]: (20, 58)
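Preprocessing shrinks the vocabulary from the 76 raw tokens seen earlier to 58 stems, since stopwords are dropped and inflected forms collapse onto a common stem.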
[11]: # print the identified unique stems along with their column indices
preprocess_text(data["Text"])[1]
[11]: {'love': 27,
'spend': 44,
'time': 50,
'famili': 18,
'movi': 30,
'absolut': 0,
'terribl': 49,
'food': 20,
'restaur': 41,
'amaz': 1,
'horribl': 22,
'experi': 17,
'dentist': 12,
'weather': 57,
'today': 51,
'perfect': 34,
'custom': 11,
'servic': 43,
'store': 46,
'aw': 2,
'realli': 40,
'enjoy': 15,
'book': 5,
'disappoint': 14,
'concert': 9,
'ca': 7,
'nt': 33,
'stand': 45,
'stuck': 47,
'traffic': 53,
'best': 4,
'pizza': 35,
've': 54,
'ever': 16,
'qualiti': 39,
'product': 36,
'view': 55,
'top': 52,
'mountain': 29,
'breathtak': 6,
'new': 31,
'design': 13,
'room': 42,
'bad': 3,
'surprisingli': 48,
'frustrat': 21,
'lack': 25,
'progress': 37,
'project': 38,
'complet': 8,
'wast': 56,
'money': 28,
'fantast': 19,
'last': 26,
'night': 32,
'internet': 24,
'connect': 10,
'hotel': 23}
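The odd-looking entries 'ca' and 'nt' come from NLTK's tokenizer, which splits contractions such as "can't" into 'ca' and "n't"; the regex step then strips the apostrophe. A quick check (a sketch):
[ ]: nltk.word_tokenize("i can't stand this")
# ['i', 'ca', "n't", 'stand', 'this']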
[12]: # splitting the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts, data['Sentiment'], test_size=0.2, random_state=5)
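With only 20 rows, test_size=0.2 leaves just 4 test examples, so the split can easily end up class-imbalanced. One optional variant (an assumption, not part of the original run) is a stratified split, which preserves the 50/50 class ratio in both subsets:
[ ]: # hypothetical variant: stratified split keeps the class balance
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts, data['Sentiment'], test_size=0.2,
    random_state=5, stratify=data['Sentiment'])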
Training the model
[13]: #Creating the Naïve Bayes Classifier Model
MNB = MultinomialNB()
# Train the model with training data
MNB.fit(X_train, Y_train)
[13]: MultinomialNB()
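A fitted MultinomialNB exposes the parameters it learned; a quick sketch for inspecting them, using sklearn's standard attribute names:
[ ]: print(MNB.class_log_prior_)        # log frequency of each class in the training data
print(MNB.feature_log_prob_.shape)      # one smoothed log P(word | class) per class and stem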
Predict the class of the unseen data
[14]: #get the model predictions for the test set
y_pred = MNB.predict(X_test)
y_pred
[14]: array(['Negative', 'Negative', 'Positive', 'Negative'], dtype='<U8')
[15]: # compare the actual and predicted labels side by side
results = {'Actual': Y_test,
           'Predicted': y_pred}
outputs = pd.DataFrame(results)
outputs
[15]: Actual Predicted
2 Positive Negative
5 Negative Negative
17 Negative Positive
19 Negative Negative
Get Evaluation Metrics
[16]: #accuracy values
accuracy_score(Y_test,y_pred)
[16]: 0.5
[17]: #obtain the confusion matrix
confusion_matrix(Y_test,y_pred)
[17]: array([[2, 1],
[1, 0]], dtype=int64)
[18]: #confusion matrix visualization
sns.heatmap(confusion_matrix(Y_test,y_pred),annot=True,fmt="g")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
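By default confusion_matrix orders the classes alphabetically, so both axes above run [Negative, Positive]. Passing the labels explicitly (a sketch) makes the heatmap self-describing:
[ ]: labels = ['Negative', 'Positive']
cm = confusion_matrix(Y_test, y_pred, labels=labels)
sns.heatmap(cm, annot=True, fmt="g", xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()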
Classification Report
[19]: print(classification_report(Y_test,y_pred))
              precision    recall  f1-score   support

    Negative       0.67      0.67      0.67         3
    Positive       0.00      0.00      0.00         1

    accuracy                           0.50         4
   macro avg       0.33      0.33      0.33         4
weighted avg       0.50      0.50      0.50         4
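Reading the report: the single Positive test example was misclassified, so Positive precision and recall are both 0. The macro average is the unweighted mean of the per-class scores, (0.67 + 0.00) / 2 ≈ 0.33, while the weighted average weights each class by its support, (0.67·3 + 0.00·1) / 4 = 0.50, matching the 0.5 accuracy.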