Natural Language Processing
Install nltk
conda install -c anaconda nltk
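If you are not using Anaconda, nltk can also be installed with pip:
pip install nltk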
Data Set: Restaurant_Reviews.tsv (Tab-Separated Values File)
Import Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
Import Data Set
os.chdir('C:\\Noble\\Training\\Deep Learning\\Training\\Data\\')
os.getcwd()
# delimiter = '\t' – the file is tab separated
# quoting = 3 – ignore double quotes during parsing
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
dataset
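The literal 3 corresponds to csv.QUOTE_NONE. If you prefer a self-documenting constant, the same read can be written as below (a readability sketch, not a change in behaviour):
import csv
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = csv.QUOTE_NONE)
dataset.shape # expect 1000 rows, as the cleaning loop below assumes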
Get one row from the data set – for example, the review at index 5
dataset['Review'][5]
To Print / View all stop words
import nltk # for stop words
from nltk.corpus import stopwords
nltk.download('stopwords')
all_stopwords = stopwords.words('english')
print (all_stopwords)
Cleaning the Data Set
import re
# re – Regular expression - https://docs.python.org/3/library/re.html
import nltk # for stop words
nltk.download('stopwords') # importing all stopwords
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # for applying stemming to the dataset, to get the root of each word
corpus = [] # create a list to store all cleaned reviews
for i in range(0, 1000):
    # dataset['Review'][i] – source data to process – the i-th record in the data set
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) # replace everything other than letters with a space
    review = review.lower()
    review = review.split() # split into individual words
    ps = PorterStemmer() # get root words
    all_stopwords = stopwords.words('english') # get English stop words
    all_stopwords.remove('not') # remove 'not' from the stop words so negations are kept
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)
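As a quick sanity check, the same steps can be applied to a single string (the sentence below is illustrative, not from the data set); ps and all_stopwords are still defined from the loop above:
sample = re.sub('[^a-zA-Z]', ' ', 'The food was NOT good!!!').lower().split()
# ['the', 'food', 'was', 'not', 'good']
sample = [ps.stem(word) for word in sample if not word in set(all_stopwords)]
print (' '.join(sample)) # 'food not good' – 'the' and 'was' are dropped, 'not' is kept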
Print Corpus
print (corpus)
To Check the Number of Distinct Words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer() # no max_features yet – first run this to see how many distinct words exist
X = cv.fit_transform(corpus).toarray()
len(X[0])
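To see which words occur most often before deciding on a cutoff, you can rank the column sums of X against the fitted vocabulary (this inspection step is an addition to the original flow; get_feature_names_out requires scikit-learn 1.0+, older versions use get_feature_names):
counts = X.sum(axis = 0) # total occurrences of each word across all reviews
vocab = cv.get_feature_names_out()
for idx in np.argsort(counts)[::-1][:10]: # indices of the 10 most frequent words
    print (vocab[idx], counts[idx])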
Create a Bag of Words (tokenization)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500) # 1500 was chosen from len(X[0]) above – first execute without max_features
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values # this is dependent variable
print(len(X[0])) # this gives me the max_features count
print (X)
print (y)
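A quick check of the class balance (the exact counts depend on your copy of the file):
values, counts = np.unique(y, return_counts = True)
print (dict(zip(values, counts))) # e.g. {0: 500, 1: 500} for a balanced data set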
Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
Print Size
print (X.shape)
print (X_train.shape)
print (X_test.shape)
Create a Naïve Bayes Classifier
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
Prediction
y_pred = classifier.predict(X_test)
Print Predicted vs. Actual Results
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)) # predicted and actual labels, side by side
Confusion Matrix and Accuracy Score
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)
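Accuracy can also be read straight off the confusion matrix: correct predictions (the diagonal) divided by all predictions. This is equivalent to accuracy_score:
print ((cm[0, 0] + cm[1, 1]) / cm.sum()) # (true negatives + true positives) / total test samples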
Create Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state = 0) # fix the seed so results are reproducible
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
cm = confusion_matrix(y_test,dt_pred)
print(cm)
accuracy_score(y_test,dt_pred)
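To classify a brand-new review (the sentence below is illustrative), run it through the same cleaning steps and the already-fitted CountVectorizer. Note cv.transform rather than fit_transform, so the vocabulary learned above stays fixed:
new_review = re.sub('[^a-zA-Z]', ' ', 'The food was not good at all').lower().split()
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_X = cv.transform([' '.join(new_review)]).toarray() # dense array, as the classifiers expect
print (classifier.predict(new_X)) # 0 = negative, 1 = positive in this data set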