NLP LAB
PROGRAMS
PROGRAM 1
AIM: To study Preprocessing of text (Tokenization, Filtration, Script Validation, Stop Word
Removal, and Stemming)
i. Word and sentence tokenization:
PROGRAM:
from nltk import word_tokenize, sent_tokenize
sent = "GeeksforGeeks is a great learning platform.\
It is one of the best for Computer Science students."
print(word_tokenize(sent))
print(sent_tokenize(sent))
OUTPUT:
ii. Stemming:
PROGRAM:
from nltk.stem import PorterStemmer
# create an object of class PorterStemmer
porter = PorterStemmer()
print(porter.stem("play"))
print(porter.stem("playing"))
print(porter.stem("plays"))
print(porter.stem("played"))
OUTPUT:
iii. Lemmatization:
PROGRAM:
from nltk.stem import WordNetLemmatizer
# create an object of class WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("plays", 'v'))
print(lemmatizer.lemmatize("played", 'v'))
print(lemmatizer.lemmatize("play", 'v'))
print(lemmatizer.lemmatize("playing", 'v'))
OUTPUT:
iv. Part-of-speech (POS) tagging:
PROGRAM:
from nltk import pos_tag
from nltk import word_tokenize
text = "GeeksforGeeks is a Computer Science platform."
tokenized_text = word_tokenize(text)
tags = pos_tag(tokenized_text)
print(tags)
OUTPUT:
v. Stop word removal:
PROGRAM:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
print(stopwords.words('english'))
EXAMPLE_TEXT = ("Hello Mr. Smith, how are you doing today? The weather is great, and "
                "Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard.")
print(sent_tokenize(EXAMPLE_TEXT))
# Remove stop words from the word-tokenized text
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in word_tokenize(EXAMPLE_TEXT) if w.lower() not in stop_words]
print(filtered_words)
OUTPUT:
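The AIM also lists filtration and script validation, which are not demonstrated above. A minimal sketch, assuming filtration means keeping only alphabetic tokens and script validation means checking that every character of a token belongs to the basic Latin script:
from nltk import word_tokenize
text = "GeeksforGeeks is a great learning platform."
tokens = word_tokenize(text)
# Filtration: keep only alphabetic tokens (drops punctuation and numbers)
filtered = [t for t in tokens if t.isalpha()]
print(filtered)
# Script validation: flag tokens containing characters outside basic Latin
def is_latin(token):
    return all(ch.isascii() and ch.isalpha() for ch in token)
print([t for t in filtered if not is_latin(t)])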
RESULT: The above program was executed successfully in Python and the output was verified.
PROGRAM 2
AIM: To write a program in python using NLTK library to study Word Generation.
LIST:
PROGRAM:
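Note: the interactive examples below assume the NLTK book texts have already been loaded, which defines the built-in variables text1-text9 and sent1-sent9:
>>> from nltk.book import *
>>>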
i. General list:
>>> sent2
['The', 'family', 'of', 'Dashwood', 'had', 'long',
'been', 'settled', 'in', 'Sussex', '.']
>>> sent3
['In', 'the', 'beginning', 'God', 'created', 'the',
'heaven', 'and', 'the', 'earth', '.']
ii. Concatenation of two lists:
>>> ['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']
['Monty', 'Python', 'and', 'the', 'Holy', 'Grail']
>>>
>>> sent4 + sent1
['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the',
'House', 'of', 'Representatives', ':', 'Call', 'me', 'Ishmael', '.']
>>>
iii. Appending to a list:
>>> sent1.append("Some")
>>> sent1
['Call', 'me', 'Ishmael', '.', 'Some']
>>>
INDEXING LISTS:
PROGRAM:
i. Finding the index of an item in a list:
>>> text4.index('awaken')
173
>>>
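The index can also be used the other way round to look the word up again (as in the NLTK book):
>>> text4[173]
'awaken'
>>>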
ii. Slicing
>>> text5[16715:16735]
['U86', 'thats', 'why', 'something', 'like', 'gamefly', 'is', 'so',
'good',
'because', 'you', 'can', 'actually', 'play', 'a', 'full', 'game',
'without',
'buying', 'it']
>>> text6[1600:1625]
['We', "'", 're', 'an', 'anarcho', '-', 'syndicalist', 'commune', '.',
'We',
'take', 'it', 'in', 'turns', 'to', 'act', 'as', 'a', 'sort', 'of',
'executive',
'officer', 'for', 'the', 'week']
>>>
VARIABLES:
PROGRAM:
i. Declaration:
>>> sent1 = ['Call', 'me', 'Ishmael', '.']
>>>
ii. Variables and assignment:
>>> my_sent = ['Bravely', 'bold', 'Sir', 'Robin', ',', 'rode',
... 'forth', 'from', 'Camelot', '.']
>>> noun_phrase = my_sent[1:4]
>>> noun_phrase
['bold', 'Sir', 'Robin']
>>> wOrDs = sorted(noun_phrase)
>>> wOrDs
['Robin', 'Sir', 'bold']
>>>
STRINGS:
PROGRAM:
i. Declaration of string:
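No code appears in the record for this item; a minimal sketch in the same interactive style:
>>> name = 'Monty'
>>> name[0]
'M'
>>> name[:4]
'Mont'
>>>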
ii. Multiplication and addition with strings:
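Likewise, a short illustrative sketch for this item:
>>> name = 'Monty'
>>> name * 2
'MontyMonty'
>>> name + '!'
'Monty!'
>>>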
iii. Split a string into a list:
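And a sketch for splitting (and joining) a string:
>>> ' '.join(['Monty', 'Python'])
'Monty Python'
>>> 'Monty Python'.split()
['Monty', 'Python']
>>>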
STEMMING WORDS WITH NLTK
PROGRAM:
# import these modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
# choose some words to be stemmed
words = ["program", "programs", "programmer", "programming", "programmers"]
for w in words:
    print(w, " : ", ps.stem(w))
OUTPUT:
RESULT: The above program was executed successfully in Python and the output was verified.
PROGRAM 3
AIM: To write a program in python using NLTK library to study Sentiment Analysis.
SENTIMENT ANALYZER:
PROGRAM:
Step 1: Import libraries and load dataset
# import libraries
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# download nltk data (first time only)
nltk.download('all')
# Load the Amazon review dataset
df = pd.read_csv('https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv')
df
Step 2: Preprocessing text
# create preprocess_text function
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text
# apply the function to the review text column
df['reviewText'] = df['reviewText'].apply(preprocess_text)
df
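As a quick sanity check (illustrative sentence, not from the dataset), the function lowercases the text, removes stop words, and lemmatizes the remaining tokens; with the default noun lemmatizer the expected result is roughly:
print(preprocess_text("The cats are running in the gardens"))
# expected: 'cat running garden'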
Step 3: NLTK Sentiment Analyzer
# initialize NLTK sentiment analyzer
analyzer = SentimentIntensityAnalyzer()
# create get_sentiment function
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    # label as positive (1) if any positive score is present, otherwise negative (0)
    sentiment = 1 if scores['pos'] > 0 else 0
    return sentiment
# apply get_sentiment function
df['sentiment'] = df['reviewText'].apply(get_sentiment)
df
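Note that get_sentiment labels a review as positive whenever the 'pos' score is non-zero. A common alternative (not part of the original program) is to threshold VADER's compound score, 0.05 being the cutoff suggested in the VADER documentation:
# alternative labelling rule using the compound score
def get_sentiment_compound(text):
    scores = analyzer.polarity_scores(text)
    return 1 if scores['compound'] >= 0.05 else 0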
Step 4: confusion matrix.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(df['Positive'], df['sentiment']))
OUTPUT:
Step 5: classification report
from sklearn.metrics import classification_report
print(classification_report(df['Positive'], df['sentiment']))
OUTPUT:
RESULT: The above program was executed successfully in Python and the output was verified.
PROGRAM 4
AIM: To write a program in python using NLTK library to study N-gram model.
PROGRAM:
# imports
import string
import random
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('reuters')
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk import FreqDist
from collections import defaultdict, Counter
# input the reuters sentences
sents = reuters.sents()
# build the removal list: stop words and punctuation
stop_words = set(stopwords.words('english'))
string.punctuation = string.punctuation + '“' + '”' + '-' + '‘' + '’' + '—'
string.punctuation
removal_list = list(stop_words) + list(string.punctuation) + ['lt', 'rt']
removal_list
# generate unigrams, bigrams and trigrams
unigram = []
bigram = []
trigram = []
tokenized_text = []
for sentence in sents:
    # lowercase the sentence and drop sentence-final periods
    sentence = [w.lower() for w in sentence if w != '.']
    unigram.extend(sentence)
    tokenized_text.append(sentence)
    bigram.extend(list(ngrams(sentence, 2, pad_left=True, pad_right=True)))
    trigram.extend(list(ngrams(sentence, 3, pad_left=True, pad_right=True)))
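# Illustrative check (not part of the original record): what padded bigrams
# look like for a toy sentence, with the default pad symbol None.
# list(ngrams(['the', 'price', 'rose'], 2, pad_left=True, pad_right=True))
# -> [(None, 'the'), ('the', 'price'), ('price', 'rose'), ('rose', None)]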
# remove the n-grams with removable words
def remove_stopwords(x):
    y = []
    for pair in x:
        count = 0
        for word in pair:
            if word in removal_list:
                count = count or 0
            else:
                count = count or 1
        if count == 1:
            # keep the n-gram if at least one element is not a stop word / punctuation
            y.append(pair)
    return y
unigram = remove_stopwords(unigram)
bigram = remove_stopwords(bigram)
trigram = remove_stopwords(trigram)
# generate frequency distributions of the n-grams
freq_bi = FreqDist(bigram)
freq_tri = FreqDist(trigram)
# count trigram continuations: d[(a, b)][c] = frequency of the trigram (a, b, c)
d = defaultdict(Counter)
for a, b, c in freq_tri:
    if a is not None and b is not None and c is not None:
        d[a, b][c] += freq_tri[a, b, c]
# Next word prediction
s = ''
def pick_word(counter):
    "Chooses a random element."
    return random.choice(list(counter.elements()))
prefix = "he", "said"
print(" ".join(prefix))
s = " ".join(prefix)
for i in range(19):
    suffix = pick_word(d[prefix])
    s = s + ' ' + suffix
    print(s)
    prefix = prefix[1], suffix
OUTPUT:
RESULT: The above program was executed successfully in Python and the output was verified.
PROGRAM 5
AIM: To write a program in python using NLTK library to study N-Grams Smoothing.
PROGRAM:
Step 1:
import nltk
from nltk.corpus import brown
from collections import defaultdict, Counter
wds = brown.words()
N = len(wds)
print(N)
OUTPUT:
1161192
Step 2:
mle_unigram_dist = nltk.FreqDist([w.lower() for w in wds])
bigram_seq = list(nltk.bigrams(wds))
bigram_N = len(bigram_seq)
print(bigram_N)
OUTPUT:
1161191
Step 3:
wds[:10]
OUTPUT:
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']
Step 4:
bigram_seq[:10]
Output:
[('The', 'Fulton'), ('Fulton', 'County'), ('County', 'Grand'), ('Grand', 'Jury'), ('Jury', 'said'), ('said',
'Friday'), ('Friday', 'an'), ('an', 'investigation'), ('investigation', 'of'), ('of', "Atlanta's")]
Step 5: frequency distribution:
# MLE stands for Maximum Likelihood Estimate
mle_bigram_dist = nltk.FreqDist((x.lower(),y.lower()) for (x,y) in bigram_seq)
print(mle_unigram_dist)
print(mle_unigram_dist['the'])
print(mle_bigram_dist)
print(mle_bigram_dist['the','only'])
OUTPUT:
<FreqDist with 49815 samples and 1161192 outcomes>
69971
<FreqDist with 436003 samples and 1161191 outcomes>
258
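From the counts above, the unsmoothed (MLE) bigram probability can be read off directly; for example:
# P_mle(only | the) = c('the', 'only') / c('the') = 258 / 69971
print(mle_bigram_dist[('the', 'only')] / mle_unigram_dist['the'])  # approx. 0.00369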
Step 6:
print(mle_bigram_dist['the','time'])
print(mle_bigram_dist['the','boy'])
print(mle_bigram_dist['the','red'])
OUTPUT:
251
81
44
Step 7:
print(49815**2)
print(f'{49815**2:,}')
OUTPUT:
2481534225
2,481,534,225
Step 8:
print(436003/(49815**2))
print(f'{436003/(49815**2):.3%}')
OUTPUT:
0.00017569896703721667
0.018%
Step 9: Normalization
# V is the vocabulary size (number of distinct lower-cased word types)
V = len(mle_unigram_dist)
norm_factor = float(N)/(N + V**2)
bigram_norm_factor = float(bigram_N)/(bigram_N + V**2)
new_cts = defaultdict(lambda: norm_factor)
for w in mle_unigram_dist:
    new_cts[w] = mle_unigram_dist[w] * norm_factor
new_bigram_cts = defaultdict(lambda: bigram_norm_factor)
for big in mle_bigram_dist:
    new_bigram_cts[big] = mle_bigram_dist[big] * bigram_norm_factor
new_laplace_cts = [float(new_bigram_cts[('in','the')]),
                   float(new_bigram_cts[('said','the')]),
                   float(new_bigram_cts[('sewer','brother')]),
                   ]
old_cts = [float(mle_bigram_dist[('in','the')]),
           float(mle_bigram_dist[('said','the')]),
           float(mle_bigram_dist[('sewer','brother')]),
           ]
print()
print('{0:<16s} {1:^9s} {2:^9s}'.format('', 'Raw cts', 'Smthd Cts'))
for (i, p) in enumerate(['in_the', 'said_the', 'sewer_brother']):
    print('{0:<16s} {1:<6} {2:<6.7f}'.format(p, old_cts[i], new_laplace_cts[i]))
print()
print('{0:<16s} {1:^9s} {2:^9s}'.format('', 'Raw cts', 'Smthd Cts'))
for (i, p) in enumerate(['in']):
    print('{0:<16s} {1:<6} {2:<6.7f}'.format(p, mle_unigram_dist[p], new_cts[p]))
print()
print('P(the | in)', end=' ')
print(new_bigram_cts['in','the']/new_cts['in'])
OUTPUT:
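For comparison, the textbook add-one (Laplace) conditional probability can be computed directly from the raw counts; a minimal sketch, where V is the unigram vocabulary size used in Step 9:
V = len(mle_unigram_dist)  # vocabulary size (same V as in Step 9)
def laplace_bigram_prob(w1, w2):
    # P_laplace(w2 | w1) = (c(w1, w2) + 1) / (c(w1) + V)
    return (mle_bigram_dist[(w1, w2)] + 1) / (mle_unigram_dist[w1] + V)
print('P_laplace(the | in):', laplace_bigram_prob('in', 'the'))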
RESULT: The above program was executed successfully in Python and the output was verified.