Text Concordance

import nltk
from nltk.corpus import gutenberg
from nltk.text import Text

corpus = gutenberg.words("shakespeare-macbeth.txt")
text = Text(corpus)
text.concordance("monstrous")

Output

Displaying 1 of 1 matches:
Who cannot want the thought , how monstrous It was for Malcolme , and for Dona.
Vocabulary Count

import nltk

text = "welcome to the world"
words = nltk.word_tokenize(text)
num_words = len(words)
num_the = words.count('the')
unique_words = set(words)
num_unique_words = len(unique_words)
percent_unique = (num_unique_words / num_words) * 100
print(words)
print("the number of words:", num_words)
print('number of occurrences of "the":', num_the)
print("number of unique words:", num_unique_words)
print("percentage of unique words:", percent_unique)

Output

['welcome', 'to', 'the', 'world']
the number of words: 4
number of occurrences of "the": 1
number of unique words: 4
percentage of unique words: 100.0
Text Preprocessing

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('words')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import words

text = 'This is a sample text that we used to demonstrate NLTK text processing 123'
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
alpha_tokens = [token.lower() for token in tokens if token.isalpha()]
english_words = set(words.words())
valid_tokens = [token for token in alpha_tokens if token in english_words]
filtered_tokens = [token for token in valid_tokens if token not in stop_words]
stemmer_tokens = [stemmer.stem(token) for token in filtered_tokens]
print("Original text :", text)
print("Tokenized text :", tokens)
print("Filtered text :", filtered_tokens)
print("Validated text :", valid_tokens)
print("Alpha text :", alpha_tokens)
print("Stemmed text :", stemmer_tokens)

Output

Original text : This is a sample text that we used to demonstrate NLTK text processing 123
Tokenized text : ['This', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'NLTK', 'text', 'processing', '123']
Filtered text : ['sample', 'text', 'used', 'demonstrate', 'text']
Validated text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'text']
Alpha text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'nltk', 'text', 'processing']
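Stemming can clip tokens to stems that are not dictionary words. A common alternative is lemmatization, which maps each token to a valid base form; the sketch below runs WordNetLemmatizer over the same filtered_tokens for comparison (it assumes the WordNet data is downloaded).

# Sketch: lemmatization as an alternative to Porter stemming on the same tokens.
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print("Lemmatized text :", lemmatized_tokens)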
Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["This is the first document",
          "This document is the second document",
          "And this is the third one",
          "Is this the first document"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
for i in range(len(corpus)):
    print(f"BoW representation of Document {i+1}: {X[i].toarray()[0]}")

Output

BoW representation of Document 1: [0 1 1 1 0 0 1 0 1]
BoW representation of Document 2: [0 2 0 1 0 1 1 0 1]
BoW representation of Document 3: [1 0 0 1 1 0 1 1 1]
BoW representation of Document 4: [0 1 1 1 0 0 1 0 1]
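Each position in these vectors counts one word of the vocabulary that CountVectorizer learned, sorted alphabetically. A small sketch to make that mapping visible; get_feature_names_out is the current scikit-learn accessor, older releases use get_feature_names.

# Sketch: show which vocabulary word each column of the BoW vectors counts.
print(vectorizer.get_feature_names_out())
# expected order here: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']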
TF-IDF

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import math

def calculate_tf(word, document):
    word_frequency = document.count(word)
    return word_frequency / len(document)

def calculate_idf(word, corpus):
    num_documents_containing_word = len([True for document in corpus if word in document])
    if num_documents_containing_word == 0:
        return 0
    else:
        return math.log10(len(corpus) / num_documents_containing_word)

def calculate_tfidf(document, corpus):
    PS = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [PS.stem(word.lower()) for word in word_tokenize(document)
             if word.lower() not in stop_words]
    word_tfidf_values = {}
    for word in words:
        if word not in word_tfidf_values:
            tf = calculate_tf(word, words)
            idf = calculate_idf(word, corpus)
            word_tfidf_values[word] = tf * idf
    return word_tfidf_values

corpus = ["This is the first document",
          "This document is the second document",
          "And this is the third one",
          "Is this the first document"]
document = "This is the second document"
tfidf_vector = calculate_tfidf(document, corpus)
print(tfidf_vector)

Output

{'second': 0.3010299956639812, 'document': 0.06246936830414996}
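scikit-learn's TfidfVectorizer builds the same kind of weighting for the whole corpus at once. The sketch below applies it to the corpus above as a cross-check; its numbers will differ from the hand-rolled version because scikit-learn uses a smoothed natural-log IDF with L2-normalised rows, and does no stemming or stopword removal by default.

# Sketch: the same corpus through scikit-learn's TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(corpus)
print(tfidf.get_feature_names_out())
print(X_tfidf.toarray().round(2))   # one row of TF-IDF weights per document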
Pos Tagging

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

line = "quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(line)
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
pos_tags = nltk.pos_tag(lemmatized_tokens)
pos_word_corpus = [(word, tag) for word, tag in pos_tags]

for word, tag in pos_word_corpus:
    print(word, ":", tag)

Output

quick : JJ
brown : NN
fox : JJ
jump : NN
lazy : NN
dog : NN
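The tags follow the Penn Treebank tagset (JJ = adjective, NN = singular noun). NLTK can print the definition of any tag; the sketch below assumes the 'tagsets' data package is installed.

# Sketch: look up what a Penn Treebank tag means.
nltk.download('tagsets')
nltk.help.upenn_tagset('JJ')   # prints the definition and example words for JJ
nltk.help.upenn_tagset('NN')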
Named Entity Recognition

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

text = "Josh works for Twitter in California."
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)
for entity in entities:
    if hasattr(entity, 'label'):
        print(entity.label(), ' '.join(c[0] for c in entity.leaves()))

Output

PERSON Josh
GPE Twitter
GPE California
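ne_chunk assigns a type to every chunk it finds, which is why 'Twitter' comes out as GPE here rather than ORGANIZATION. When only entity boundaries matter, passing binary=True makes it label chunks simply as NE; a small sketch:

# Sketch: boundary-only NER; detected entities are tagged NE instead of PERSON/GPE/...
entities_binary = nltk.chunk.ne_chunk(tagged, binary=True)
for entity in entities_binary:
    if hasattr(entity, 'label'):
        print(entity.label(), ' '.join(c[0] for c in entity.leaves()))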
Pos Tagging via HMM

import nltk
nltk.download('brown')
from nltk.corpus import brown

def train_hmm_tagger():
    tagged_sentence = brown.tagged_sents(categories='news')
    size = int(len(tagged_sentence) * 0.9)
    trained_sents = tagged_sentence[:size]
    test_sents = tagged_sentence[size:]
    symbols = set([word for sentence in tagged_sentence for word, _ in sentence])
    states = set([tag for sentence in tagged_sentence for _, tag in sentence])
    trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=states, symbols=symbols)
    hmm_tagger = trainer.train_supervised(trained_sents)
    return hmm_tagger

def pos_tag_sentence(sentence, hmm_tagger):
    tokens = nltk.word_tokenize(sentence)
    tagged_tokens = hmm_tagger.tag(tokens)
    return tagged_tokens

hmm_tagger = train_hmm_tagger()
sentence = input("Enter the sentence to be tagged?")
tagged = pos_tag_sentence(sentence, hmm_tagger)
print(tagged)

Output

Enter the sentence to be tagged?
The sky is so beautiful.
[('The', 'AT'), ('sky', 'NN'), ('is', 'BEZ'), ('so', 'QL'), ('beautiful', 'JJ')]
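train_hmm_tagger sets aside test_sents but never scores the tagger on them. The sketch below rebuilds that held-out 10% and reports accuracy; it assumes a recent NLTK where taggers expose .accuracy() (older versions call it .evaluate()).

# Sketch: score the trained tagger on the held-out 10% of Brown news sentences.
tagged_sentence = brown.tagged_sents(categories='news')
size = int(len(tagged_sentence) * 0.9)
test_sents = tagged_sentence[size:]
print("Tagging accuracy:", hmm_tagger.accuracy(test_sents))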
Chatbot

import nltk
from nltk.chat.util import Chat, reflections

pairs = [[r"Hello|hi|hey|hola",
          ["Hello, I am Aura, your AI assistant. How may I help you?"]],
         [r"How are you|How are you doing",
          ["I'm good, how about you?"]],
         [r"What song always gets you in a good mood?",
          ['"Happy" by Pharrell Williams never fails to put a smile on my face.']],
         [r"Suggest a trending song",
          ['Good 4 U by Olivia Rodrigo',
           'Montero (Call Me By Your Name) by Lil Nas X',
           'Save Your Tears by The Weeknd',
           'Levitating by Dua Lipa']],
         [r"quit", ["Good bye"]],
         [r"(.*)", ["Could you try again?"]]]

bot = Chat(pairs, reflections)
bot.converse()

Output

>hi
Hello, I am Aura, your AI assistant. How may I help you?
>how are you
I'm good, how about you?
>What song always gets you in a good mood?
"Happy" by Pharrell Williams never fails to put a smile on my face.
>Suggest a trending song
Save Your Tears by The Weeknd
>bye
Good bye
TEXT CLASSIFICATION USING LOGISTIC REGRESSION

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def preprocess(text):
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [word_tokenize(sentence) for sentence in text]
    filtered_words = [[ps.stem(word) for word in tokenized
                       if word not in stop_words and word.isalpha()]
                      for tokenized in words]
    filtered_sentences = [' '.join(sentence) for sentence in filtered_words]
    return filtered_sentences

sentences = ["The food is tasty", "the quality of food is low",
             "i will never recommend their food",
             "I got sick after having their food",
             "I was in cloudnine after tasting their food",
             "My favourite is their desserts",
             "the food was not cooked properly"]
classes = [1, 0, 0, 0, 1, 1, 0]
test_sentences = ["food is not cooked properly", "I feel sick after having food",
                  "I love their desserts",
                  "was in cloudnine after tasting their food"]

vectorizer = CountVectorizer()
sentences = preprocess(sentences)
vect1 = vectorizer.fit_transform(sentences)
# Splitting data for testing
# train_data, test_data, train_labels, test_labels = train_test_split(vect1, classes, test_size=0.2, random_state=42)
nb = LogisticRegression()
nb.fit(vect1, classes)
test_sentences = preprocess(test_sentences)
vect2 = vectorizer.transform(test_sentences)
pred_classes = nb.predict(vect2)
print(pred_classes)

Output

[0 0 1 1]
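predict returns only the hard 0/1 labels. Logistic regression also exposes class probabilities, which show how confident the model is about each test sentence; a small sketch (note that test_sentences holds the stemmed form at this point):

# Sketch: class-membership probabilities for each test sentence (column order follows nb.classes_).
probs = nb.predict_proba(vect2)
for sentence, p in zip(test_sentences, probs):
    print(sentence, "->", p.round(3))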
TEXT CLASSIFICATION USING NAÏVE BAYES

import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
nltk.download('movie_reviews')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens
              if token not in stop_words and token.isalpha()]
    return dict(nltk.FreqDist(tokens))

pos_reviews = [(movie_reviews.raw(fileid), 'positive') for fileid in movie_reviews.fileids('pos')]
neg_reviews = [(movie_reviews.raw(fileid), 'negative') for fileid in movie_reviews.fileids('neg')]
tot_rev = pos_reviews + neg_reviews
processed_data = [(preprocess(text), category) for (text, category) in tot_rev]
train_data, val_data = train_test_split(processed_data, test_size=0.2, random_state=42)

classifier = NaiveBayesClassifier.train(train_data)
new_text = ["The movie was amazing", "the movie was terrible", "The movie was awful"]
for text in new_text:
    new_features = preprocess(text)
    predicted_category = classifier.classify(new_features)
    print(f"The predicted category for '{text}' is '{predicted_category}'")

Output

The predicted category for 'The movie was amazing' is 'positive'
The predicted category for 'the movie was terrible' is 'negative'
The predicted category for 'The movie was awful' is 'negative'
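val_data is split off above but never scored. The sketch below evaluates the classifier on that held-out 20% and lists its most informative features; both helpers come from NLTK's classify module.

# Sketch: accuracy on the held-out validation split and the strongest features.
from nltk.classify import accuracy

print("Validation accuracy:", accuracy(classifier, val_data))
classifier.show_most_informative_features(10)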