Text Concordance

import nltk
from nltk.corpus import gutenberg
from nltk.text import Text

corpus = gutenberg.words("shakespeare-macbeth.txt")
text = Text(corpus)
text.concordance("monstrous")

Output

Displaying 1 of 1 matches:
Who cannot want the thought , how monstrous It was for Malcolme , and for Dona.
Vocabulary Count

import nltk

text = "welcome to the world"
words = nltk.word_tokenize(text)
num_words = len(words)
num_the = words.count('the')
unique_words = set(words)
num_unique_words = len(unique_words)
percent_unique = (num_unique_words / num_words) * 100
print(words)
print("the number of words:", num_words)
print('number of occurrences of "the":', num_the)
print("number of unique words:", num_unique_words)
print("percentage of unique words:", percent_unique)

Output

['welcome', 'to', 'the', 'world']
the number of words: 4
number of occurrences of "the": 1
number of unique words: 4
percentage of unique words: 100.0
Text Preprocessing

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('words')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import words

text = 'This is a sample text that we used to demonstrate NLTK text processing 123'
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
alpha_tokens = [token.lower() for token in tokens if token.isalpha()]
english_words = set(words.words())
valid_tokens = [token for token in alpha_tokens if token in english_words]
filtered_tokens = [token for token in valid_tokens if token not in stop_words]
stemmer_tokens = [stemmer.stem(token) for token in filtered_tokens]
print("Original text :", text)
print("Tokenized text :", tokens)
print("Filtered text :", filtered_tokens)
print("Validated text :", valid_tokens)
print("Alpha text :", alpha_tokens)
print("Stemmed text :", stemmer_tokens)

Output

Original text : This is a sample text that we used to demonstrate NLTK text processing 123
Tokenized text : ['This', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'NLTK', 'text', 'processing', '123']
Filtered text : ['sample', 'text', 'used', 'demonstrate', 'text']
Validated text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'text']
Alpha text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'nltk', 'text', 'processing']
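Stemming can clip tokens to stems that are not dictionary words. A common alternative is lemmatization, which maps each token to a valid base form; the sketch below runs WordNetLemmatizer over the same filtered_tokens for comparison (it assumes the WordNet data is downloaded).

# Sketch: lemmatization as an alternative to Porter stemming on the same tokens.
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print("Lemmatized text :", lemmatized_tokens)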
Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["This is the first document",
          "This document is the second document",
          "And this is the third one",
          "Is this the first document"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
for i in range(len(corpus)):
    print(f"BoW representation of Document {i+1}: {X[i].toarray()[0]}")

Output

BoW representation of Document 1: [0 1 1 1 0 0 1 0 1]
BoW representation of Document 2: [0 2 0 1 0 1 1 0 1]
BoW representation of Document 3: [1 0 0 1 1 0 1 1 1]
BoW representation of Document 4: [0 1 1 1 0 0 1 0 1]
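Each position in these vectors counts one word of the vocabulary that CountVectorizer learned, sorted alphabetically. A small sketch to make that mapping visible; get_feature_names_out is the current scikit-learn accessor, older releases use get_feature_names.

# Sketch: show which vocabulary word each column of the BoW vectors counts.
print(vectorizer.get_feature_names_out())
# expected order here: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']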
TF-IDF

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import math

def calculate_tf(word, document):
    word_frequency = document.count(word)
    return word_frequency / len(document)

def calculate_idf(word, corpus):
    num_documents_containing_word = len([True for document in corpus if word in document])
    if num_documents_containing_word == 0:
        return 0
    else:
        return math.log10(len(corpus) / num_documents_containing_word)

def calculate_tfidf(document, corpus):
    PS = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [PS.stem(word.lower()) for word in word_tokenize(document)
             if word.lower() not in stop_words]
    word_tfidf_values = {}
    for word in words:
        if word not in word_tfidf_values:
            tf = calculate_tf(word, words)
            idf = calculate_idf(word, corpus)
            word_tfidf_values[word] = tf * idf
    return word_tfidf_values

corpus = ["This is the first document",
          "This document is the second document",
          "And this is the third one",
          "Is this the first document"]
document = "This is the second document"
tfidf_vector = calculate_tfidf(document, corpus)
print(tfidf_vector)

Output

{'second': 0.3010299956639812, 'document': 0.06246936830414996}
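scikit-learn's TfidfVectorizer builds the same kind of weighting for the whole corpus at once. The sketch below applies it to the corpus above as a cross-check; its numbers will differ from the hand-rolled version because scikit-learn uses a smoothed natural-log IDF with L2-normalised rows, and does no stemming or stopword removal by default.

# Sketch: the same corpus through scikit-learn's TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(corpus)
print(tfidf.get_feature_names_out())
print(X_tfidf.toarray().round(2))   # one row of TF-IDF weights per document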
Pos Tagging

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

line = "quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(line)
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
pos_tags = nltk.pos_tag(lemmatized_tokens)
pos_word_corpus = [(word, tag) for word, tag in pos_tags]

for word, tag in pos_word_corpus:
    print(word, ":", tag)

Output

quick : JJ
brown : NN
fox : JJ
jump : NN
lazy : NN
dog : NN
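The tags follow the Penn Treebank tagset (JJ = adjective, NN = singular noun). NLTK can print the definition of any tag; the sketch below assumes the 'tagsets' data package is installed.

# Sketch: look up what a Penn Treebank tag means.
nltk.download('tagsets')
nltk.help.upenn_tagset('JJ')   # prints the definition and example words for JJ
nltk.help.upenn_tagset('NN')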
Named Entity Recognition

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

text = "Josh works for Twitter in California."
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)
for entity in entities:
    if hasattr(entity, 'label'):
        print(entity.label(), ' '.join(c[0] for c in entity.leaves()))

Output

PERSON Josh
GPE Twitter
GPE California
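ne_chunk assigns a type to every chunk it finds, which is why 'Twitter' comes out as GPE here rather than ORGANIZATION. When only entity boundaries matter, passing binary=True makes it label chunks simply as NE; a small sketch:

# Sketch: boundary-only NER; detected entities are tagged NE instead of PERSON/GPE/...
entities_binary = nltk.chunk.ne_chunk(tagged, binary=True)
for entity in entities_binary:
    if hasattr(entity, 'label'):
        print(entity.label(), ' '.join(c[0] for c in entity.leaves()))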
Pos Tagging via HMM

import nltk
nltk.download('brown')
from nltk.corpus import brown

def train_hmm_tagger():
    tagged_sentence = brown.tagged_sents(categories='news')
    size = int(len(tagged_sentence) * 0.9)
    trained_sents = tagged_sentence[:size]
    test_sents = tagged_sentence[size:]
    symbols = set([word for sentence in tagged_sentence for word, _ in sentence])
    states = set([tag for sentence in tagged_sentence for _, tag in sentence])
    trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=states, symbols=symbols)
    hmm_tagger = trainer.train_supervised(trained_sents)
    return hmm_tagger

def pos_tag_sentence(sentence, hmm_tagger):
    tokens = nltk.word_tokenize(sentence)
    tagged_tokens = hmm_tagger.tag(tokens)
    return tagged_tokens

hmm_tagger = train_hmm_tagger()
sentence = input("Enter the sentence to be tagged?")
tagged = pos_tag_sentence(sentence, hmm_tagger)
print(tagged)

Output

Enter the sentence to be tagged?
The sky is so beautiful.
[('The', 'AT'), ('sky', 'NN'), ('is', 'BEZ'), ('so', 'QL'), ('beautiful', 'JJ')]
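train_hmm_tagger sets aside test_sents but never scores the tagger on them. The sketch below rebuilds that held-out 10% and reports accuracy; it assumes a recent NLTK where taggers expose .accuracy() (older versions call it .evaluate()).

# Sketch: score the trained tagger on the held-out 10% of Brown news sentences.
tagged_sentence = brown.tagged_sents(categories='news')
size = int(len(tagged_sentence) * 0.9)
test_sents = tagged_sentence[size:]
print("Tagging accuracy:", hmm_tagger.accuracy(test_sents))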
Chatbot

import nltk
from nltk.chat.util import Chat, reflections

pairs = [[r"Hello|hi|hey|hola",
          ["Hello, I am Aura, your AI assistant. How may I help you?"]],
         [r"How are you|How are you doing",
          ["I'm good, how about you?"]],
         [r"What song always gets you in a good mood?",
          ['"Happy" by Pharrell Williams never fails to put a smile on my face.']],
         [r"Suggest a trending song",
          ['Good 4 U by Olivia Rodrigo',
           'Montero (Call Me By Your Name) by Lil Nas X',
           'Save Your Tears by The Weeknd',
           'Levitating by Dua Lipa']],
         [r"quit", ["Good bye"]],
         [r"(.*)", ["Could you try again?"]]]

bot = Chat(pairs, reflections)
bot.converse()

Output

>hi
Hello, I am Aura, your AI assistant. How may I help you?
>how are you
I'm good, how about you?
>What song always gets you in a good mood?
"Happy" by Pharrell Williams never fails to put a smile on my face.
>Suggest a trending song
Save Your Tears by The Weeknd
>bye
Good bye
TEXT CLASSIFICATION USING LOGISTIC REGRESSION

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def preprocess(text):
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [word_tokenize(sentence) for sentence in text]
    filtered_words = [[ps.stem(word) for word in tokenized
                       if word not in stop_words and word.isalpha()]
                      for tokenized in words]
    filtered_sentences = [' '.join(sentence) for sentence in filtered_words]
    return filtered_sentences

sentences = ["The food is tasty", "the quality of food is low",
             "i will never recommend their food",
             "I got sick after having their food",
             "I was in cloudnine after tasting their food",
             "My favourite is their desserts",
             "the food was not cooked properly"]
classes = [1, 0, 0, 0, 1, 1, 0]
test_sentences = ["food is not cooked properly", "I feel sick after having food",
                  "I love their desserts",
                  "was in cloudnine after tasting their food"]

vectorizer = CountVectorizer()
sentences = preprocess(sentences)
vect1 = vectorizer.fit_transform(sentences)
# Splitting data for testing
# train_data, test_data, train_labels, test_labels = train_test_split(vect1, classes, test_size=0.2, random_state=42)
nb = LogisticRegression()
nb.fit(vect1, classes)
test_sentences = preprocess(test_sentences)
vect2 = vectorizer.transform(test_sentences)
pred_classes = nb.predict(vect2)
print(pred_classes)

Output

[0 0 1 1]
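predict returns only the hard 0/1 labels. Logistic regression also exposes class probabilities, which show how confident the model is about each test sentence; a small sketch (note that test_sentences holds the stemmed form at this point):

# Sketch: class-membership probabilities for each test sentence (column order follows nb.classes_).
probs = nb.predict_proba(vect2)
for sentence, p in zip(test_sentences, probs):
    print(sentence, "->", p.round(3))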
TEXT CLASSIFICATION USING NAÏVE BAYES

import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
nltk.download('movie_reviews')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens
              if token not in stop_words and token.isalpha()]
    return dict(nltk.FreqDist(tokens))

pos_reviews = [(movie_reviews.raw(fileid), 'positive') for fileid in movie_reviews.fileids('pos')]
neg_reviews = [(movie_reviews.raw(fileid), 'negative') for fileid in movie_reviews.fileids('neg')]
tot_rev = pos_reviews + neg_reviews
processed_data = [(preprocess(text), category) for (text, category) in tot_rev]
train_data, val_data = train_test_split(processed_data, test_size=0.2, random_state=42)

classifier = NaiveBayesClassifier.train(train_data)
new_text = ["The movie was amazing", "the movie was terrible", "The movie was awful"]
for text in new_text:
    new_features = preprocess(text)
    predicted_category = classifier.classify(new_features)
    print(f"The predicted category for '{text}' is '{predicted_category}'")

Output

The predicted category for 'The movie was amazing' is 'positive'
The predicted category for 'the movie was terrible' is 'negative'
The predicted category for 'The movie was awful' is 'negative'
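val_data is split off above but never scored. The sketch below evaluates the classifier on that held-out 20% and lists its most informative features; both helpers come from NLTK's classify module.

# Sketch: accuracy on the held-out validation split and the strongest features.
from nltk.classify import accuracy

print("Validation accuracy:", accuracy(classifier, val_data))
classifier.show_most_informative_features(10)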