Basic Text Processing Operations on a Text Document
with open("data.txt") as f:       # close the file when done
    for line in f:
        for word in line.split():
            # report every word ending in "ing", along with its length
            if word.endswith("ing"):
                print(word)
                print(len(word))
import re
data = "The biggest 5 Animals are 1.Elephant 2.Giraffe 3.Tiger 4.Lion 5.Cheetah"
# use a raw string so \d+ is read as a regex digit class; remove all digits
result = re.sub(r"\d+", '', data)
print(result)
def expand_contractions(data):
    # naive contraction expansion via chained string replacements
    text = data
    text = text.replace("n't", " not")   # "don't" -> "do not"
    text = text.replace("'s", " is")
    text = text.replace("'re", " are")
    text = text.replace("'ll", " will")
    return text

s = "How's my team doin, you're supposed to be not losing"
returned_data = expand_contractions(s)
print(returned_data)
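A dictionary-driven variant scales better than chained replace calls. Below is a minimal sketch, not an exhaustive solution: the contraction map is an illustrative sample, and "'s" is genuinely ambiguous (possessive vs. "is"), so any rule-based expansion is only an approximation. It reuses the sample string s from above.

import re
# illustrative contraction map; production lists are much longer
CONTRACTIONS = {"n't": " not", "'re": " are", "'ll": " will", "'ve": " have", "'s": " is"}
pattern = re.compile("|".join(re.escape(c) for c in CONTRACTIONS))

def expand_contractions_regex(text):
    # replace each matched contraction suffix with its expansion
    return pattern.sub(lambda m: CONTRACTIONS[m.group(0)], text)

print(expand_contractions_regex(s))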
Tokenization, Stemming and Lemmatization
import nltk
nltk.download('punkt')   # tokenizer models used by sent_tokenize/word_tokenize
data = "Welcome to TIMSCDR!!"
# sentence-level tokenization
tokens = nltk.sent_tokenize(data)
print(tokens)
# word-level tokenization
tokens = nltk.word_tokenize(data)
print(tokens)
from nltk.stem import PorterStemmer
port_stemmer = PorterStemmer()
print(port_stemmer.stem("Liked"))
data = ["liked", "liking", "likes", "killing", "killed"]
for words in data:
    print(words, " :", port_stemmer.stem(words))
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')   # lexical database required by the lemmatizer
lemmati = WordNetLemmatizer()
print("Socks :", lemmati.lemmatize("socks"))
print("corpora :", lemmati.lemmatize("corpora"))
# pos="a" treats the word as an adjective, so "better" maps to "good"
print("better :", lemmati.lemmatize("better", pos="a"))
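To see why lemmatization is often preferred over stemming, it helps to run both on the same words. A minimal comparison sketch, reusing the port_stemmer and lemmati objects created above (pos="v" tells the lemmatizer to treat each word as a verb):

for word in ["liked", "liking", "likes", "killing", "killed"]:
    # stem vs. verb lemma, side by side
    print(word, "| stem:", port_stemmer.stem(word),
          "| lemma:", lemmati.lemmatize(word, pos="v"))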
Removal of Stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# keep the set under a different name so it does not shadow the module
stop_words = set(stopwords.words('english'))
print(stop_words)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
data = "All work and no play. All work and no play makes Jack a dull boy"
tokens = word_tokenize(data)
filtered_Data = []
for w in tokens:
    # lowercase before the lookup so "All" is filtered like "all"
    if w.lower() not in stopWords:
        filtered_Data.append(w)
print(filtered_Data)
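The loop above is commonly condensed into a single list comprehension; an equivalent one-liner:

# same filtering as the loop above, in one expression
filtered_Data = [w for w in tokens if w.lower() not in stopWords]
print(filtered_Data)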
Implementation of POS Tagging
1.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Don't be afraid to give up the good to go for the great")
# count how many tokens carry each part-of-speech tag
POS_count = doc.count_by(spacy.attrs.POS)
print(POS_count)
for k, v in sorted(POS_count.items()):
    print(f'{k}. {doc.vocab[k].text} : {v}')
2.
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is my School")
# 'compact' expects a boolean, not the string 'True'
options = {'color': 'red', 'bg': 'blue', 'compact': True, 'distance': 100}
displacy.render(doc, style="dep", options=options)
3.
import nltk
from nltk.tag import DefaultTagger
text = "The way to get started is to quit talking"
tokens = nltk.word_tokenize(text)
# a DefaultTagger assigns the same tag to every token; "NN" is the usual baseline
tagging = DefaultTagger("NN")
print(tagging.tag(tokens))
4.
import nltk
nltk.download('averaged_perceptron_tagger')   # model used by nltk.pos_tag
sentence = "the little yellow dog barked at the car"
# tokenization
tokens = nltk.word_tokenize(sentence)
# POS tagging
tag = nltk.pos_tag(tokens)
# chunk grammar: optional determiner, any adjectives, then a noun
phrase = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(phrase)
result = cp.parse(tag)
print(result)
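The grammar "NP: {<DT>?<JJ>*<NN>}" chunks an optional determiner, any number of adjectives, and a noun into an NP. To list only the matched noun phrases instead of printing the whole tree, the parse result can be filtered by label, as in this short sketch:

# print just the NP chunks found by the chunk parser
for subtree in result.subtrees(filter=lambda t: t.label() == 'NP'):
    print(subtree)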
TF-IDF implementation
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
text = ["I love writing code in Python. I love Python code",
"I hate writing code in Java. I hate Java code"]
df = pd.DataFrame({'review':['review1','review2'],'text':text})
cv = CountVectorizer(stop_words='english')
cv_matrix = cv.fit_transform(df['text'])
df_dtm=pd.DataFrame(cv_matrix.toarray(),index=df['review'].values,
columns=cv.get_feature_names_out())
df_dtm
pip install scikit-learn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
text = ["I love writing code in python.I love python code",
"I hate writing code in java.i hate java code"]
df = pd.DataFrame({'review':['review1','review2'],'text':text})
tfidf = TfidfVectorizer(stop_words='english',norm=None)
tfidf_matrix = tfidf.fit_transform(df['text'])
output = pd.DataFrame(tfidf_matrix.toarray(),index=df['review'],
columns=tfidf.get_feature_names_out())
output
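With norm=None, the values above can be verified by hand. TfidfVectorizer defaults to smooth_idf=True, i.e. idf(t) = ln((1 + n) / (1 + df(t))) + 1, multiplied by the raw term count. A quick check for "love", which occurs twice in review1 and appears in one of the two documents:

import numpy as np
n, df_t, tf = 2, 1, 2                  # 2 documents; "love" in 1 of them, twice in review1
idf = np.log((1 + n) / (1 + df_t)) + 1
print(tf * idf)                        # ~2.8109, matching the "love" cell for review1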
Practical 4: Creating and comparing different text representations
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compare_text_representations(texts):
    # Bag of Words (BoW) representation
    bow_vectorizer = CountVectorizer()
    bow_matrix = bow_vectorizer.fit_transform(texts)
    # TF-IDF representation
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    # Compare similarities between the two documents
    bow_similarity = cosine_similarity(bow_matrix[0], bow_matrix[1])
    tfidf_similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    print("BoW vectors:")
    print(bow_matrix.toarray())
    print("\nTF-IDF vectors:")
    print(tfidf_matrix.toarray())
    print(f"\nBoW Cosine Similarity: {bow_similarity[0][0]:.4f}")
    print(f"TF-IDF Cosine Similarity: {tfidf_similarity[0][0]:.4f}")

# Example usage
texts = [
    "The cat sat on the mat",
    "The dog sat on the log"
]
compare_text_representations(texts)
Training and using word embeddings
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def train_word_embeddings(sentences):
    # Tokenize sentences
    tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
    # Train Word2Vec model
    model = Word2Vec(sentences=tokenized_sentences, vector_size=100,
                     window=5, min_count=1, workers=4)
    return model

def use_word_embeddings(model, word, top_n=5):
    try:
        similar_words = model.wv.most_similar(word, topn=top_n)
        print(f"Words most similar to '{word}':")
        for w, score in similar_words:
            print(f"{w}: {score:.4f}")
    except KeyError:
        print(f"'{word}' not in vocabulary")

# Example usage
sentences = [
    "The quick brown fox jumps over the lazy dog",
    "A fox is a cunning animal",
    "The dog barks at night",
    "Foxes and dogs are different species"
]
model = train_word_embeddings(sentences)
use_word_embeddings(model, "fox")
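Beyond most_similar, the trained vectors themselves can be inspected. A short follow-up on the model trained above; note that on a toy corpus this small the similarity numbers are essentially noise:

print(model.wv["fox"][:5])                  # first 5 of the 100 embedding dimensions
print(model.wv.similarity("fox", "dog"))    # cosine similarity between two words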
Implementation of an N-gram Language Model
1.
import nltk
from nltk.util import ngrams
text = "The little boy ran away"
# extract trigrams (n=3) from the tokenized sentence
Ngram = ngrams(sequence=nltk.word_tokenize(text), n=3)
for gram in Ngram:
    print(gram)
2.
import nltk
from nltk import bigrams, trigrams
from collections import defaultdict
import random
nltk.download('punkt')

def build_language_model(text, n=2):
    words = nltk.word_tokenize(text.lower())
    if n == 2:
        pairs = list(bigrams(words))
    elif n == 3:
        pairs = list(trigrams(words))
    else:
        raise ValueError("n must be 2 or 3")
    # map each context to the counts of the words that follow it
    model = defaultdict(lambda: defaultdict(int))
    for pair in pairs:
        if n == 2:
            model[pair[0]][pair[1]] += 1
        else:
            model[(pair[0], pair[1])][pair[2]] += 1
    return model

def generate_text(model, num_words=20, start_word=None, n=2):
    if start_word is None:
        start_word = random.choice(list(model.keys()))
    words = [start_word] if n == 2 else list(start_word)
    for _ in range(num_words - n + 1):
        context = words[-1] if n == 2 else tuple(words[-2:])
        if not model[context]:
            break  # dead end: this context never occurred in the training text
        # always pick the most frequent continuation (greedy decoding)
        next_word = max(model[context], key=model[context].get)
        words.append(next_word)
    return ' '.join(words)

# Example usage
text = """
The cat sat on the mat. The dog ran in the park.
Cats like to play with toys. Dogs enjoy chasing balls.
"""
bigram_model = build_language_model(text, n=2)
trigram_model = build_language_model(text, n=3)
print("Generated text (bigram model):")
print(generate_text(bigram_model, num_words=15, start_word="the", n=2))
print("\nGenerated text (trigram model):")
print(generate_text(trigram_model, num_words=15, start_word=("the", "cat"), n=3))
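The same counts also yield maximum-likelihood probabilities: P(w2 | w1) is count(w1, w2) divided by the total count of bigrams starting with w1. A small helper sketch for the bigram model built above:

def bigram_probability(model, w1, w2):
    # MLE estimate: count(w1, w2) / count(w1, *)
    counts = model[w1]
    total = sum(counts.values())
    return counts[w2] / total if total else 0.0

print(bigram_probability(bigram_model, "the", "cat"))   # 0.25 on the toy corpus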
Implementation of Text Classification using Naïve Bayes
import pandas as pd
df = pd.read_csv('twitter_training.csv',header=None,usecols=[2,3])
df.head()
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# rename columns
df.columns = ['Sentiment', 'Text']
# drop rows with missing values
df = df.dropna()
# encode sentiment labels as integers
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])
df.head()
X=list(df['Text'])
y=list(df['Sentiment'])
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,
random_state=42)
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(analyzer='word',ngram_range=(1,1),stop_words='english')
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
import numpy as np
#train Naive Bayes Classifier
clf=MultinomialNB()
clf.fit(X_train_cv, y_train)
y_pred=clf.predict(X_test_cv)
score=f1_score(y_test,y_pred, average="micro")
print('F-1 score : {}'.format(np.round(score,4)))
# compare n-gram ranges from unigrams up to (1, 10)
for i in range(1, 11):
    cv = CountVectorizer(analyzer='word', ngram_range=(1, i), stop_words='english')
    # creating the BoW matrix
    X_train_cv = cv.fit_transform(X_train)
    X_test_cv = cv.transform(X_test)
    # training the classifier
    clf2 = MultinomialNB()
    clf2.fit(X_train_cv, y_train)
    y_pred = clf2.predict(X_test_cv)
    score = f1_score(y_test, y_pred, average="micro")
    print('F1-Score: {}'.format(np.round(score, 4)))
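A possible variation is to swap raw counts for TF-IDF weights and bundle both steps in a scikit-learn Pipeline, which keeps the vectorizer and classifier fitted together. A sketch reusing the train/test split from above:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words='english')),
    ("nb", MultinomialNB()),
])
pipe.fit(X_train, y_train)
score = f1_score(y_test, pipe.predict(X_test), average="micro")
print('F1-Score (TF-IDF pipeline): {}'.format(np.round(score, 4)))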
Implementing a text classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
def train_text_classifier(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)
    # Create a CountVectorizer
    vectorizer = CountVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    # Train a Naive Bayes classifier
    classifier = MultinomialNB()
    classifier.fit(X_train_vectorized, y_train)
    # Make predictions
    y_pred = classifier.predict(X_test_vectorized)
    # Print classification report
    print(classification_report(y_test, y_pred))
    return vectorizer, classifier

def classify_text(text, vectorizer, classifier):
    text_vectorized = vectorizer.transform([text])
    prediction = classifier.predict(text_vectorized)
    return prediction[0]
# Example usage
X = [
    "I love this movie, it's amazing!",
    "This book is terrible, I couldn't finish it.",
    "The food at this restaurant is delicious.",
    "The service here is awful, I'm never coming back.",
    "What a great experience, highly recommended!",
]
y = ["positive", "negative", "positive", "negative", "positive"]
vectorizer, classifier = train_text_classifier(X, y)
new_text = "The product exceeded my expectations, I'm very satisfied."
prediction = classify_text(new_text, vectorizer, classifier)
print(f"Prediction for '{new_text}': {prediction}")
Building a sentiment analysis system
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
nltk.download('vader_lexicon')

def analyze_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(text)
    # VADER convention: compound >= 0.05 is positive, <= -0.05 is negative
    if sentiment_scores['compound'] >= 0.05:
        sentiment = "Positive"
    elif sentiment_scores['compound'] <= -0.05:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment, sentiment_scores

def analyze_sentiments(texts):
    results = []
    for text in texts:
        sentiment, scores = analyze_sentiment(text)
        results.append({
            'text': text,
            'sentiment': sentiment,
            'pos_score': scores['pos'],
            'neg_score': scores['neg'],
            'neu_score': scores['neu'],
            'compound_score': scores['compound']
        })
    return pd.DataFrame(results)
# Example usage
texts = [
    "I absolutely love this product! It's amazing!",
    "This is the worst experience I've ever had.",
    "The movie was okay, nothing special.",
    "I'm feeling pretty neutral about the whole situation.",
    "The customer service was excellent and very helpful!"
]
results_df = analyze_sentiments(texts)
print(results_df)
Implementation of Text Summarization
from transformers import pipeline

def summarize_text(text, max_length=150, min_length=50):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(text, max_length=max_length, min_length=min_length,
                         do_sample=False)
    return summary[0]['summary_text']
# Example usage
long_text = """
Climate change is one of the most pressing issues facing our planet today. It refers
to long-term shifts in temperatures and weather patterns, mainly caused by human
activities, especially the burning of fossil fuels. These activities release greenhouse
gases into the atmosphere, trapping heat and causing the Earth's average
temperature to rise. The consequences of climate change are far-reaching and
include more frequent and severe weather events, rising sea levels, and disruptions
to ecosystems. To address this global challenge, countries and organizations
worldwide are working on strategies to reduce greenhouse gas emissions and
transition to cleaner energy sources. Individual actions, such as reducing energy
consumption and adopting sustainable practices, also play a crucial role in
mitigating the effects of climate change.
"""
summary = summarize_text(long_text)
print("Original text length:", len(long_text))
print("Summary length:", len(summary))
print("\nSummary:")
print(summary)
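One caveat: facebook/bart-large-cnn attends to roughly 1024 tokens, so longer inputs are truncated. A naive workaround is to split the text into chunks and summarize each one; the sketch below uses character-based chunks (chunk_size is an arbitrary illustrative value, and character splits can cut sentences mid-way):

def summarize_long_text(text, chunk_size=3000, **kwargs):
    # split into rough character-based chunks and summarize each separately
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return " ".join(summarize_text(chunk, **kwargs) for chunk in chunks)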