1. How do you tokenize a sentence using Python?
Answer:
from nltk.tokenize import word_tokenize
sentence = "Hello, how are you?"
tokens = word_tokenize(sentence)
print(tokens) # Output: ['Hello', ',', 'how', 'are', 'you', '?']
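Note: word_tokenize relies on NLTK's Punkt models; if they are not installed yet, a one-time download is needed:
import nltk
nltk.download('punkt')  # one-time download of the Punkt tokenizer models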
2. How do you remove stop words using NLTK?
Answer:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))  # requires a one-time nltk.download('stopwords')
sentence = "This is a sample sentence."
words = word_tokenize(sentence)
filtered_words = [word for word in words if word.lower() not in stop_words]
print(filtered_words) # Output: ['sample', 'sentence', '.']
3. How do you perform stemming using NLTK?
Answer:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
word = "running"
stemmed_word = stemmer.stem(word)
print(stemmed_word) # Output: 'run'
4. How do you perform lemmatization using NLTK?
Answer:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()  # requires a one-time nltk.download('wordnet')
word = "better"
lemma = lemmatizer.lemmatize(word, pos='a') # 'a' for adjective
print(lemma) # Output: 'good'
5. How do you extract named entities using spaCy?
Answer:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    print(ent.text, ent.label_)  # Output: Apple ORG, U.K. GPE, $1 billion MONEY
6. How do you calculate TF-IDF using scikit-learn?
Answer:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ["This is a sample sentence.", "This is another example sentence."]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
print(tfidf_matrix.toarray())
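To see which term each column of the matrix corresponds to, recent scikit-learn versions (1.0+) expose get_feature_names_out:
print(vectorizer.get_feature_names_out())  # Maps matrix columns back to vocabulary terms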
7. How do you train a Word2Vec model using Gensim?
Answer:
from gensim.models import Word2Vec
sentences = [["I", "love", "NLP"], ["NLP", "is", "fun"]]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
print(model.wv["NLP"]) # Output: Word vector for "NLP"
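Once trained, the vectors also support similarity queries, for example (on such a tiny toy corpus the neighbours are essentially random):
print(model.wv.most_similar("NLP", topn=2))  # Nearest neighbours of "NLP" by cosine similarity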
8. How do you load a pre-trained GloVe model?
Answer:
import numpy as np
def load_glove(file):
    embeddings = {}
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings
glove_embeddings = load_glove("glove.6B.100d.txt")
print(glove_embeddings["the"])
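With the embeddings loaded, cosine similarity between two word vectors can be computed directly with NumPy; a minimal sketch, assuming both words are in the GloVe vocabulary:
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cosine_sim(glove_embeddings["king"], glove_embeddings["queen"]))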
9. How do you perform sentiment analysis using TextBlob?
Answer:
from textblob import TextBlob
text = "I love NLP!"
blob = TextBlob(text)
print(blob.sentiment) # Output: Sentiment(polarity=0.5, subjectivity=0.5)
10. How do you create a bag-of-words model?
Answer:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ["This is a sample sentence.", "This is another example sentence."]
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(corpus)
print(bow_matrix.toarray())
11. How do you perform part-of-speech tagging using NLTK?
Answer:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
sentence = "I love NLP."
tokens = word_tokenize(sentence)
tags = pos_tag(tokens)  # requires a one-time nltk.download('averaged_perceptron_tagger')
print(tags) # Output: [('I', 'PRP'), ('love', 'VBP'), ('NLP', 'NNP'), ('.', '.')]
12. How do you perform dependency parsing using spaCy?
Answer:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love NLP.")
for token in doc:
    print(token.text, token.dep_, token.head.text)  # Output: I nsubj love, love ROOT love, NLP dobj love, . punct love
13. How do you generate n-grams using NLTK?
Answer:
from nltk import ngrams
sentence = "I love NLP."
tokens = sentence.split()
bigrams = list(ngrams(tokens, 2))
print(bigrams) # Output: [('I', 'love'), ('love', 'NLP.')]
14. How do you perform text classification using scikit-learn?
Answer:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
corpus = ["I love NLP.", "I hate spam."]
labels = [1, 0] # 1 for positive, 0 for negative
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(corpus, labels)
print(model.predict(["I enjoy learning."]))  # Output: predicted label; with only two training sentences the prediction is not reliable
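The same pipeline can also return class probabilities, since MultinomialNB implements predict_proba:
print(model.predict_proba(["I enjoy learning."]))  # One probability per class, columns ordered as model.classes_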
15. How do you visualize a word cloud in Python?
Answer:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
text = "NLP is fun and exciting. NLP is the future."
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
16. How do you preprocess text for NLP tasks?
Answer:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
def preprocess(text):
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]  # Stemming
    return tokens
print(preprocess("I love NLP!")) # Output: ['love', 'nlp']
17. How do you calculate word frequencies in a text?
Answer:
from collections import Counter
text = "I love NLP. NLP is fun."
words = text.split()
word_freq = Counter(words)
print(word_freq) # Output: Counter({'I': 1, 'love': 1, 'NLP.': 1, 'NLP': 1, 'is': 1, 'fun.': 1})
18. How do you perform sentence segmentation using spaCy?
Answer:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love NLP. It is fun.")
for sent in doc.sents:
    print(sent.text)  # Output: "I love NLP." then "It is fun." (one sentence per line)
19. How do you perform topic modeling using Gensim?
Answer:
from gensim import corpora
from gensim.models import LdaModel
documents = [["I", "love", "NLP"], ["NLP", "is", "fun"]]
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]
lda = LdaModel(corpus, num_topics=2, id2word=dictionary)
print(lda.print_topics())
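To infer the topic mixture of a new document with the trained model, convert it with the same dictionary (a usage sketch):
new_doc = dictionary.doc2bow(["NLP", "love"])
print(lda.get_document_topics(new_doc))  # List of (topic_id, probability) pairs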
20. How do you evaluate a text classification model?
Answer:
from sklearn.metrics import classification_report, accuracy_score
y_true = [1, 0, 1, 0]
y_pred = [1, 1, 0, 0]
print(classification_report(y_true, y_pred))
print("Accuracy:", accuracy_score(y_true, y_pred))
Intermediate NLP Programming Questions (21-40)
21. How do you fine-tune a pre-trained BERT model?
Answer:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
dataset = load_dataset("imdb")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["test"])
trainer.train()
22. How do you use a pre-trained GPT-2 model for text generation?
Answer:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors="pt")
output = model.generate(input_ids, max_length=50, num_return_sequences=1)
print(tokenizer.decode(output[0], skip_special_tokens=True))
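Greedy decoding (the default above) tends to be repetitive; generate also accepts sampling parameters, for example:
output = model.generate(input_ids, max_length=50, do_sample=True, top_k=50, top_p=0.95)
print(tokenizer.decode(output[0], skip_special_tokens=True))  # A sampled, more varied continuation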
23. How do you perform text summarization using Hugging Face Transformers?
Answer:
from transformers import pipeline
summarizer = pipeline("summarization")
text = "Natural Language Processing (NLP) is a field of AI focused on the interaction between computers a
nd humans using natural language."
summary = summarizer(text, max_length=30, min_length=10, do_sample=False)
print(summary[0]['summary_text'])
24. How do you perform question answering using a pre-trained BERT model?
Answer:
from transformers import pipeline
qa_pipeline = pipeline("question-answering")
context = "Natural Language Processing (NLP) is a field of AI focused on the interaction between compute
rs and humans using natural language."
question = "What is NLP?"
result = qa_pipeline(question=question, context=context)
print(result['answer']) # Output: a field of AI
25. How do you perform named entity recognition (NER) using spaCy?
Answer:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    print(ent.text, ent.label_)  # Output: Apple ORG, U.K. GPE, $1 billion MONEY
26. How do you train a custom NER model using spaCy?
Answer:
import spacy
from spacy.training import Example
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("ORG")
train_data = [
    ("Apple is looking at buying U.K. startup for $1 billion", {"entities": [(0, 5, "ORG")]})
]
optimizer = nlp.initialize()  # spaCy 3.x; begin_training() is the deprecated spaCy 2 name
for _ in range(10):
    for text, annotations in train_data:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], sgd=optimizer)
doc = nlp("Apple is a tech company.")
print([(ent.text, ent.label_) for ent in doc.ents]) # Output: [('Apple', 'ORG')]
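The trained pipeline can be persisted to disk and reloaded later:
nlp.to_disk("custom_ner_model")        # Save the trained pipeline
nlp2 = spacy.load("custom_ner_model")  # Reload it later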
27. How do you perform sentiment analysis using Hugging Face Transformers?
Answer:
from transformers import pipeline
sentiment_analyzer = pipeline("sentiment-analysis")
text = "I love NLP!"
result = sentiment_analyzer(text)
print(result) # Output: [{'label': 'POSITIVE', 'score': 0.9998}]
28. How do you perform machine translation using Hugging Face Transformers?
Answer:
from transformers import pipeline
translator = pipeline("translation_en_to_fr")
text = "Hello, how are you?"
translation = translator(text)
print(translation[0]['translation_text']) # Output: Bonjour, comment ça va ?
29. How do you visualize word embeddings using t-SNE?
Answer:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
# Example word vectors
word_vectors = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
words = ["apple", "banana", "orange"]
# Reduce dimensionality using t-SNE
tsne = TSNE(n_components=2, random_state=0, perplexity=2)  # perplexity must be < n_samples (here 3)
word_vectors_2d = tsne.fit_transform(word_vectors)
# Plot
plt.scatter(word_vectors_2d[:, 0], word_vectors_2d[:, 1])
for i, word in enumerate(words):
    plt.annotate(word, xy=(word_vectors_2d[i, 0], word_vectors_2d[i, 1]))
plt.show()
30. How do you perform text clustering using K-Means?
Answer:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
documents = ["I love NLP.", "I hate spam.", "NLP is fun.", "Spam is bad."]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)
print(kmeans.labels_)  # Output: e.g. [0 1 0 1]; cluster ids are arbitrary and can vary across runs
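To interpret the clusters, you can list the highest-weighted TF-IDF terms in each cluster centroid (a sketch):
import numpy as np
terms = vectorizer.get_feature_names_out()
for i, center in enumerate(kmeans.cluster_centers_):
    top = np.argsort(center)[::-1][:3]  # Indices of the 3 largest centroid weights
    print(f"Cluster {i}:", [terms[j] for j in top])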
31. How do you perform text classification using a pre-trained BERT model?
Answer:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
result = classifier("I love NLP!")
print(result)  # Output: e.g. [{'label': 'LABEL_0', 'score': ...}]; the classification head of bert-base-uncased is randomly initialized, so these labels are meaningless until the model is fine-tuned
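For meaningful predictions out of the box, point the pipeline at a checkpoint already fine-tuned for sentiment, e.g. the SST-2 model on the Hugging Face Hub:
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
print(classifier("I love NLP!"))  # Output: [{'label': 'POSITIVE', 'score': ...}]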
32. How do you perform text similarity using cosine similarity?
Answer:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
documents = ["I love NLP.", "I enjoy natural language processing."]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
print(similarity[0][0]) # Output: Similarity score between 0 and 1
33. How do you perform text preprocessing using spaCy?
Answer:
import spacy
nlp = spacy.load("en_core_web_sm")
text = "I love NLP! It's amazing."
doc = nlp(text)
# Lemmatization and stop word removal
preprocessed_text = [token.lemma_ for token in doc if not token.is_stop]
print(preprocessed_text)  # Output: ['love', 'NLP', '!', 'amazing', '.']
34. How do you perform text classification using LSTM in TensorFlow?
Answer:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# Sample data
texts = ["I love NLP", "I hate spam"]
labels = np.array([1, 0])  # Keras expects array-like (NumPy) labels
# Tokenization
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=10)
# LSTM Model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(1000, 64),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(padded_sequences, labels, epochs=5)
35. How do you perform text generation using an LSTM model?
Answer:
import tensorflow as tf
import numpy as np
# Sample text
text = "I love NLP"
chars = sorted(set(text))
char_to_index = {c: i for i, c in enumerate(chars)}
# Prepare data: sliding character windows
seq_length = 3
X, y = [], []
for i in range(len(text) - seq_length):
    X.append([char_to_index[c] for c in text[i:i+seq_length]])
    y.append(char_to_index[text[i+seq_length]])
X = np.array(X).reshape(-1, seq_length, 1) / len(chars)  # Shape (samples, timesteps, 1), scaled to [0, 1]
y = tf.keras.utils.to_categorical(y, num_classes=len(chars))
# LSTM Model
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(seq_length, 1)),
    tf.keras.layers.Dense(len(chars), activation="softmax")
])
model.compile(optimizer="adam", loss="categorical_crossentropy")
model.fit(X, y, epochs=100, verbose=0)
# Generate text one character at a time
def generate_text(seed, length):
    result = seed
    for _ in range(length):
        encoded = np.array([char_to_index[c] for c in seed]).reshape(1, seq_length, 1) / len(chars)
        pred = model.predict(encoded, verbose=0)
        next_char = chars[np.argmax(pred)]
        result += next_char
        seed = seed[1:] + next_char
    return result
print(generate_text("I l", 10))  # Output: generated text (quality is poor on such a tiny corpus)
Advanced NLP Programming Questions (41-50)
41. How do you fine-tune a GPT-3 model using OpenAI's API?
Answer:
import openai
openai.api_key = "your-api-key"
# Note: this calls the legacy Completions endpoint with a prompt (inference);
# actual fine-tuning is a separate workflow (see the sketch below)
response = openai.Completion.create(
    engine="davinci",
    prompt="Translate English to French: 'Hello, how are you?'",
    max_tokens=50
)
print(response.choices[0].text.strip())
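For the fine-tuning itself, the pre-1.0 openai-python client exposed a file-upload plus fine-tune job workflow; a rough sketch, assuming a prepared train.jsonl file of prompt/completion pairs (endpoint and method names changed in later API versions, so treat this as illustrative only):
# Upload training data, then launch a fine-tune job (legacy openai-python < 1.0)
train_file = openai.File.create(file=open("train.jsonl", "rb"), purpose="fine-tune")
job = openai.FineTune.create(training_file=train_file.id, model="davinci")
print(job.id)  # Poll this job until the fine-tuned model is ready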
42. How do you perform zero-shot text classification using Hugging Face Transformers?
Answer:
from transformers import pipeline
classifier = pipeline("zero-shot-classification")
result = classifier(
    "I love NLP!",
    candidate_labels=["positive", "negative"]
)
print(result) # Output: {'labels': ['positive', 'negative'], 'scores': [0.99, 0.01]}
43. How do you perform multilingual text classification?
Answer:
from transformers import pipeline
classifier = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")
result = classifier("J'adore le NLP!")
print(result) # Output: [{'label': '5 stars', 'score': 0.99}]
44. How do you perform text summarization using BART?
Answer:
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
text = "Natural Language Processing (NLP) is a field of AI focused on the interaction between computers a
nd humans using natural language."
summary = summarizer(text, max_length=30, min_length=10, do_sample=False)
print(summary[0]['summary_text'])
45. How do you perform text generation using T5?
Answer:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
input_text = "translate English to French: Hello, how are you?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")
output = model.generate(input_ids)
print(tokenizer.decode(output[0], skip_special_tokens=True)) # Output: Bonjour, comment ça va ?