# !pip install seaborn
# !pip install scikit-learn
# 1. Perform dimensionality reduction using scikit-learn’s TSNE
# estimator on the Iris dataset, then graph
# the results.
# Reduce the Iris measurements to two t-SNE components and draw them as a
# scatter plot coloured by species.
print("Piyush Aanand")
print("2241001002")
from sklearn.datasets import load_iris
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

iris = load_iris()
X, y = iris.data, iris.target

# A fixed random_state keeps the seed identical between runs.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

# Map the integer targets to species names so the legend shows names.
species = iris.target_names[y]
component_1 = X_tsne[:, 0]
component_2 = X_tsne[:, 1]

plt.figure(figsize=(8, 6))
sns.scatterplot(x=component_1, y=component_2, hue=species, palette='viridis')
plt.title('t-SNE Visualization of Iris Dataset')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend(title='Species')
plt.show()
Piyush Aanand
2241001002
# 2. Create a Seaborn pairplot graph for the
# California Housing dataset. Try the Matplotlib features for
# panning and zooming in on the diagram.
# These are accessible via the icons in the Matplotlib window.
# Build a DataFrame from the California Housing data and draw a Seaborn
# pairplot over the selected columns.
print("Piyush Aanand")
print("2241001002")
from sklearn.datasets import fetch_california_housing
import seaborn as sns
import pandas as pd

california = fetch_california_housing()

# Feature columns plus the regression target as an extra column.
data = pd.DataFrame(
    california.data, columns=california.feature_names
).assign(MedHouseVal=california.target)

# Columns shown in the pairplot grid.
pairplot_columns = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'MedHouseVal',
]
sns.pairplot(data, vars=pairplot_columns)

# NOTE: `plt` is not imported in this cell; it relies on the
# `import matplotlib.pyplot as plt` executed earlier in the file.
plt.suptitle('California Housing Dataset Pairplot', y=1.02)
plt.show()
Piyush Aanand
2241001002
# 3. Go to NOAA’s Climate at a Glance page (Link)
# and download the available time series data for
# the average annual temperatures of New York City
# from 1895 to today (1895-2025). Implement simple
# linear regression using average annual temperature
# data. Also, show how the temperature trend
# compares to the average January high temperatures.
# Fit a simple linear regression of yearly mean temperature against year
# using the readings in nyc_temp.csv, then plot the data and the trend line.
print("Piyush Aanand")
print("2241001002")
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

data = pd.read_csv('nyc_temp.csv')

# errors='coerce' turns unparseable timestamps / readings into NaT / NaN
# instead of raising.
data['time'] = pd.to_datetime(data['time'], errors='coerce')
data['Year'] = data['time'].dt.year
data['Temperature'] = pd.to_numeric(data['temperature_2m (°C)'],
                                    errors='coerce')

# groupby skips rows whose Year is missing and mean() skips NaN readings,
# but a year with no valid reading at all still yields a NaN mean.
# LinearRegression.fit rejects NaN, so drop such years before fitting.
annual_avg = (
    data.groupby('Year')['Temperature']
    .mean()
    .dropna()
    .reset_index()
)

X = annual_avg[['Year']]      # 2-D feature matrix expected by scikit-learn
y = annual_avg['Temperature']

model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

plt.figure(figsize=(10, 6))
plt.scatter(annual_avg['Year'], y, color='blue',
            label='Avg Annual Temperature')
plt.plot(annual_avg['Year'], y_pred, color='red', label='Trend Line')
plt.xlabel('Year')
plt.ylabel('Average Annual Temperature (°C)')
plt.title('NYC Average Annual Temperature Trend')
plt.legend()
plt.grid(True)
plt.show()
Piyush Aanand
2241001002
# 4. Load the Iris dataset from the scikit-learn library
# and perform classification on it with the k-nearest
# neighbors algorithm. Use a KNeighborsClassifier with
# the default k value. What is the prediction
# accuracy?
# Train a k-nearest-neighbours classifier on a 70/30 split of Iris and
# report its accuracy on the held-out 30%.
print("Piyush Aanand")
print("2241001002")
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
X, y = iris.data, iris.target

# Fixed random_state so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# No arguments: the classifier uses the library default for n_neighbors.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction Accuracy: {accuracy:.2f}")
Piyush Aanand
2241001002
Prediction Accuracy: 1.00
# 5. You are given a dataset of 2D points
# with their corresponding class labels.
# The dataset is as follows:
# Point ID x y Class
# A 2.0 3.0 0
# B 1.0 1.0 0
# C 4.0 4.0 1
# D 5.0 2.0 1
# A new point P with coordinates (3.0, 3.0)
# needs to be classified using the KNN algorithm.
# Use the Euclidean distance to calculate the distance
# between points.
# Classify point P by majority vote among its k nearest labelled points,
# using Euclidean distance.
print("Piyush Aanand")
print("2241001002")
import numpy as np
from collections import Counter

# Labelled training points, keyed by point ID.
points = {
    'A': {'x': 2.0, 'y': 3.0, 'class': 0},
    'B': {'x': 1.0, 'y': 1.0, 'class': 0},
    'C': {'x': 4.0, 'y': 4.0, 'class': 1},
    'D': {'x': 5.0, 'y': 2.0, 'class': 1},
}
P = (3.0, 3.0)  # the point to classify

# (distance to P, class label) for every training point.
distances = []
for point in points.values():
    dist = np.sqrt((point['x'] - P[0])**2 + (point['y'] - P[1])**2)
    distances.append((dist, point['class']))

# Tuples sort by distance first, so the nearest neighbours come first.
distances.sort()

k = 3
nearest_classes = [label for _, label in distances[:k]]

# Majority vote among the k nearest labels.
most_common = Counter(nearest_classes).most_common(1)
predicted_class = most_common[0][0]
print(f"Predicted class for point P: {predicted_class}")
Piyush Aanand
2241001002
Predicted class for point P: 1
# 6. A teacher wants to classify
# students as ”Pass” or ”Fail”
# based on their performance
# in three exams.
# The dataset includes three features:
# Exam 1 Score Exam 2 Score Exam 3 Score Class (Pass/Fail)
# 85 90 88 Pass
# 70 75 80 Pass
# 60 65 70 Fail
# 50 55 58 Fail
# 95 92 96 Pass
# 45 50 48 Fail
# A new student has the following scores:
# • Exam 1 Score: 72
# • Exam 2 Score: 78
# • Exam 3 Score: 75
# Classify this student using the K-Nearest Neighbors
# (KNN) algorithm with k = 3.
# Classify a new student as Pass/Fail by majority vote among the k nearest
# students in (exam 1, exam 2, exam 3) score space.
print("Piyush Aanand")
print("2241001002")
import numpy as np
from collections import Counter

# Each row: exam 1, exam 2, exam 3, label (1 = Pass, 0 = Fail).
data = [
    [85, 90, 88, 1],
    [70, 75, 80, 1],
    [60, 65, 70, 0],
    [50, 55, 58, 0],
    [95, 92, 96, 1],
    [45, 50, 48, 0],
]
new_student = [72, 78, 75]

# (Euclidean distance to the new student, label) for every known student.
distances = []
for row in data:
    dist = np.sqrt(
        (row[0] - new_student[0])**2
        + (row[1] - new_student[1])**2
        + (row[2] - new_student[2])**2
    )
    distances.append((dist, row[3]))

# Tuples sort by distance first, so the nearest neighbours come first.
distances.sort()

k = 3
nearest_classes = [label for _, label in distances[:k]]

# Majority vote among the k nearest labels.
most_common = Counter(nearest_classes).most_common(1)
result = "Pass" if most_common[0][0] == 1 else "Fail"
print(f"The student is predicted to: {result}")
Piyush Aanand
2241001002
The student is predicted to: Pass
# 7. Using scikit-learn’s KFold class and
# the cross_val_score function, determine
# the optimal value for k to classify the
# Iris dataset using a KNeighborsClassifier.
# Pick the n_neighbors value (1..20) with the best mean 10-fold
# cross-validated accuracy on Iris, then plot accuracy against k.
print("Piyush Aanand")
print("2241001002")
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

iris = load_iris()
X = iris.data
y = iris.target

k_values = range(1, 21)

# Use an explicit KFold splitter, as the exercise asks. Passing a bare
# integer as `cv` would make cross_val_score choose stratified folds for a
# classifier instead. The Iris rows are grouped by class, so the folds must
# be shuffled; random_state makes the shuffle reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=kfold, scoring='accuracy')
    cv_scores.append(scores.mean())

# argmax returns the first maximum, i.e. the smallest k among ties.
optimal_k = k_values[np.argmax(cv_scores)]
print(f"Optimal k value: {optimal_k}")

import matplotlib.pyplot as plt
plt.plot(k_values, cv_scores)
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.title('Accuracy vs k for KNN on Iris Dataset')
plt.show()
Piyush Aanand
2241001002
Optimal k value: 13
# 8. Write a Python script to perform K-Means
# clustering on the following dataset:
# Dataset: {(1, 1),(2, 2),(3, 3),(8, 8),(9, 9),(10, 10)}
# Use k=2 and visualize the clusters.
# Cluster six 2-D points into two groups with K-Means and plot the points
# coloured by cluster, with the cluster centres marked.
print("Piyush Aanand")
print("2241001002")
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

data = np.array([[1, 1], [2, 2], [3, 3], [8, 8], [9, 9], [10, 10]])

kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(data)
labels = kmeans.labels_
centers = kmeans.cluster_centers_

# Data points, coloured by assigned cluster.
x_coords, y_coords = data[:, 0], data[:, 1]
plt.scatter(x_coords, y_coords, c=labels, cmap='viridis')

# Cluster centres drawn as red crosses on top.
plt.scatter(
    centers[:, 0],
    centers[:, 1],
    marker='x',
    s=200,
    linewidths=3,
    color='red',
)
plt.title('K-Means Clustering (k=2)')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
Piyush Aanand
2241001002
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
# 9. Write a Python script to perform K-Means clustering
# on the following dataset: Mall Customer Segmentation.
# Use k = 5 (also, determine optimal k via the Elbow Method)
# and visualize the clusters to
# identify customer segments.
# Expected Output:
# • Scatter plot showing clusters (e.g., “High
# Income-Low Spenders,” “Moderate Income-Moderate Spenders”).
# • Insights for targeted marketing strategies.
# Segment mall customers by annual income and spending score with K-Means:
# draw the elbow curve for k = 1..10, cluster with k = 5, plot the clusters,
# and print a marketing strategy per segment.
print("Piyush Aanand")
print("2241001002")
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

mall_data = pd.read_csv('mall_customers.csv')
X = mall_data[['Annual Income (k$)', 'Spending Score (1-100)']]

# Standardize both features so neither dominates the distance computation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: within-cluster sum of squares for k = 1..10.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# The model was fitted on standardized data, so its centroids are in
# standardized units. Convert them back to the original units before drawing
# them on axes that show raw income / spending score.
centers = scaler.inverse_transform(kmeans.cluster_centers_)

plt.figure(figsize=(10, 6))
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=clusters, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], s=300, c='red', marker='x')
plt.title('Customer Segments')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()


def _level(z_score):
    """Bucket a standardized centroid coordinate as Low / Moderate / High.

    Heuristic: more than half a standard deviation from the mean counts as
    Low or High; anything in between is Moderate.
    """
    if z_score < -0.5:
        return 'Low'
    if z_score > 0.5:
        return 'High'
    return 'Moderate'


# K-Means cluster ids are arbitrary, so derive each segment's name from where
# its centroid actually lies instead of hard-coding an id -> name mapping.
segment_names = {
    seg_id: f"{_level(income_z)} Income-{_level(spending_z)} Spenders"
    for seg_id, (income_z, spending_z) in enumerate(kmeans.cluster_centers_)
}

# Strategy text per named segment; any other combination gets the default.
strategies = {
    'High Income-High Spenders':
        "Strategy: Premium loyalty programs, exclusive offers",
    'Low Income-High Spenders':
        "Strategy: Budget-friendly premium options, payment plans",
    'High Income-Low Spenders':
        "Strategy: Highlight value and quality, personalized recommendations",
    'Low Income-Low Spenders':
        "Strategy: Discounts, value bundles, essential items",
}
default_strategy = "Strategy: Balanced offers, seasonal promotions"

print("\nCustomer Segments and Marketing Strategies:")
for seg_id, seg_name in segment_names.items():
    print(f"\nSegment {seg_id}: {seg_name}")
    print(strategies.get(seg_name, default_strategy))
Piyush Aanand
2241001002
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
D:\Installed-Apps\Anaconda\Lib\site-packages\sklearn\cluster\
_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on
Windows with MKL, when there are less chunks than available threads.
You can avoid it by setting the environment variable
OMP_NUM_THREADS=1.
warnings.warn(
Customer Segments and Marketing Strategies:
Segment 0: Low Income-High Spenders
Strategy: Budget-friendly premium options, payment plans
Segment 1: Moderate Income-Moderate Spenders
Strategy: Balanced offers, seasonal promotions
Segment 2: High Income-Low Spenders
Strategy: Highlight value and quality, personalized recommendations
Segment 3: Low Income-Low Spenders
Strategy: Discounts, value bundles, essential items
Segment 4: High Income-High Spenders
Strategy: Premium loyalty programs, exclusive offers
# 10. Perform the following tasks
# using the pandas Series object:
# (a) Create a Series from the list [7, 11, 13, 17].
# (b) Create a Series with five
# elements where each element is 100.0.
# (c) Create a Series with 20 elements that
# are all random numbers in the range 0 to 100. Use the
# describe method to produce
# the Series’ basic descriptive statistics.
# (d) Create a Series called temperatures with
# the following floating-point values: 98.6, 98.9,
# 100.2, and 97.9. Use the index keyword argument
# to specify the custom indices ’Julie’,
# ’Charlie’, ’Sam’, and ’Andrea’.
# (e) Form a dictionary from the names
# and values in Part (d), then use it to initialize a Series.
# Exercises (a)-(e) on constructing and describing pandas Series objects.
print("Piyush Aanand")
print("2241001002")
import pandas as pd
import numpy as np

# (a) Series from a plain list; pandas assigns the default 0..n-1 index.
s1 = pd.Series([7, 11, 13, 17])
print("(a) Series from list:\n", s1)

# (b) Five elements, all 100.0.
s2 = pd.Series([100.0] * 5)
print("\n(b) Series with five 100.0 elements:\n", s2)

# (c) 20 random integers; randint's upper bound is exclusive, so 101
# makes 100 a possible value.
s3 = pd.Series(np.random.randint(0, 101, size=20))
print("\n(c) Series with 20 random numbers (0-100):\n", s3)
print("Descriptive statistics:\n", s3.describe())

# (d) Custom string index instead of the default integer index.
temperatures = pd.Series(
    [98.6, 98.9, 100.2, 97.9],
    index=['Julie', 'Charlie', 'Sam', 'Andrea'],
)
print("\n(d) Temperatures Series with custom indices:\n",
      temperatures)

# (e) Build a name -> value dict from (d) and initialise a Series from it;
# the dict keys become the index.
temp_dict = temperatures.to_dict()
s5 = pd.Series(temp_dict)
print("\n(e) Series from dictionary:\n", s5)
Piyush Aanand
2241001002
(a) Series from list:
0 7
1 11
2 13
3 17
dtype: int64
(b) Series with five 100.0 elements:
0 100.0
1 100.0
2 100.0
3 100.0
4 100.0
dtype: float64
(c) Series with 20 random numbers (0-100):
0 34
1 90
2 36
3 30
4 61
5 39
6 90
7 31
8 90
9 73
10 98
11 34
12 51
13 84
14 6
15 68
16 0
17 79
18 97
19 69
dtype: int32
Descriptive statistics:
count 20.000000
mean 58.000000
std 30.255056
min 0.000000
25% 34.000000
50% 64.500000
75% 85.500000
max 98.000000
dtype: float64
(d) Temperatures Series with custom indices:
Julie 98.6
Charlie 98.9
Sam 100.2
Andrea 97.9
dtype: float64
(e) Series from dictionary:
Julie 98.6
Charlie 98.9
Sam 100.2
Andrea 97.9
dtype: float64