Assignment 3 Solution
Question 1:
Write Python code to read a large dataset and map its textual features to
numerical values. Then apply PCA to reduce the dimensionality of the dataset to 3
principal components and plot the result in 3-D.
Using the Iris dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D  # registers the 3-D projection

# Load the Iris dataset
iris = load_iris()
data = iris.data
target = iris.target
feature_names = iris.feature_names

# Convert the features to a DataFrame
df = pd.DataFrame(data, columns=feature_names)

# Apply PCA to reduce dimensionality to 3 components
pca = PCA(n_components=3)
principal_components = pca.fit_transform(df)

# Create a DataFrame with the principal components
df_pca = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2', 'PC3'])

# Plot the 3D graph
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')

# Scatter plot, coloured by class
scatter = ax.scatter(df_pca['PC1'], df_pca['PC2'], df_pca['PC3'], c=target, cmap='viridis')

# Legend
legend_labels = [f'Class {i}' for i in range(3)]
ax.legend(handles=scatter.legend_elements()[0], labels=legend_labels)

# Axes labels
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')

# Title
ax.set_title('3D PCA of Iris Dataset')

plt.show()
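A quick follow-up worth adding to the script above (not part of the original listing): pca.explained_variance_ratio_ reports how much of the original variance each principal component retains, which tells you how faithful the 3-D plot is to the full data.

print(pca.explained_variance_ratio_)        # variance share of PC1, PC2, PC3
print(pca.explained_variance_ratio_.sum())  # total retained; a value near 1 means little information was lost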
Using the Wine dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D  # registers the 3-D projection

# Load the Wine dataset from scikit-learn
wine = load_wine()
X = pd.DataFrame(data=wine.data, columns=wine.feature_names)

# Standardize the data so each feature has zero mean and unit variance
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Apply PCA to reduce dimensionality to 3 components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_std)

# Create a 3D scatter plot of the three principal components
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=wine.target, cmap='viridis')

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
plt.title('PCA of Wine Dataset (3 Components)')

plt.show()
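Note that both Iris and Wine already come with numerical features and integer-coded class labels, so the two scripts above need no text-to-number mapping. For a dataset that does contain textual features, a minimal sketch of that step might look like this (the DataFrame and column names here are hypothetical):

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Hypothetical raw data with one textual column
df = pd.DataFrame({
    'length': [5.1, 4.9, 6.3, 5.8],
    'color':  ['red', 'blue', 'red', 'green'],  # textual feature to encode
})

# Map each distinct string to an integer code
df['color'] = LabelEncoder().fit_transform(df['color'])
print(df)

# For nominal categories with no natural order, one-hot encoding
# (pd.get_dummies(df, columns=['color'])) avoids implying one.

After this step the DataFrame is fully numerical and can be passed to StandardScaler and PCA exactly as above.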
Question 2:
Apply Bayes' theorem to Covid testing with a total of 10,000 tests. Also calculate
the specificity, sensitivity, prior and posterior probabilities, and the false
positive and false negative rates.
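Before running the numbers in code, it helps to state the theorem itself. With prevalence P(Covid) = 0.01, sensitivity P(Positive | Covid) = 0.95, and false positive rate P(Positive | No Covid) = 0.05, the posterior probability of having Covid given a positive test is:

P(Covid | Positive) = P(Positive | Covid) * P(Covid) / P(Positive)
                    = (0.95 * 0.01) / (0.95 * 0.01 + 0.05 * 0.99)
                    = 0.0095 / 0.0590
                    ≈ 0.1610

The script below scales these (hypothetical) probabilities by the 10,000 tests to obtain the confusion-matrix counts, then derives the requested metrics from them.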
# Hypothetical values
P_A = 0.01              # Prevalence (1%)
P_B_given_A = 0.95      # Sensitivity (95%)
P_B_given_not_A = 0.05  # False positive rate (5%)

# Total number of tests
total_tests = 10000

# Calculate probabilities
P_not_A = 1 - P_A  # Complement of P(A)

# Calculate true positives, false positives, true negatives, and false negatives
true_positives = P_B_given_A * P_A * total_tests
false_positives = P_B_given_not_A * P_not_A * total_tests
true_negatives = (1 - P_B_given_not_A) * P_not_A * total_tests
false_negatives = (1 - P_B_given_A) * P_A * total_tests

# Calculate performance measures
accuracy = (true_positives + true_negatives) / total_tests
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)

# Calculate additional metrics
specificity = true_negatives / (true_negatives + false_positives)
sensitivity = recall
prior_probability = P_A
# Posterior P(Covid | Positive) by Bayes' theorem; this equals the precision (PPV)
posterior_probability = true_positives / (true_positives + false_positives)
false_positive_rate = false_positives / (false_positives + true_negatives)
false_negative_rate = false_negatives / (false_negatives + true_positives)

# Print the results
print(f"True Positives: {true_positives:.0f}")
print(f"False Positives: {false_positives:.0f}")
print(f"True Negatives: {true_negatives:.0f}")
print(f"False Negatives: {false_negatives:.0f}\n")

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Sensitivity: {sensitivity:.4f}")
print(f"Prior Probability: {prior_probability:.4f}")
print(f"Posterior Probability: {posterior_probability:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
True Positives: 95
False Positives: 495
True Negatives: 9405
False Negatives: 5
Accuracy: 0.9500
Precision: 0.1610
Recall (Sensitivity): 0.9500
F1 Score: 0.2754
Specificity: 0.9500
Sensitivity: 0.9500
Prior Probability: 0.0100
Posterior Probability: 0.1610
False Positive Rate: 0.0500
False Negative Rate: 0.0500
Question 3:
Solution 1: Generate a random dataset of true and predicted labels, then calculate,
interpret, and print the performance measures.
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Generate a random dataset of true and predicted labels
np.random.seed(42)

# Number of samples
num_samples = 1000

# True labels (0: Negative, 1: Positive)
true_labels = np.random.randint(2, size=num_samples)

# Simulate a classifier's predicted labels with some errors:
# force a random 20% of predictions to 1 (introducing false positives)
# and a random 10% to 0 (introducing false negatives)
predicted_labels = true_labels.copy()
predicted_labels[np.random.choice(num_samples, size=int(0.2 * num_samples))] = 1
predicted_labels[np.random.choice(num_samples, size=int(0.1 * num_samples))] = 0

# Calculate performance measures
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Print the results
# print(f"True Labels: {true_labels}")
# print(f"Predicted Labels: {predicted_labels}\n")

print(f"Confusion Matrix:\n{conf_matrix}\n")

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
Confusion Matrix:
[[398  92]
 [ 54 456]]
Accuracy: 0.8540
Precision: 0.8321
Recall: 0.8941
F1 Score: 0.8620
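Interpretation: of the 1000 samples, 398 true negatives and 456 true positives are classified correctly, giving the accuracy of 0.8540. The precision of 0.8321 says that about 83% of the samples predicted positive really are positive (the 92 false positives pull it down), while the recall of 0.8941 says that about 89% of the actual positives were found (54 were missed). The F1 score of 0.8620 is the harmonic mean of precision and recall, summarizing the trade-off between the two error types in a single number.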
Solution 2: Take a machine-learning classification example and generate all the
performance measures to show their significance and how they relate to one another.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X = iris.data[:, :2]  # Using only sepal length and width for simplicity
y = iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', C=1)

# Train the classifier
svm_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_classifier.predict(X_test)

# Calculate performance measures (weighted averages account for class support)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))
Output:
Accuracy: 0.9000
Precision: 0.9014
Recall: 0.9000
F1 Score: 0.8992
Confusion Matrix:
[[10  0  0]
 [ 0  7  2]
 [ 0  1 10]]
Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.88      0.78      0.82         9
   virginica       0.83      0.91      0.87        11

    accuracy                           0.90        30
   macro avg       0.90      0.90      0.90        30
weighted avg       0.90      0.90      0.90        30
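The report makes the relationships between the measures explicit: each class's f1-score is the harmonic mean of its precision and recall (for versicolor, 2 * 0.88 * 0.78 / (0.88 + 0.78) ≈ 0.82); the macro avg is the plain mean over the three classes; and the weighted avg weights each class by its support, e.g. weighted precision = (1.00*10 + 0.88*9 + 0.83*11) / 30 ≈ 0.90. Weighted recall always equals the accuracy (both count the fraction of correctly classified samples), which is why both read 0.90 here.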