ML 1-11

The document contains a series of Python programs using NumPy, Pandas, Matplotlib, Seaborn, and Scikit-learn for data analysis and machine learning tasks. It covers array creation, data manipulation, exploratory data analysis, simple and multiple linear regression, logistic regression, decision tree classification, random forests, Naïve Bayes, K-nearest neighbours, support vector machines, K-Means clustering, and a comparative performance analysis of classifiers. Each section includes code for loading datasets, preprocessing, model training, evaluation, and visualization.


WEEK 0. Write Python programs using common libraries (NumPy, Pandas, Matplotlib, Seaborn)

1. Using NumPy

import numpy as np

# Create an array
array = np.array([1, 2, 3, 4, 5])
print("Array:", array)

# Perform element-wise and aggregate operations
print("Array + 2:", array + 2)
print("Mean of Array:", np.mean(array))

2. Using Pandas

import pandas as pd

# Create a DataFrame
data = {"Name": ["Poojitha", "Bob", "Dharani"],
        "Age": [25, 30, 35],
        "City": ["New York", "Los Angeles", "Chicago"]}
df = pd.DataFrame(data)
print(df)

# Add a new column derived from an existing one
df["Age in 5 Years"] = df["Age"] + 5
print("\nUpdated DataFrame:\n", df)

3. Using Matplotlib

import matplotlib.pyplot as plt

# Data for plotting
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]

# Plotting
plt.plot(x, y, marker="o", color="blue", label="Line Plot")
plt.title("Simple Line Plot")
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.legend()
plt.show()

4. Using Seaborn

import seaborn as sns
import matplotlib.pyplot as plt

# Load a sample dataset bundled with Seaborn
tips = sns.load_dataset("tips")

# Plot a scatter plot
sns.scatterplot(data=tips, x="total_bill", y="tip", hue="day")
plt.title("Scatter Plot of Tips")
plt.show()

5. Using Pandas for dataset preprocessing

import pandas as pd

# Load the dataset
df = pd.read_csv("C:/Users/kishore/Desktop/small_dataset.csv")

# Display the dataset
print("Dataset:")
print(df)

# Preprocessing steps:

# 1. Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# 2. Check data types of columns
print("\nData Types:")
print(df.dtypes)

# 3. Add a new column: Age Group
df["Age Group"] = pd.cut(df["Age"], bins=[20, 30, 40, 50], labels=["20-30", "30-40", "40-50"])
print("\nDataset with Age Group:")
print(df)

# 4. Filter rows where Salary > 80,000
high_salary = df[df["Salary"] > 80000]
print("\nRows with Salary > 80,000:")
print(high_salary)
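If the missing-value check above reports any gaps, they can be filled before further analysis. A short optional sketch, assuming 'Age' and 'Salary' are numeric columns in this dataset:

# Optional: fill missing numeric values with the column median (sketch; column names assumed)
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Salary"] = df["Salary"].fillna(df["Salary"].median())
print("\nMissing Values After Filling:")
print(df.isnull().sum())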

WEEK 1. Write a Python program to implement Exploratory Data Analysis on a dataset:
modifying data, removing unwanted data, retrieving data, getting statistical information, and
drawing a plot/graph.

Program:

import pandas as pd              # For data manipulation
import numpy as np               # For numerical operations
import matplotlib.pyplot as plt  # For plotting graphs
import seaborn as sns            # For advanced visualizations
import warnings                  # To suppress warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Load the dataset
data = pd.read_csv('C:/Users/kishore/Desktop/EDA_dataset.csv')

# Display dataset structure
print("\nDataset Loaded Successfully!")
print(f"Number of Rows: {data.shape[0]}, Number of Columns: {data.shape[1]}")

# Show column names
print("\nColumns in Dataset:")
print(list(data.columns))

# a. Modifying Data: Add a new column 'Annual_Salary' (Salary * 12)
data['Annual_Salary'] = data['Salary'] * 12

# Display the first 5 rows to confirm the modification
print("\nModified Data (Added 'Annual_Salary' Column):")
print(data[['ID', 'Salary', 'Annual_Salary']].head())

# b. Removing Unwanted Data: Drop rows where 'Performance_Score' or 'Salary' is missing
data_cleaned = data.dropna(subset=['Performance_Score', 'Salary'])

# Display the number of rows after cleaning
print(f"\nData Cleaned Successfully! Remaining Rows: {data_cleaned.shape[0]}")

# c. Retrieving Data: Filter employees in 'IT' department with salary > 50,000
it_high_salary = data_cleaned[(data_cleaned['Department'] == 'IT') & (data_cleaned['Salary'] > 50000)]

# Display the filtered results
print(f"\nNumber of IT Employees with Salary > 50,000: {it_high_salary.shape[0]}")
print(it_high_salary[['ID', 'Salary', 'Department']].head())

# d. Getting Statistical Information: Summary of numerical columns
print("\nStatistical Summary of Numerical Columns:")
print(data_cleaned.describe())

# e. Draw Plot/Graph: Visualize data insights

# Set a consistent style for plots
sns.set(style="whitegrid")

# 1. Distribution of Salary
plt.figure(figsize=(8, 6))
sns.histplot(data_cleaned['Salary'], kde=True, bins=30, color='blue')
plt.title("Distribution of Salaries", fontsize=16)
plt.xlabel("Salary", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()

# 2. Count of Employees by Department
plt.figure(figsize=(8, 6))
sns.countplot(data=data_cleaned, x='Department', palette='viridis')
plt.title("Count of Employees by Department", fontsize=16)
plt.xlabel("Department", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Boxplot of Salary by Education Level
plt.figure(figsize=(8, 6))
sns.boxplot(data=data_cleaned, x='Education_Level', y='Salary', palette='pastel')
plt.title("Boxplot of Salary by Education Level", fontsize=16)
plt.xlabel("Education Level", fontsize=12)
plt.ylabel("Salary", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Save the cleaned dataset
data_cleaned.to_csv('cleaned_dataset.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_dataset.csv'.")

WEEK 2. Prediction of a house price using Simple Linear Regression.

Program:

# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# Step 1: Load the dataset
# Update file_path if the dataset is stored elsewhere
file_path = 'C:/Users/kishore/Desktop/SimpleLinearRegression_dataset.csv'
df = pd.read_csv(file_path)

# Display basic information about the dataset
print("Dataset Loaded Successfully!")
print("First 5 rows of the dataset:")
print(df.head())

# Step 2: Feature selection
# For simplicity, we use 'sqft_living' as the predictor (X) and 'price' as the target (y)
X = df[['sqft_living']]
y = df['price']

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nDataset split into training and testing sets.")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Step 4: Train a Simple Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
print("\nModel training completed.")

# Step 5: Model evaluation
# Predict house prices on the testing set
y_pred = model.predict(X_test)

# Calculate and display evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nModel Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2 Score): {r2:.2f}")

# Step 6: Visualize the results
plt.figure(figsize=(10, 6))
plt.scatter(X_test, y_test, color='blue', label='Actual Prices')
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted Prices')
plt.title('Actual vs Predicted House Prices')
plt.xlabel('Living Area (sqft)')
plt.ylabel('Price')
plt.legend()
plt.show()

# Step 7: Display model coefficients
print("\nModel Coefficients:")
print(f"Intercept: {model.intercept_:.2f}")
print(f"Slope: {model.coef_[0]:.2f}")

# Step 8: Display Predicted vs Actual Prices
comparison_df = pd.DataFrame({'Actual Price': y_test.values, 'Predicted Price': y_pred})
print("\nPredicted vs Actual Prices:")
print(comparison_df.head(10))  # Display first 10 rows for brevity
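Optionally, the trained model can also estimate the price of a new house from its living area. A short sketch; the 2000 sqft figure is only an illustrative input:

# Optional: predict the price for a new living area, e.g. 2000 sqft (sketch)
new_area = pd.DataFrame({'sqft_living': [2000]})
predicted_price = model.predict(new_area)[0]
print(f"\nPredicted price for a 2000 sqft house: {predicted_price:.2f}")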

WEEK 3. Prediction of home prices using Multiple Linear Regression for a given dataset.

Program:

# Importing required libraries
import pandas as pd                                        # For data manipulation and analysis
import numpy as np                                         # For numerical computations
import matplotlib.pyplot as plt                            # For plotting
from sklearn.model_selection import train_test_split      # For splitting the dataset
from sklearn.linear_model import LinearRegression         # For creating the ML model
from sklearn.metrics import mean_squared_error, r2_score  # For model evaluation
import warnings                                            # To suppress warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Step 1: Load the dataset
# Update file_path to the location of MultipleLinearRegression_dataset.csv
file_path = "C:/Users/kishore/Desktop/MultipleLinearRegression_dataset.csv"
data = pd.read_csv(file_path)

# Step 2: Display basic information about the dataset
print("Dataset Overview:")
print(data.head(), "\n")   # Show the first few rows of the dataset
print("Dataset Info:")
data.info()                # info() prints the summary directly (it returns None)

# Step 3: Check for missing values
print("Missing Values in Dataset:")
print(data.isnull().sum(), "\n")

# Step 4: Define features (X) and target variable (y)
# Select all columns except the target column (House_Price)
X = data.drop(columns=['House_Price'])
y = data['House_Price']

# Step 5: Split the dataset into training and testing sets
# 80% of the data for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Initialize and train the Multiple Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 7: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 8: Evaluate the model
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
r2 = r2_score(y_test, y_pred)             # R-squared value

# Display evaluation metrics
print("Model Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared Value: {r2:.2f}")

# Step 9: Display feature importance (coefficients)
print("\nFeature Importance:")
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print(coefficients)

# Plot actual vs predicted house prices
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel("Actual House Prices")
plt.ylabel("Predicted House Prices")
plt.title("Actual vs Predicted House Prices")
plt.grid(True)
plt.show()
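Optionally, the fitted model can score a new house as well. Since the exact feature columns depend on the dataset, the short sketch below builds a one-row DataFrame from the training columns and uses the column means as a stand-in input:

# Optional: predict the price of a 'typical' house built from the feature means (sketch)
typical_house = X.mean().to_frame().T   # one-row DataFrame with the same columns as X
predicted_price = model.predict(typical_house)[0]
print(f"\nPredicted price for an average house: {predicted_price:.2f}")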

WEEK 4.Apply Logistic Regression Model and predict whether a person takes insurance
or not based on his age for a given dataset.

Program:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Suppress warnings for clean output
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load the dataset
dataset_path = "C:/Users/kishore/Desktop/LogisticRegression_dataset.csv"  # Path to the dataset
df = pd.read_csv(dataset_path)

# Display the first few rows of the dataset to verify
print("\nFirst few rows of the dataset:")
print(df.head())

# Prepare data for training
X = df[['Age', 'Income', 'Marital_Status']]
y = df['Has_Insurance']

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Evaluate model accuracy
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)

# Predictions for new test data
test_data = np.array([
    [25, 20000, 0],  # Age=25, Income=20000, Single  (likely No Insurance)
    [40, 35000, 1],  # Age=40, Income=35000, Married (likely Yes Insurance)
    [45, 40000, 1],  # Age=45, Income=40000, Married (likely Yes Insurance)
    [30, 25000, 0],  # Age=30, Income=25000, Single  (likely No Insurance)
    [50, 46000, 1],  # Age=50, Income=46000, Married (likely Yes Insurance)
    [35, 29000, 0],  # Age=35, Income=29000, Single  (likely No Insurance)
    [33, 28000, 1],  # Age=33, Income=28000, Married (likely No Insurance)
    [53, 45000, 1]   # Age=53, Income=45000, Married (likely Yes Insurance)
])

# Scale the test data with the same scaler used for training
test_data_scaled = scaler.transform(test_data)

# Make predictions for the test data
predictions = log_reg.predict(test_data_scaled)

# Output the results for the test data
print("\nPredictions for Test Data:")
for pred, row in zip(predictions, test_data):
    age, income, marital_status = row
    has_insurance = "Yes" if pred == 1 else "No"
    print(f"Age: {age}, Income: {income}, Marital Status: {'Married' if marital_status == 1 else 'Single'}, "
          f"Prediction (Has Insurance?): {has_insurance}")


WEEK 5.Apply Decision Tree Classifier model to take a decision whether to play cricket or
not under given conditions.

Program:
# Importing necessary libraries
import pandas as pd                                                  # For data manipulation
from sklearn.model_selection import train_test_split                # For splitting data into training and testing sets
from sklearn.tree import DecisionTreeClassifier                     # The Decision Tree Classifier
from sklearn.metrics import accuracy_score, classification_report   # For model evaluation
from sklearn.tree import plot_tree                                  # To visualize the tree
import matplotlib.pyplot as plt

# Suppress warnings for clean output
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load the dataset
dataset_path = "C:/Users/kishore/Downloads/balanced_cricket_dataset.csv"  # Path to the dataset
data = pd.read_csv(dataset_path)

# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())

# Step 2: Feature selection and preprocessing
# Separate the features (X) and the target variable (y)
X = data[['Weather', 'Temperature', 'Humidity', 'Windy']]  # Features
y = data['PlayCricket']                                    # Target variable

# Convert categorical data into numerical values (if necessary)
X = pd.get_dummies(X, columns=['Weather', 'Temperature', 'Humidity', 'Windy'], drop_first=True)

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Initialize and train the Decision Tree Classifier
model = DecisionTreeClassifier(random_state=42)  # Creating the model
model.fit(X_train, y_train)                      # Training the model

# Step 5: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy: {:.2f}%".format(accuracy * 100))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 7: Display the feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Step 8: Predict the outcome for a set of example conditions
# Define example conditions based on the dataset's features
conditions = pd.DataFrame({
    'Weather': ['Sunny', 'Sunny', 'Overcast', 'Overcast', 'Rain', 'Rain'],
    'Temperature': ['Hot', 'Mild', 'Hot', 'Cool', 'Mild', 'Cool'],
    'Humidity': ['High', 'Normal', 'High', 'Normal', 'High', 'Normal'],
    'Windy': ['False', 'True', 'False', 'True', 'False', 'True']
})

# Convert conditions to the same feature structure as the training data
conditions_encoded = pd.get_dummies(conditions, columns=['Weather', 'Temperature', 'Humidity', 'Windy'], drop_first=True)
conditions_encoded = conditions_encoded.reindex(columns=X.columns, fill_value=0)

# Predict outcomes for all example conditions
predictions = model.predict(conditions_encoded)

# Combine conditions with predictions for display
conditions['PlayCricket'] = predictions
print("\nPredictions for Example Conditions:")
print(conditions)

# Structure of the Decision Tree
plt.figure(figsize=(20, 12))          # Increase figure size
plot_tree(model,
          feature_names=X.columns,
          class_names=['No', 'Yes'],
          filled=True,
          rounded=True,
          fontsize=10,                # Adjust font size for readability
          max_depth=3)                # Limit depth for clarity
plt.show()
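Besides the plot, the learned rules can be printed as plain text, which is often easier to read for small trees. A short optional sketch using scikit-learn's export_text:

# Optional: print the decision rules as plain text (sketch)
from sklearn.tree import export_text
rules = export_text(model, feature_names=list(X.columns))
print("\nDecision Tree Rules:")
print(rules)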
Week 6

6. Train a Random Forest on the scikit-learn digits dataset and check whether the model correctly
predicts the handwritten digits.

Program:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress warnings for cleaner output


warnings.filterwarnings("ignore")

# Load the dataset


# Ensure the CSV file is in the working directory or provide its full path
data_path = '/content/balanced_digits_dataset.csv' # Update with the correct file path
data = pd.read_csv(data_path)

# Display the first few rows of the dataset


print("\nDataset Preview:\n")
print(data.head())

# Separate features (pixels) and labels (target)


X = data.drop(columns=['label']) # Drop the target column
y = data['label'] # Extract the target column

# Display dataset information


print("\nDataset Information:\n")
print(f"Number of samples: {len(data)}")
print(f"Number of features: {X.shape[1]}")
print(f"Unique labels: {sorted(y.unique())}")

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData Split Summary:\n")


print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Train a Random Forest classifier


rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model


accuracy = accuracy_score(y_test, y_pred)
print("\nModel Evaluation:\n")
print(f"Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n")
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix


plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=sorted(y.unique()), yticklabels=sorted(y.unique()))
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Display feature importance


importances = rf_model.feature_importances_
important_features = pd.DataFrame(
    {"Feature": X.columns, "Importance": importances}
).sort_values(by="Importance", ascending=False)

print("\nTop 10 Most Important Features:\n")


print(important_features.head(10))

# Plot feature importance


plt.figure(figsize=(12, 6))
plt.bar(important_features['Feature'][:10], important_features['Importance'][:10], color='green')
plt.title("Top 10 Feature Importances")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.xticks(rotation=45)
plt.show()

# Include example predictions


print("\nExample Predictions:\n")
example_indices = np.random.choice(X_test.index, 5, replace=False) # Randomly select 5 test samples
example_inputs = X_test.loc[example_indices]
example_true_labels = y_test.loc[example_indices]
example_predictions = rf_model.predict(example_inputs)

for i, index in enumerate(example_indices):
    print(f"Sample {i+1} - True Label: {example_true_labels.loc[index]}, "
          f"Predicted Label: {example_predictions[i]}")
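If the CSV file is not available, scikit-learn also ships the digits data directly via sklearn.datasets.load_digits, so the same workflow can be reproduced without an external file. A short optional sketch:

# Optional: load the digits data directly from scikit-learn instead of a CSV (sketch)
from sklearn.datasets import load_digits
digits = load_digits()
X_alt = pd.DataFrame(digits.data)   # 64 pixel features per 8x8 image
y_alt = pd.Series(digits.target)    # digit labels 0-9
Xa_train, Xa_test, ya_train, ya_test = train_test_split(X_alt, y_alt, test_size=0.2, random_state=42)
rf_alt = RandomForestClassifier(n_estimators=100, random_state=42).fit(Xa_train, ya_train)
print(f"Accuracy on the built-in digits dataset: {accuracy_score(ya_test, rf_alt.predict(Xa_test)) * 100:.2f}%")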

Output:

Week 7

7.Analyze the weather data using Gaussian Naïve Bayes classifier and predict whether cricket can
be played or not based on given new data.

Program:

# Importing necessary libraries


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the dataset from the CSV file


df = pd.read_csv('/content/weather_data.csv')

# Splitting the dataset into features (X) and target (y)


X = df.drop('Can Play Cricket', axis=1) # Features: Temperature, Humidity, Wind Speed, Rainy
y = df['Can Play Cricket'] # Target: Whether cricket can be played or not

# Splitting the dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing the Gaussian Naïve Bayes model


model = GaussianNB()

# Training the model with the training data


model.fit(X_train, y_train)

# Making predictions on the test set


y_pred = model.predict(X_test)

# Checking the accuracy of the model


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy * 100:.2f}%")

# Output the predictions for the test data
print("\nPredictions on Test Data:")
for i in range(len(X_test)):
    prediction = "Yes" if y_pred[i] == 1 else "No"
    print(f"Data: {X_test.iloc[i].to_dict()}, Prediction: {prediction}")

# Making predictions on new data
def predict_cricket_playability(new_data):
    """
    Predict whether cricket can be played based on the given weather conditions.

    new_data: a dictionary of weather conditions, e.g.
    {'Temperature': value, 'Humidity': value, 'Wind Speed': value, 'Rainy': value}
    """
    new_df = pd.DataFrame([new_data])
    prediction = model.predict(new_df)
    return "Yes" if prediction[0] == 1 else "No"

# Example: New data to predict (can be replaced with real values)


new_weather_data = {
    'Temperature': 28,   # in °C
    'Humidity': 65,      # in percentage
    'Wind Speed': 8,     # in km/h
    'Rainy': 0           # 0 = No Rain, 1 = Rain
}

# Predicting whether cricket can be played with the new weather data
result = predict_cricket_playability(new_weather_data)
print(f"\nCan cricket be played with new data? {result}")

Output:

Week 8

8.Prediction of breast cancer benign or malignant using KNN Classifier.

Program:

# Import necessary libraries


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings

# Suppress warnings for clean output


warnings.filterwarnings("ignore")

# Load the dataset


# Ensure the 'breast_cancer_knn_dataset.csv' file is in your working directory
dataset = pd.read_csv('/content/breast_cancer_knn_dataset.csv')

# Display the first 5 rows of the dataset to understand its structure


print("Dataset Preview:")
print(dataset.head())

# Basic information about the dataset


print("\nDataset Information:")
dataset.info()

# Check for missing values


print("\nMissing Values in the Dataset:")
print(dataset.isnull().sum())

# Encode the target column ('Diagnosis') into numerical values (Benign=0, Malignant=1)
le = LabelEncoder()
dataset['Diagnosis'] = le.fit_transform(dataset['Diagnosis'])

# Display unique values in the target column


print("\nUnique Values in 'Diagnosis':")
print(dataset['Diagnosis'].unique())

# Define features (X) and target (y)


X = dataset[['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness']]
y = dataset['Diagnosis']

# Standardize the feature values for better KNN performance


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a KNN Classifier


knn = KNeighborsClassifier(n_neighbors=5) # You can experiment with 'n_neighbors'
knn.fit(X_train, y_train)

# Make predictions on the test data


y_pred = knn.predict(X_test)

# Convert numerical predictions back to categorical labels


predictions_words = le.inverse_transform(y_pred)

# Evaluate the model's performance


print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

# Display a few predictions alongside actual values for manual verification


results = pd.DataFrame({
    'Actual': le.inverse_transform(y_test.values),
    'Predicted': predictions_words
})
print("\nSample Predictions:")
print(results.head())
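The choice n_neighbors=5 above is a starting point rather than a tuned value. A short optional sketch that compares test accuracy for a few values of k:

# Optional: compare test accuracy for several values of k (sketch)
for k in [1, 3, 5, 7, 9, 11]:
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    acc_k = accuracy_score(y_test, knn_k.predict(X_test))
    print(f"k = {k:2d} -> test accuracy = {acc_k:.3f}")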

Output:

Week 9

9.Classify the iris flowers dataset using SVM and find out the flower type depending on the given
input data.

Program :

# Import necessary libraries


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Suppress warnings for cleaner output


import warnings
warnings.filterwarnings("ignore")

# Step 1: Load the dataset


# The dataset should be in the same directory or provide the full path
file_path = "/content/svm_Iris_dataset.csv" # Replace with the correct path if needed
try:
    iris_data = pd.read_csv(file_path)
    print("Dataset loaded successfully!\n")
except FileNotFoundError:
    print("Error: The dataset file was not found. Please check the file path.")
    exit()

# Display the first 5 rows of the dataset for understanding


print("Preview of the dataset:\n")
print(iris_data.head())

# Step 2: Select features and the target variable


# Columns used as features: SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
# Target variable: Species

features = iris_data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
target = iris_data['Species']

# Step 3: Split the dataset into training and testing sets


# 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

print("\nData split into training and testing sets successfully!")


print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# Step 4: Train the SVM model


# Using the SVC class from sklearn
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

print("\nSVM model trained successfully!")

# Step 5: Make predictions on the test set


y_pred = svm_model.predict(X_test)

# Step 6: Evaluate the model


print("\nEvaluation Metrics:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)


print(f"Accuracy of the model: {accuracy:.2f}")

# Step 7: Predict flower type for new input data


# Example input: SepalLengthCm=5.1, SepalWidthCm=3.5, PetalLengthCm=1.4, PetalWidthCm=0.2
sample_data = [[5.1, 3.5, 1.4, 0.2]] # Replace with desired input
predicted_species = svm_model.predict(sample_data)
print(f"\nPredicted flower type for input {sample_data}: {predicted_species[0]}")

# Optional: Displaying all possible outputs for clarity


print("\nPossible classes in the dataset:", svm_model.classes_)

Output:

Week 10

10.Use K-Means clustering model and classify the employees into various income groups or
clusters.

Program:

# Importing necessary libraries


import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore") # Suppress unnecessary warnings


# Step 1: Load the dataset
data_path = "/content/balanced_employee_dataset.csv"  # Path to the dataset (update if the file is stored elsewhere)
df = pd.read_csv(data_path)

# Display the first few rows of the dataset


print("First 5 rows of the dataset:")
print(df.head())

# Step 2: Feature selection


# Select features relevant for clustering (excluding non-numeric columns like Job_Role and Cluster)
selected_features = ['Age', 'Experience', 'Education_Level', 'Skills_Rating', 'Income']
data = df[selected_features]

# Step 3: Data preprocessing


# Convert categorical data to numeric using one-hot encoding or label encoding
data_encoded = data.copy()

# Example: Encoding 'Education_Level' as a numeric column


data_encoded['Education_Level'] = data_encoded['Education_Level'].astype('category').cat.codes

# Standardizing the data


scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_encoded)

# Display the scaled data for verification


print("\nEncoded and scaled data (first 5 rows):")
print(pd.DataFrame(data_scaled, columns=selected_features).head())

# Step 4: Determine the optimal number of clusters using the Elbow Method
inertia = []
k_values = range(1, 11)
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data_scaled)
    inertia.append(kmeans.inertia_)

# Plot the Elbow curve


plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, marker='o')
plt.title('Elbow Method: Optimal Number of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.grid()
plt.show()

# Step 5: Apply K-Means clustering


# Choosing the number of clusters (e.g., 3, based on the Elbow curve)
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_labels = kmeans.fit_predict(data_scaled)

# Add the cluster labels to the original dataset


df['Cluster'] = cluster_labels

# Display the updated dataset


print("\nDataset with Cluster labels:")
print(df.head())

# Step 6: Evaluate the clustering performance using Silhouette Score


sil_score = silhouette_score(data_scaled, cluster_labels)
print(f"\nSilhouette Score: {sil_score:.2f}")

# Step 7: Visualize the clusters (using first two features for simplicity)
plt.figure(figsize=(8, 6))
colors = ['red', 'blue', 'green']
for i in range(optimal_k):
    plt.scatter(
        data_scaled[cluster_labels == i, 0],  # First feature
        data_scaled[cluster_labels == i, 1],  # Second feature
        label=f'Cluster {i}',
        s=50,
        alpha=0.7,
        color=colors[i]
    )

plt.title('K-Means Clustering Visualization')


plt.xlabel(selected_features[0])
plt.ylabel(selected_features[1])
plt.legend()

plt.grid()
plt.show()

# Display final cluster centers


cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
print("\nCluster Centers:")
print(pd.DataFrame(cluster_centers, columns=selected_features))

# Step 8: Display income distribution in each cluster


print("\nIncome distribution in each cluster:")
for cluster_id in range(optimal_k):
    cluster_data = df[df['Cluster'] == cluster_id]
    print(f"\nCluster {cluster_id}:")
    print(cluster_data[['Income']].describe())

# Step 9: Visualize income distributions


# Bar chart for average income per cluster
avg_income_per_cluster = df.groupby('Cluster')['Income'].mean()

plt.figure(figsize=(8, 6))
avg_income_per_cluster.plot(kind='bar', color=['red', 'blue', 'green'], alpha=0.7, edgecolor='black')
plt.title('Average Income per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Average Income')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Save the clustered dataset to a new CSV file


df.to_csv("clustered_dataset.csv", index=False)
print("\nClustered dataset saved to 'clustered_dataset.csv'")

Output:

Week 11

11.Performance analysis of Classification Algorithms on a specific dataset.

Program:

# Import necessary libraries


import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress warnings for clean output


warnings.filterwarnings("ignore")

# Load the dataset


dataset_path = "/content/Performace_Iris_dataset.csv" # Update this path as per your file location
data = pd.read_csv(dataset_path)

# Display the first few rows of the dataset


print("\nDataset Overview:")
print(data.head())

# Check for missing values


print("\nChecking for missing values:")
print(data.isnull().sum())

# Select features and target


X = data.iloc[:, :-1] # Assuming the last column is the target
y = data.iloc[:, -1]

# Split the dataset into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features for better model performance


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Function to train and evaluate classifiers
def evaluate_classifier(model, model_name):
    print(f"\nTraining {model_name}...")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"\n{model_name} Metrics:")
    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt='d', cmap='coolwarm')
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# List of classifiers to evaluate


classifiers = [
    (RandomForestClassifier(random_state=42), "Random Forest"),
    (SVC(kernel='linear', random_state=42), "Support Vector Machine"),
    (DecisionTreeClassifier(random_state=42), "Decision Tree"),
    (GaussianNB(), "Naive Bayes"),
    (KNeighborsClassifier(), "K-Nearest Neighbors"),
    (LogisticRegression(random_state=42), "Logistic Regression")
]

# Evaluate each classifier


for model, name in classifiers:
    evaluate_classifier(model, name)
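To compare the algorithms at a glance, the test accuracies can also be collected into one summary table. A short optional sketch; the models are refitted here for simplicity:

# Optional: summary table of test accuracies for all classifiers (sketch)
summary = pd.DataFrame(
    [(name, accuracy_score(y_test, model.fit(X_train, y_train).predict(X_test)))
     for model, name in classifiers],
    columns=['Classifier', 'Accuracy']
).sort_values(by='Accuracy', ascending=False)
print("\nAccuracy Summary:")
print(summary.to_string(index=False))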

Output:
