WEEK 0. Write a Python program using libraries
1. Using NumPy
import numpy as np
# Create an array
array = np.array([1, 2, 3, 4, 5])
print("Array:", array)
# Perform operations
print("Array + 2:", array + 2)
print("Mean of Array:", np.mean(array))
2. Using Pandas
import pandas as pd
# Create a DataFrame
data = {"Name":["poojitha","Bob","Dharani"],
"Age":[25,30,35],
"City":["New York","Los Angeles","Chicago"]}
df = pd.DataFrame(data)
print(df)
# Add a new column
df["Age in 5 Years"] = df["Age"] + 5
print("\nUpdatedDataFrame:\n",df)
3. Using Matplotlib
import matplotlib.pyplot as plt
# Data for plotting
x = [1,2,3,4,5]
y = [2,4,6,8,10]
# Plotting
plt.plot(x, y, marker="o",color="blue", label="Line Plot")
plt.title("Simple Line Plot")
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.legend()
plt.show()
4. Using Seaborn
import seaborn as sns
import matplotlib.pyplot as plt
# Sample data
tips = sns.load_dataset("tips")
# Plot a scatter plot
sns.scatterplot(data=tips, x="total_bill", y="tip", hue="day")
plt.title("Scatter Plot of Tips")
plt.show()
5. Using Pandas (loading and preprocessing a CSV dataset)
import pandas as pd
# Load the dataset
df = pd.read_csv("C:/Users/kishore/Desktop/small_dataset.csv")
# Display the dataset
print("Dataset:")
print(df)
# Preprocessing steps:
# 1. Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())
# 2. Check data types of columns
print("\nData Types:")
print(df.dtypes)
# 3. Add a new column: Age Group
df["Age Group"] = pd.cut(df["Age"], bins=[20,30,40,50], labels=["20-30","30-40","40-50"])
print("\nDataset with Age Group:")
print(df)
# 4. Filter rows where Salary > 80,000
high_salary = df[df["Salary"]>80000]
print("\nRows with Salary > 80,000:")
print(high_salary)
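If the dataset contains missing values, they can also be filled rather than only counted. A minimal sketch, assuming the numeric columns Age and Salary used above:
# 5. Fill missing numeric values with the column mean (assumes Age and Salary are numeric)
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Salary"] = df["Salary"].fillna(df["Salary"].mean())
print("\nMissing Values after filling:")
print(df.isnull().sum())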
WEEK 1. Write a Python program to implement Exploratory Data Analysis (EDA) on a dataset: modifying data, removing unwanted data, retrieving data, getting statistical information, and drawing plots/graphs.
Program:
import pandas as pd # For data manipulation
import numpy as np # For numerical operations
import matplotlib.pyplot as plt # For plotting graphs
import seaborn as sns # For advanced visualizations
import warnings # To suppress warnings
# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")
# Load the dataset
data = pd.read_csv('C:/Users/kishore/Desktop/EDA_dataset.csv')
# Display dataset structure
print("\nDataset Loaded Successfully!")
print(f"Number of Rows: {data.shape[0]}, Number of Columns: {data.shape[1]}")
# Show column names
print("\nColumns in Dataset:")
print(list(data.columns))
# a. Modifying Data: Add a new column 'Annual_Salary' (Salary * 12)
data['Annual_Salary'] = data['Salary'] * 12
# Display the first 5 rows to confirm the modification
print("\nModified Data (Added 'Annual_Salary' Column):")
print(data[['ID', 'Salary', 'Annual_Salary']].head())
# b. Removing Unwanted Data: Drop rows where 'Performance_Score' or 'Salary' is missing
data_cleaned = data.dropna(subset=['Performance_Score', 'Salary'])
# Display the number of rows after cleaning
print(f"\nData Cleaned Successfully! Remaining Rows: {data_cleaned.shape[0]}")
# c. Retrieving Data: Filter employees in 'IT' department with salary > 50,000
it_high_salary = data_cleaned[(data_cleaned['Department'] == 'IT') & (data_cleaned['Salary'] > 50000)]
# Display the filtered results
print(f"\nNumber of IT Employees with Salary > 50,000: {it_high_salary.shape[0]}")
print(it_high_salary[['ID', 'Salary', 'Department']].head())
# d. Getting Statistical Information: Summary of numerical columns
print("\nStatistical Summary of Numerical Columns:")
print(data_cleaned.describe())
# e. Draw Plot/Graph: Visualize data insights
# Set a consistent style for plots
sns.set(style="whitegrid")
# 1. Distribution of Salary
plt.figure(figsize=(8, 6))
sns.histplot(data_cleaned['Salary'], kde=True, bins=30, color='blue')
plt.title("Distribution of Salaries", fontsize=16)
plt.xlabel("Salary", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()
# 2. Count of Employees by Department
plt.figure(figsize=(8, 6))
sns.countplot(data=data_cleaned, x='Department', palette='viridis')
plt.title("Count of Employees by Department", fontsize=16)
plt.xlabel("Department", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# 3. Boxplot of Salary by Education Level
plt.figure(figsize=(8, 6))
sns.boxplot(data=data_cleaned, x='Education_Level', y='Salary', palette='pastel')
plt.title("Boxplot of Salary by Education Level", fontsize=16)
plt.xlabel("Education Level", fontsize=12)
plt.ylabel("Salary", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Save the cleaned dataset
data_cleaned.to_csv('cleaned_dataset.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_dataset.csv'.")
WEEK 2. Prediction of a house price using Simple Linear Regression.
Program:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings
# Suppress warnings
warnings.filterwarnings('ignore')
# Step 1: Load the dataset
# Update the path below to match the location of the dataset
file_path = 'C:/Users/kishore/Desktop/SimpleLinearRegression_dataset.csv'
df = pd.read_csv(file_path)
# Display basic information about the dataset
print("Dataset Loaded Successfully!")
print("First 5 rows of the dataset:")
print(df.head())
# Step 2: Feature selection
# For simplicity, we use 'sqft_living' as the predictor (X) and 'price' as the target (y)
X = df[['sqft_living']]
y = df['price']
# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nDataset split into training and testing sets.")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
# Step 4: Train a Simple Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
print("\nModel training completed.")
# Step 5: Model evaluation
# Predict house prices on the testing set
y_pred = model.predict(X_test)
# Calculate and display evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nModel Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2 Score): {r2:.2f}")
# Step 6: Visualize the results
plt.figure(figsize=(10, 6))
plt.scatter(X_test, y_test, color='blue', label='Actual Prices')
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted Prices')
plt.title('Actual vs Predicted House Prices')
plt.xlabel('Living Area (sqft)')
plt.ylabel('Price')
plt.legend()
plt.show()
# Step 7: Display model coefficients
print("\nModel Coefficients:")
print(f"Intercept: {model.intercept_:.2f}")
print(f"Slope: {model.coef_[0]:.2f}")
# Step 8: Display Predicted vs Actual Prices
comparison_df = pd.DataFrame({'Actual Price': y_test.values, 'Predicted Price': y_pred})
print("\nPredicted vs Actual Prices:")
print(comparison_df.head(10)) # Display first 10 rows for brevity
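The fitted line can also be used to price a single new house; the 2000 sqft value below is only an illustration, not part of the dataset:
# Step 9: Predict the price of a hypothetical 2000 sqft house
new_area = pd.DataFrame({'sqft_living': [2000]})
predicted_price = model.predict(new_area)[0]
print(f"\nPredicted price for 2000 sqft: {predicted_price:.2f}")
# The same value follows from the line equation: price = intercept + slope * sqft_living
print(f"Check using the equation: {model.intercept_ + model.coef_[0] * 2000:.2f}")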
WEEK 3. Prediction of home prices using Multiple Linear Regression for a given dataset.
Program:
# Importing required libraries
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical computations
import matplotlib.pyplot as plt # For plotting (optional)
from sklearn.model_selection import train_test_split # For splitting the dataset
from sklearn.linear_model import LinearRegression # For creating the ML model
from sklearn.metrics import mean_squared_error, r2_score # For model evaluation
import warnings # To suppress warnings
# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
# Step 1: Load the dataset
# Update the path below to match the location of MultipleLinearRegression_dataset.csv
file_path = "C:/Users/kishore/Desktop/MultipleLinearRegression_dataset.csv"
data = pd.read_csv(file_path)
# Step 2: Display basic information about the dataset
print("Dataset Overview:")
print(data.head(), "\n") # Show the first few rows of the dataset
print("Dataset Info:")
print(data.info(), "\n") # Display dataset summary
# Step 3: Check for missing values
print("Missing Values in Dataset:")
print(data.isnull().sum(), "\n")
# Step 4: Define features (X) and target variable (y)
# Select all columns except the target column (House_Price)
X = data.drop(columns=['House_Price'])
y = data['House_Price']
# Step 5: Split the dataset into training and testing sets
# 80% data for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 6: Initialize and train the Multiple Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
# Step 7: Make predictions on the test set
y_pred = model.predict(X_test)
# Step 8: Evaluate the model
mse = mean_squared_error(y_test, y_pred) # Calculate Mean Squared Error
r2 = r2_score(y_test, y_pred) # Calculate R-squared value
# Display evaluation metrics
print("Model Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared Value: {r2:.2f}")
# Step 9: Display feature importance (coefficients)
print("\nFeature Importance:")
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print(coefficients)
# Plot actual vs predicted house prices
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel("Actual House Prices")
plt.ylabel("Predicted House Prices")
plt.title("Actual vs Predicted House Prices")
plt.grid(True)
plt.show()
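Because several predictors are used, the adjusted R-squared is a useful extra metric (it penalises unnecessary features). A minimal sketch based on the quantities already computed above:
# Adjusted R-squared = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
n = X_test.shape[0]  # number of test samples
p = X_test.shape[1]  # number of predictors
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"Adjusted R-squared: {adjusted_r2:.2f}")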
WEEK 4. Apply a Logistic Regression model and predict whether a person takes insurance or not based on his age for a given dataset.
Program:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
# Suppress warnings for clean output
import warnings
warnings.filterwarnings('ignore')
# Step 1: Load the dataset
dataset_path = "C:/Users/kishore/Desktop/LogisticRegression_dataset.csv" # Path to the dataset
df = pd.read_csv(dataset_path)
# Display the first few rows of the dataset to verify
print("\nFirst few rows of the dataset:")
print(df.head())
# Prepare data for training
X = df[['Age', 'Income', 'Marital_Status']]
y = df['Has_Insurance']
# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Train Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
# Evaluate model accuracy
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
# Predictions for test data
test_data = np.array([
    [25, 20000, 0],  # Age=25, Income=20000, Single (likely No Insurance)
    [40, 35000, 1],  # Age=40, Income=35000, Married (likely Yes Insurance)
    [45, 40000, 1],  # Age=45, Income=40000, Married (likely Yes Insurance)
    [30, 25000, 0],  # Age=30, Income=25000, Single (likely No Insurance)
    [50, 46000, 1],  # Age=50, Income=46000, Married (likely Yes Insurance)
    [35, 29000, 0],  # Age=35, Income=29000, Single (likely No Insurance)
    [33, 28000, 1],  # Age=33, Income=28000, Married (likely No Insurance)
    [53, 45000, 1]   # Age=53, Income=45000, Married (likely Yes Insurance)
])
# Scale the test data
test_data_scaled = scaler.transform(test_data)
# Making predictions for the test data
predictions = log_reg.predict(test_data_scaled)
# Output the results for test data
print("\nPredictions for Test Data:")
for i, (pred, data) in enumerate(zip(predictions, test_data)):
    age, income, marital_status = data
    has_insurance = "Yes" if pred == 1 else "No"
    print(f"Age: {age}, Income: {income}, Marital Status: {'Married' if marital_status == 1 else 'Single'}, "
          f"Prediction (Has Insurance?): {has_insurance}")
WEEK 5. Apply a Decision Tree Classifier model to decide whether to play cricket or not under given conditions.
Program:
# Importing necessary libraries
import pandas as pd # For data manipulation
from sklearn.model_selection import train_test_split # For splitting data into training and testing sets
from sklearn.tree import DecisionTreeClassifier # The Decision Tree Classifier
from sklearn.metrics import accuracy_score, classification_report # For model evaluation
from sklearn.tree import plot_tree #visualize the tree
import matplotlib.pyplot as plt
# Suppress warnings for clean output
import warnings
warnings.filterwarnings('ignore')
# Step 1: Load the dataset
dataset_path = "C:/Users/kishore/Downloads/balanced_cricket_dataset.csv" # Path to the dataset
data = pd.read_csv(dataset_path)
# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())
# Step 2: Feature selection and preprocessing
# Separating features (X) and the target variable (y)
X = data[['Weather', 'Temperature', 'Humidity', 'Windy']] # Features
y = data['PlayCricket'] # Target variable
# Convert categorical data into numerical values (if necessary)
X = pd.get_dummies(X, columns=['Weather', 'Temperature', 'Humidity', 'Windy'], drop_first=True)
# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 4: Initialize and train the Decision Tree Classifier
model = DecisionTreeClassifier(random_state=42) # Creating the model
model.fit(X_train, y_train) # Training the model
# Step 5: Make predictions on the test set
y_pred = model.predict(X_test)
# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy: {:.2f}%".format(accuracy * 100))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Step 7: Display the feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)
# Step 8: Display all possible outputs
# Define all possible conditions based on the dataset's features
conditions = pd.DataFrame({
    'Weather': ['Sunny', 'Sunny', 'Overcast', 'Overcast', 'Rain', 'Rain'],
    'Temperature': ['Hot', 'Mild', 'Hot', 'Cool', 'Mild', 'Cool'],
    'Humidity': ['High', 'Normal', 'High', 'Normal', 'High', 'Normal'],
    'Windy': ['False', 'True', 'False', 'True', 'False', 'True']
})
# Convert conditions to the same feature structure as training data
conditions_encoded = pd.get_dummies(conditions, columns=['Weather', 'Temperature', 'Humidity', 'Windy'],
                                    drop_first=True)
conditions_encoded = conditions_encoded.reindex(columns=X.columns, fill_value=0)
# Predict outcomes for all conditions
predictions = model.predict(conditions_encoded)
# Combine conditions with predictions for display
conditions['PlayCricket'] = predictions
print("\nAll Possible Outputs:")
print(conditions)
# Structure of the Decision Tree
plt.figure(figsize=(20, 12))  # Increase figure size
plot_tree(model,
          feature_names=X.columns,
          class_names=['No', 'Yes'],
          filled=True,
          rounded=True,
          fontsize=10,   # Adjust font size for readability
          max_depth=3)   # Limit depth for clarity
plt.show()
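The same tree can also be printed as plain-text rules, which is sometimes easier to read than the plot. A minimal sketch:
# Print the decision rules of the trained tree as text
from sklearn.tree import export_text
rules = export_text(model, feature_names=list(X.columns))
print("\nDecision Tree Rules:")
print(rules)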
Week 6
6. Train a Random Forest on the scikit-learn digits dataset and check whether the model correctly predicts the handwritten digits.
Program:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")
# Load the dataset
# Ensure the CSV file is in the working directory or provide its full path
data_path = '/content/balanced_digits_dataset.csv' # Update with the correct file path
data = pd.read_csv(data_path)
# Display the first few rows of the dataset
print("\nDataset Preview:\n")
print(data.head())
# Separate features (pixels) and labels (target)
X = data.drop(columns=['label']) # Drop the target column
y = data['label'] # Extract the target column
# Display dataset information
print("\nDataset Information:\n")
print(f"Number of samples: {len(data)}")
print(f"Number of features: {X.shape[1]}")
print(f"Unique labels: {sorted(y.unique())}")
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nData Split Summary:\n")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
# Train a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Predict on the test set
y_pred = rf_model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Evaluation:\n")
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
cm = confusion_matrix(y_test, y_pred)
# Plot the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=sorted(y.unique()), yticklabels=sorted(y.unique()))
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
# Display feature importance
importances = rf_model.feature_importances_
important_features = pd.DataFrame({"Feature": X.columns, "Importance": importances})
important_features = important_features.sort_values(by="Importance", ascending=False)
print("\nTop 10 Most Important Features:\n")
print(important_features.head(10))
# Plot feature importance
plt.figure(figsize=(12, 6))
plt.bar(important_features['Feature'][:10], important_features['Importance'][:10], color='green')
plt.title("Top 10 Feature Importances")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.xticks(rotation=45)
plt.show()
# Include example predictions
print("\nExample Predictions:\n")
example_indices = np.random.choice(X_test.index, 5, replace=False) # Randomly select 5 test samples
example_inputs = X_test.loc[example_indices]
example_true_labels = y_test.loc[example_indices]
example_predictions = rf_model.predict(example_inputs)
for i, index in enumerate(example_indices):
    print(f"Sample {i+1} - True Label: {example_true_labels.loc[index]}, "
          f"Predicted Label: {example_predictions[i]}")
Output:
Week 7
7. Analyze the weather data using a Gaussian Naïve Bayes classifier and predict whether cricket can be played or not based on given new data.
Program:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# Load the dataset from the CSV file
df = pd.read_csv('/content/weather_data.csv')
# Splitting the dataset into features (X) and target (y)
X = df.drop('Can Play Cricket', axis=1) # Features: Temperature, Humidity, Wind Speed, Rainy
y = df['Can Play Cricket'] # Target: Whether cricket can be played or not
# Splitting the dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initializing the Gaussian Naïve Bayes model
model = GaussianNB()
# Training the model with the training data
model.fit(X_train, y_train)
# Making predictions on the test set
y_pred = model.predict(X_test)
# Checking the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy * 100:.2f}%")
# Output the predictions for all possible outcomes (Yes and No)
print("\nPredictions on Test Data:")
for i in range(len(X_test)):
    prediction = "Yes" if y_pred[i] == 1 else "No"
    print(f"Data: {X_test.iloc[i].to_dict()}, Prediction: {prediction}")
# Making predictions on new data
def predict_cricket_playability(new_data):
    """
    Predict whether cricket can be played based on the given weather conditions.
    new_data: a dictionary of weather conditions
              {'Temperature': value, 'Humidity': value, 'Wind Speed': value, 'Rainy': value}
    """
    new_df = pd.DataFrame([new_data])
    prediction = model.predict(new_df)
    return "Yes" if prediction[0] == 1 else "No"
# Example: New data to predict (can be replaced with real values)
new_weather_data = {
    'Temperature': 28,   # in °C
    'Humidity': 65,      # in percentage
    'Wind Speed': 8,     # in km/h
    'Rainy': 0           # 0 = No Rain, 1 = Rain
}
# Predicting whether cricket can be played with the new weather data
result = predict_cricket_playability(new_weather_data)
print(f"\nCan cricket be played with new data? {result}")
Output:
Week 8
8. Prediction of breast cancer as benign or malignant using the KNN Classifier.
Program:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
# Suppress warnings for clean output
warnings.filterwarnings("ignore")
# Load the dataset
# Ensure the 'breast_cancer_knn_dataset.csv' file is in your working directory
dataset = pd.read_csv('/content/breast_cancer_knn_dataset.csv')
# Display the first 5 rows of the dataset to understand its structure
print("Dataset Preview:")
print(dataset.head())
# Basic information about the dataset
print("\nDataset Information:")
dataset.info()
# Check for missing values
print("\nMissing Values in the Dataset:")
print(dataset.isnull().sum())
# Encode the target column ('Diagnosis') into numerical values (Benign=0, Malignant=1)
le = LabelEncoder()
dataset['Diagnosis'] = le.fit_transform(dataset['Diagnosis'])
# Display unique values in the target column
print("\nUnique Values in 'Diagnosis':")
print(dataset['Diagnosis'].unique())
# Define features (X) and target (y)
X = dataset[['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness']]
y = dataset['Diagnosis']
# Standardize the feature values for better KNN performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Train a KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5) # You can experiment with 'n_neighbors'
knn.fit(X_train, y_train)
# Make predictions on the test data
y_pred = knn.predict(X_test)
# Convert numerical predictions back to categorical labels
predictions_words = le.inverse_transform(y_pred)
# Evaluate the model's performance
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))
# Display a few predictions alongside actual values for manual verification
results = pd.DataFrame({
    'Actual': le.inverse_transform(y_test.values),
    'Predicted': predictions_words
})
print("\nSample Predictions:")
print(results.head())
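The choice of n_neighbors affects accuracy, so it is worth trying a few values. A minimal sketch that re-trains the classifier for several odd values of k on the same split:
# Compare test accuracy for several values of k
for k in [3, 5, 7, 9, 11]:
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    print(f"k = {k}: accuracy = {accuracy_score(y_test, knn_k.predict(X_test)):.3f}")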
Output:
Week 9
9. Classify the Iris flowers dataset using SVM and determine the flower type for the given input data.
Program :
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")
# Step 1: Load the dataset
# The dataset should be in the same directory or provide the full path
file_path = "/content/svm_Iris_dataset.csv" # Replace with the correct path if needed
try:
    iris_data = pd.read_csv(file_path)
    print("Dataset loaded successfully!\n")
except FileNotFoundError:
    print("Error: The dataset file was not found. Please check the file path.")
    exit()
# Display the first 5 rows of the dataset for understanding
print("Preview of the dataset:\n")
print(iris_data.head())
# Step 2: Select features and the target variable
# Columns used as features: SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
# Target variable: Species
features = iris_data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
target = iris_data['Species']
# Step 3: Split the dataset into training and testing sets
# 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
print("\nData split into training and testing sets successfully!")
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")
# Step 4: Train the SVM model
# Using the SVC class from sklearn
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
print("\nSVM model trained successfully!")
# Step 5: Make predictions on the test set
y_pred = svm_model.predict(X_test)
# Step 6: Evaluate the model
print("\nEvaluation Metrics:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy:.2f}")
# Step 7: Predict flower type for new input data
# Example input: SepalLengthCm=5.1, SepalWidthCm=3.5, PetalLengthCm=1.4, PetalWidthCm=0.2
sample_data = [[5.1, 3.5, 1.4, 0.2]] # Replace with desired input
predicted_species = svm_model.predict(sample_data)
print(f"\nPredicted flower type for input {sample_data}: {predicted_species[0]}")
# Optional: Displaying all possible outputs for clarity
print("\nPossible classes in the dataset:", svm_model.classes_)
Output:
Week 10
10. Use the K-Means clustering model and classify the employees into various income groups or clusters.
Program:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore") # Suppress unnecessary warnings
# Step 1: Load the dataset
data_path = "/content/balanced_employee_dataset.csv" # Path to the dataset (ensure the file is in the
same directory or update the path)
df = pd.read_csv(data_path)
# Display the first few rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())
# Step 2: Feature selection
# Select features relevant for clustering (excluding non-numeric columns like Job_Role and Cluster)
selected_features = ['Age', 'Experience', 'Education_Level', 'Skills_Rating', 'Income']
data = df[selected_features]
# Step 3: Data preprocessing
# Convert categorical data to numeric using one-hot encoding or label encoding
data_encoded = data.copy()
# Example: Encoding 'Education_Level' as a numeric column
data_encoded['Education_Level'] = data_encoded['Education_Level'].astype('category').cat.codes
# Standardizing the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_encoded)
# Display the scaled data for verification
print("\nEncoded and scaled data (first 5 rows):")
print(pd.DataFrame(data_scaled, columns=selected_features).head())
# Step 4: Determine the optimal number of clusters using the Elbow Method
inertia = []
k_values = range(1, 11)
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data_scaled)
    inertia.append(kmeans.inertia_)
# Plot the Elbow curve
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, marker='o')
plt.title('Elbow Method: Optimal Number of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.grid()
plt.show()
# Step 5: Apply K-Means clustering
# Choosing the number of clusters (e.g., 3, based on the Elbow curve)
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_labels = kmeans.fit_predict(data_scaled)
# Add the cluster labels to the original dataset
df['Cluster'] = cluster_labels
# Display the updated dataset
print("\nDataset with Cluster labels:")
print(df.head())
# Step 6: Evaluate the clustering performance using Silhouette Score
sil_score = silhouette_score(data_scaled, cluster_labels)
print(f"\nSilhouette Score: {sil_score:.2f}")
# Step 7: Visualize the clusters (using first two features for simplicity)
plt.figure(figsize=(8, 6))
colors = ['red', 'blue', 'green']
for i in range(optimal_k):
    plt.scatter(
        data_scaled[cluster_labels == i, 0],  # First feature
        data_scaled[cluster_labels == i, 1],  # Second feature
        label=f'Cluster {i}',
        s=50,
        alpha=0.7,
        color=colors[i]
    )
plt.title('K-Means Clustering Visualization')
plt.xlabel(selected_features[0])
plt.ylabel(selected_features[1])
plt.legend()
plt.grid()
plt.show()
# Display final cluster centers
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
print("\nCluster Centers:")
print(pd.DataFrame(cluster_centers, columns=selected_features))
# Step 8: Display income distribution in each cluster
print("\nIncome distribution in each cluster:")
for cluster_id in range(optimal_k):
    cluster_data = df[df['Cluster'] == cluster_id]
    print(f"\nCluster {cluster_id}:")
    print(cluster_data[['Income']].describe())
# Step 9: Visualize income distributions
# Bar chart for average income per cluster
avg_income_per_cluster = df.groupby('Cluster')['Income'].mean()
plt.figure(figsize=(8, 6))
avg_income_per_cluster.plot(kind='bar', color=['red', 'blue', 'green'], alpha=0.7, edgecolor='black')
plt.title('Average Income per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Average Income')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Save the clustered dataset to a new CSV file
df.to_csv("clustered_dataset.csv", index=False)
print("\nClustered dataset saved to 'clustered_dataset.csv'")
Output:
Week 11
11. Performance analysis of classification algorithms on a specific dataset.
Program:
# Import necessary libraries
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Suppress warnings for clean output
warnings.filterwarnings("ignore")
# Load the dataset
dataset_path = "/content/Performace_Iris_dataset.csv" # Update this path as per your file location
data = pd.read_csv(dataset_path)
# Display the first few rows of the dataset
print("\nDataset Overview:")
print(data.head())
# Check for missing values
print("\nChecking for missing values:")
print(data.isnull().sum())
# Select features and target
X = data.iloc[:, :-1] # Assuming the last column is the target
y = data.iloc[:, -1]
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Scale the features for better model performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Function to train and evaluate classifiers
def evaluate_classifier(model, model_name):
    print(f"\nTraining {model_name}...")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"\n{model_name} Metrics:")
    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt='d', cmap='coolwarm')
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
# List of classifiers to evaluate
classifiers = [
    (RandomForestClassifier(random_state=42), "Random Forest"),
    (SVC(kernel='linear', random_state=42), "Support Vector Machine"),
    (DecisionTreeClassifier(random_state=42), "Decision Tree"),
    (GaussianNB(), "Naive Bayes"),
    (KNeighborsClassifier(), "K-Nearest Neighbors"),
    (LogisticRegression(random_state=42), "Logistic Regression")
]
# Evaluate each classifier
for model, name in classifiers:
    evaluate_classifier(model, name)
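To compare the algorithms side by side, the accuracies can be collected into one table and bar chart. A minimal sketch that re-fits each model on the same split:
# Summary table and bar chart of classifier accuracies
summary = []
for model, name in classifiers:
    model.fit(X_train, y_train)
    summary.append({'Classifier': name, 'Accuracy': accuracy_score(y_test, model.predict(X_test))})
summary_df = pd.DataFrame(summary).sort_values(by='Accuracy', ascending=False)
print("\nAccuracy Comparison:")
print(summary_df)
plt.figure(figsize=(10, 5))
plt.bar(summary_df['Classifier'], summary_df['Accuracy'], color='teal')
plt.title('Classifier Accuracy Comparison')
plt.ylabel('Accuracy')
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()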
Output: