Lab Assignment
Name: Ch. Abhiram
Reg. No.: 23BCE7199
Slot: L41+L42
Faculty: Swanth Boppudi
1. Decision Tree for Weather Dataset:
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Load the weather dataset
filename = "weather.csv" # Ensure this path is correct
df = pd.read_csv(filename)
print(df)
# Remove the 'Day' feature if present
df = df.drop(columns=['Day'], errors='ignore')
# Display the first few rows of the dataset
print(df.head())
# Encode categorical features using LabelEncoder
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':  # Apply encoding only to categorical columns
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le
print(" After fit and transform
") print(df)
# Define features and target
X = df.iloc[:, :-1] # All columns except the last as features
y = df.iloc[:, -1] # Last column as target
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build the decision tree classifier using the entropy criterion
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X_train, y_train)
# Visualize the decision tree
plt.figure(figsize=(10, 6))
plot_tree(
    model,
    feature_names=X.columns,
    class_names=label_encoders[df.columns[-1]].classes_ if df.columns[-1] in label_encoders else None,
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title("Simple ID3 Decision Tree for Weather Dataset")
plt.show()
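Note: the train/test split above reserves test rows that the code never scores. A minimal sketch of evaluating the tree on that held-out data (reusing model, X_test, and y_test from above, with sklearn's accuracy_score) could be:
from sklearn.metrics import accuracy_score
# Score the tree on the held-out 20% split
y_pred = model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))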
Output:
2. Linear Regression:
Code:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Load dataset
data = pd.read_csv('India_Crop_Production (1).csv')
# Display basic info
print(data.head())
print(data.info())
# Handle missing values (example: drop rows with missing values)
data = data.dropna()
data = data[data['Production'] != '=']
# Verify the rows are removed
print(data[data['Production'] == '='])
# Convert the target to numeric (the '=' placeholders made this column text)
data['Production'] = pd.to_numeric(data['Production'])
# Encode categorical features
categorical_cols = ['State_Name', 'District_Name', 'Crop', 'Season']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
# Define features and target variable
X = data[['Area', 'Season', 'Crop', 'Crop_Year']] # Example features
y = data['Production']
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train the model
model = LinearRegression()
model.fit(X_train, y_train)
# Predict on test data
y_pred = model.predict(X_test)
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error:
{mae}") print(f"Mean Squared Error:
{mse}") print(f"R-squared: {r2}")
Output:
3. Logistic Regression:
Code:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# Read the dataset using pandas (replace 'study_hours.csv' with your actual file path)
data = pd.read_csv('study_hours.csv')
print(data)
# Assuming the target column is 'status' and all other columns are features
X = data.drop(columns=['status'])
y = data['status'] # Target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=20)
# Initialize the Logistic Regression model
model = LogisticRegression()
# Train the model
model.fit(X_train, y_train)
# Make predictions on the test data
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# Print results
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)
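Note: besides hard labels, LogisticRegression exposes per-class probabilities. A minimal sketch (reusing model and X_test from above) could be:
# Probabilities for the first three test rows; column order follows model.classes_
print("Classes:", model.classes_)
print(model.predict_proba(X_test[:3]))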
Output:
4. Titanic Dataset:
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
# Load the Titanic dataset
file_path = 'Titanic-Dataset.csv' # Replace with your Titanic dataset file path
data = pd.read_csv(file_path)
# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())
# Drop columns not relevant for the model
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, errors='ignore')
# Fill missing values
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
# Encode categorical features
categorical_cols = ['Sex', 'Embarked']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
# Define features and target variable
X = data.drop(['Survived'], axis=1)
y = data['Survived']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Display results
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
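Note: a random forest also records how much each feature contributed to its splits. A short sketch ranking them (reusing model, X, and pd from above) could be:
# Rank features by impurity-based importance
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))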
Output:
5. Clustering:
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
# Load dataset from CSV file
df = pd.read_csv('clustering.csv') # Ensure the file exists
# Selecting relevant features
marks = df[['Subject1', 'Subject2']].values
# Standardizing the data
scaler = StandardScaler()
marks_scaled = scaler.fit_transform(marks)
# Applying K-Means Clustering
k = 2 # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(marks_scaled)
# Get centroids
centroids = kmeans.cluster_centers_
# Assign cluster names based on performance
cluster_names = {0: 'High Performers', 1: 'Low Performers'} # Modify as needed
df['Cluster Name'] = df['Cluster'].map(cluster_names)
# Save clustered data to CSV
df.to_csv('student_marks_clustered.csv', index=False)
# Performance Metrics
inertia = kmeans.inertia_ # SSE
silhouette_avg = silhouette_score(marks_scaled, df['Cluster'])
db_index = davies_bouldin_score(marks_scaled, df['Cluster'])
print(f"Inertia (SSE): {inertia:.2f}")
print(f"Silhouette Score: {silhouette_avg:.2f}")
print(f"Davies-Bouldin Index: {db_index:.2f}")
# Display cluster-wise information
print("\nCluster Information:")
print(df.groupby('Cluster Name')[['Subject1', 'Subject2']].mean())
# Plot the clusters
plt.figure(figsize=(8, 6))
plt.scatter(marks_scaled[:, 0], marks_scaled[:, 1], c=df['Cluster'], cmap='viridis', marker='o',
edgecolors='k', label='Students')
plt.scatter(centroids[:, 0], centroids[:, 1], s=200, c='red', marker='X', label='Centroids')
plt.xlabel('Subject 1 (Scaled)')
plt.ylabel('Subject 2 (Scaled)')
plt.title('K-Means Clustering of Student Marks')
plt.legend()
plt.show()
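Note: k = 2 is fixed by hand above. A common sanity check is the elbow method, sketched here on the same scaled marks over a hypothetical range of cluster counts:
# Elbow method: inertia (SSE) for k = 1..6 on the scaled marks
sse = []
for k in range(1, 7):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(marks_scaled)
    sse.append(km.inertia_)
plt.plot(range(1, 7), sse, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (SSE)')
plt.title('Elbow Method for Choosing k')
plt.show()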
Output: