NATIONAL UNIVERSITY OF SCIENCES AND TECHNOLOGY
APPLICATION OF ICT
LAB ASSIGNMENT #11
MACHINE LEARNING PART 3
SUBMITTED TO: MR. MUHAMMAD ADNAN
DATE OF SUBMISSION: 18 December 2024
NAME                       CLASS    CMS ID
Muhammad Shaheer Ali Khan  BEE-16D  509801
LAB TASKS:
1. Load the student scores dataset and inspect its structure.
2. Train a simple linear regression model to predict scores and visualize the regression line.
3. Use the California Housing dataset to train a multiple regression model and evaluate its
performance.
4. Train a polynomial regression model on student scores and compare it with the linear
model.
5. Explore the impact of varying test splits (20%, 30%, 40%) on model performance (the evaluation metrics used throughout are sketched right after this list).
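Tasks 2 to 5 score every model with mean squared error (MSE) and the coefficient of determination (R-squared), the same quantities returned by scikit-learn's mean_squared_error and r2_score. As a quick reference, the minimal sketch below computes both by hand with NumPy; the arrays y_true and y_pred hold placeholder values chosen only for illustration, not values from the lab datasets.

import numpy as np

# Placeholder values purely for illustration (not from the lab data)
y_true = np.array([3.0, 5.0, 7.0, 9.0])
y_pred = np.array([2.5, 5.5, 6.5, 9.5])

# Mean squared error: average of the squared residuals
mse = np.mean((y_true - y_pred) ** 2)

# R-squared: 1 - (residual sum of squares / total sum of squares)
ss_res = np.sum((y_true - y_pred) ** 2)
ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
r2 = 1 - ss_res / ss_tot

print(f"MSE: {mse:.4f}")  # should match sklearn.metrics.mean_squared_error
print(f"R2:  {r2:.4f}")   # should match sklearn.metrics.r2_score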
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.datasets import fetch_california_housing
def load_and_inspect_student_data(url="https://bit.ly/3bD4VXe"):
    """Task 1: Load and inspect student scores dataset"""
    print("\n=== Task 1: Loading and Inspecting Student Data ===")
    stud_scores = pd.read_csv(url)
    print("Dataset Head:")
    print(stud_scores.head())
    print("\nDataset Info:")
    stud_scores.info()  # info() prints its summary directly and returns None
    return stud_scores
def simple_linear_regression(data):
    """Task 2: Simple Linear Regression"""
    print("\n=== Task 2: Simple Linear Regression ===")
    X = data['Hours'].to_numpy().reshape(-1, 1)
    y = data['Scores'].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Train model
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    # Make predictions
    y_pred = regressor.predict(X_test)
    # Evaluate model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Slope: {regressor.coef_[0]:.4f}")
    print(f"Intercept: {regressor.intercept_:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared Score: {r2:.4f}")
    # Visualize results
    plt.figure(figsize=(10, 6))
    plt.scatter(X_test, y_test, color='blue', label='Actual')
    plt.plot(X_test, y_pred, color='red', label='Regression Line')
    plt.xlabel('Hours Studied')
    plt.ylabel('Scores')
    plt.title('Simple Linear Regression: Hours vs Scores')
    plt.legend()
    plt.show()
    return regressor, mse, r2
def multiple_regression():
    """Task 3: Multiple Regression with California Housing Dataset"""
    print("\n=== Task 3: Multiple Regression ===")
    # Load California Housing dataset
    housing = fetch_california_housing()
    X, y = housing.data, housing.target
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Train model
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    # Make predictions and evaluate
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("Feature Names:", housing.feature_names)
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared Score: {r2:.4f}")
    # Print the learned coefficient for each feature
    for name, coef in zip(housing.feature_names, regressor.coef_):
        print(f"{name}: {coef:.4f}")
    return regressor, mse, r2
def polynomial_regression(data):
    """Task 4: Polynomial Regression"""
    print("\n=== Task 4: Polynomial Regression ===")
    X = data['Hours'].to_numpy().reshape(-1, 1)
    y = data['Scores'].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Create and train polynomial model
    poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
    poly_model.fit(X_train, y_train)
    # Make predictions
    y_poly_pred = poly_model.predict(X_test)
    # Evaluate model
    mse_poly = mean_squared_error(y_test, y_poly_pred)
    r2_poly = r2_score(y_test, y_poly_pred)
    print(f"Polynomial Mean Squared Error: {mse_poly:.4f}")
    print(f"Polynomial R-squared Score: {r2_poly:.4f}")
    # Visualize results
    plt.figure(figsize=(10, 6))
    plt.scatter(X_test, y_test, color='blue', label='Actual')
    # Sort X_test for smooth curve plotting
    X_test_sorted = np.sort(X_test, axis=0)
    y_poly_pred_sorted = poly_model.predict(X_test_sorted)
    plt.plot(X_test_sorted, y_poly_pred_sorted, color='red', label='Polynomial Regression')
    plt.xlabel('Hours Studied')
    plt.ylabel('Scores')
    plt.title('Polynomial Regression: Hours vs Scores')
    plt.legend()
    plt.show()
    return poly_model, mse_poly, r2_poly
def test_split_impact(data):
    """Task 5: Impact of Different Test Split Sizes"""
    print("\n=== Task 5: Impact of Different Test Split Sizes ===")
    X = data['Hours'].to_numpy().reshape(-1, 1)
    y = data['Scores'].to_numpy()
    test_sizes = [0.2, 0.3, 0.4]
    results = []
    for test_size in test_sizes:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results.append({
            'test_size': test_size,
            'mse': mse,
            'r2': r2
        })
        print(f"\nTest Size: {test_size*100}%")
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"R-squared Score: {r2:.4f}")
    return results
def main():
    # Execute all tasks
    data = load_and_inspect_student_data()
    simple_linear_regression(data)
    multiple_regression()
    polynomial_regression(data)
    test_split_impact(data)

if __name__ == "__main__":
    main()
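Note: test_split_impact() returns its per-split results as a list of dictionaries, which main() currently discards. A minimal sketch of how those results could be tabulated after the run, assuming the script above has already been executed in the same session (the variable names split_results and summary are introduced here only for illustration):

# Sketch: capture and tabulate the Task 5 results with pandas
data = load_and_inspect_student_data()
split_results = test_split_impact(data)
summary = pd.DataFrame(split_results)  # columns: test_size, mse, r2
summary['test_size'] = summary['test_size'] * 100  # express split as a percentage
print(summary.to_string(index=False))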
OUTPUT:
Attached in zip