import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_classif
# Submission details shown at the top of the lab output.
student_name = "Maitha Al Shamsi"
std_id = "202200129"
deadline = "11/Sept/25"

# Print the banner in a single call; every entry lands on its own line,
# followed by a 60-character dashed rule.
print(
    "Lab Task_01: Exploring the Iris Dataset",
    f"Student Name: {student_name}",
    f"STD ID: {std_id}",
    f"Deadline: {deadline}",
    "-" * 60,
    sep="\n",
)
# Load the Iris data from scikit-learn and wrap the measurements in a
# DataFrame whose columns are the dataset's feature names.
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    # Translate the integer targets into their species names (categorical).
    species=pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
)

print("First 10 rows of the Iris dataset:")
# NOTE(review): `display` is not imported in this file; presumably it is
# supplied by the notebook environment -- confirm before running as a script.
display(df.head(10))
# Central-tendency summary (mean / median / mode) for every numeric feature.
features = iris.feature_names
means, medians = df[features].mean(), df[features].median()
# mode() can return several rows when values tie; keep only the first row.
modes = df[features].mode().iloc[0]

# Place the three Series side by side, one labelled column per statistic.
stats_df = pd.concat([means, medians, modes], axis=1, keys=['mean', 'median', 'mode'])
display(stats_df)
# Column names of the two petal measurements, reused by later plots.
petal_length_col = 'petal length (cm)'
petal_width_col = 'petal width (cm)'

# Compute both extremes for both petal columns in one aggregation; the result
# is indexed by statistic name ('min' / 'max') with one column per feature.
petal_extremes = df[[petal_length_col, petal_width_col]].agg(['min', 'max'])

print("\nPetal length/width min and max:")
print("Petal length min:", petal_extremes.loc['min', petal_length_col])
print("Petal length max:", petal_extremes.loc['max', petal_length_col])
print("Petal width min:", petal_extremes.loc['min', petal_width_col])
print("Petal width max:", petal_extremes.loc['max', petal_width_col])
# Draw one 10-bin histogram per feature, each on its own figure.
# The loop body must be indented under the `for`; without the indentation the
# script does not parse.
for feature in features:
    plt.figure(figsize=(6, 4))
    plt.hist(df[feature], bins=10)
    plt.title(f'Histogram of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    # Show inside the loop so each feature is rendered as a separate plot.
    plt.show()
# Scatter plot of petal length against petal width, coloured by species.
plt.figure(figsize=(6, 5))
species_codes = df['species'].cat.codes
points = plt.scatter(df[petal_length_col], df[petal_width_col], c=species_codes)
plt.title('Petal length vs Petal width (colored by species)')
plt.xlabel(petal_length_col)
plt.ylabel(petal_width_col)

# Build the legend from the plotted collection itself. Empty placeholder
# scatters would take their colours from the default colour cycle rather than
# from the colormap applied through `c=`, so the legend swatches would not
# match the points. legend_elements() returns one handle per distinct code, in
# ascending code order, which is also the order of the category names.
legend_handles, _ = points.legend_elements()
plt.legend(legend_handles, list(df['species'].cat.categories))
plt.show()
# Box plot comparing the sepal-length distribution of each species.
plt.figure(figsize=(6, 5))

# Collect the group names and the per-group values from the same iteration so
# that every box is guaranteed to carry the label of the group it was built
# from. `observed` is passed explicitly because grouping on a categorical
# column without it relies on a deprecated pandas default.
species_names = []
grouped = []
for name, group in df.groupby('species', observed=False):
    species_names.append(name)
    grouped.append(group['sepal length (cm)'].values)

plt.boxplot(grouped)
# Boxes are drawn at positions 1..n by default; label those ticks directly
# instead of using boxplot's `labels=` keyword, which newer Matplotlib
# releases deprecate.
plt.xticks(range(1, len(grouped) + 1), species_names)
plt.title('Sepal length distribution across species')
plt.xlabel('Species')
plt.ylabel('Sepal length (cm)')
plt.show()
# Score each feature with f_classif against the species codes: F holds the
# per-feature F statistics and p the matching p-values.
F, p = f_classif(df[features], df['species'].cat.codes)

# Tabulate the scores, then order the rows from highest to lowest F value.
separability = pd.DataFrame({'feature': features, 'F_value': F, 'p_value': p})
separability = separability.sort_values(by='F_value', ascending=False)
display(separability)

# After sorting, the first row holds the feature with the largest F value.
print("Best separating feature:", separability['feature'].iloc[0])
# Average petal length per species. `observed=False` is spelled out because
# 'species' is categorical: leaving it implicit triggers a pandas
# FutureWarning and would change behaviour when the default flips.
mean_petal_by_species = df.groupby('species', observed=False)[petal_length_col].mean()
display(mean_petal_by_species)
# idxmax() returns the index label (the species name) of the largest mean.
print("Species with largest average petal length:", mean_petal_by_species.idxmax())
# Feature matrix and integer species codes as plain NumPy arrays.
X = df[features].to_numpy()
y = df['species'].cat.codes.to_numpy()

# Hold out 30% of the rows for testing; stratify on y so that the class
# proportions are kept in both parts, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
# Logistic regression baseline: fit on the training split (fit() returns the
# estimator itself), then score predictions on the held-out split.
lr = LogisticRegression(max_iter=200).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("\nLogistic Regression accuracy:", lr_accuracy)
# Random forest with 100 trees and a fixed seed, trained and scored on the
# same train/test split as the logistic regression model.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest accuracy:", rf_accuracy)
# 5-fold cross-validation of a fresh logistic regression model on the full
# dataset; the mean of the per-fold scores is reported.
cv_estimator = LogisticRegression(max_iter=200)
cv_scores = cross_val_score(cv_estimator, X, y, cv=5)
print("5-fold CV (Logistic Regression) mean accuracy:", cv_scores.mean())