Thanks for visiting codestin.com
Credit goes to www.scribd.com

0% found this document useful (0 votes)
12 views · 3 pages

Code

The document details a lab task exploring the Iris dataset, including data loading, statistical analysis, and visualization. It implements Logistic Regression and Random Forest classifiers to predict species, reporting their accuracy and cross-validation results. Key findings include the best separating feature and the species with the largest average petal length.

Uploaded by

wetechhub1
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
12 views · 3 pages

Code

The document details a lab task exploring the Iris dataset, including data loading, statistical analysis, and visualization. It implements Logistic Regression and Random Forest classifiers to predict species, reporting their accuracy and cross-validation results. Key findings include the best separating feature and the species with the largest average petal length.

Uploaded by

wetechhub1
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 3

import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_classif

# Submission metadata shown in the report banner.
student_name = "Maitha Al Shamsi"
std_id = "202200129"
deadline = "11/Sept/25"

# Emit the banner in one call: title, one line per metadata field, then a
# 60-character dashed rule. sep="\n" puts each argument on its own line.
print(
    "Lab Task_01: Exploring the Iris Dataset",
    f"Student Name: {student_name}",
    f"STD ID: {std_id}",
    f"Deadline: {deadline}",
    "-" * 60,
    sep="\n",
)

# `display` is only injected by IPython/Jupyter. When this file runs as a
# plain Python script the name is undefined, so fall back to `print` to keep
# the script runnable in both environments.
try:
    display
except NameError:
    display = print

# Load the Iris data and build a DataFrame with one column per feature.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Attach the label as a categorical column: the integer targets are the
# codes and iris.target_names are the categories.
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

print("First 10 rows of the Iris dataset:")
display(df.head(10))

# Per-feature central tendency: mean, median and mode side by side.
features = iris.feature_names
means, medians = df[features].mean(), df[features].median()
# mode() can return several rows when values tie; keep only the first row.
modes = df[features].mode().iloc[0]
stats_df = pd.DataFrame(dict(mean=means, median=medians, mode=modes))
display(stats_df)

# Column names of the two petal measurements, reused by later plots.
petal_length_col, petal_width_col = 'petal length (cm)', 'petal width (cm)'

print("\nPetal length/width min and max:")
# Pair each caption with its value, then print them in the listed order.
petal_extremes = (
    ("Petal length min:", df[petal_length_col].min()),
    ("Petal length max:", df[petal_length_col].max()),
    ("Petal width min:", df[petal_width_col].min()),
    ("Petal width max:", df[petal_width_col].max()),
)
for caption, extreme in petal_extremes:
    print(caption, extreme)

# Draw one 10-bin histogram per feature, each in its own figure.
# The loop body must be indented: every call below uses `feature`, so all of
# them belong inside the loop.
for feature in features:
    plt.figure(figsize=(6, 4))
    plt.hist(df[feature], bins=10)
    plt.title(f'Histogram of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

# Scatter of petal length vs. petal width, one colour per species.
plt.figure(figsize=(6, 5))
species_codes = df['species'].cat.codes
# Plot each species with its own scatter call so the legend entry is created
# from the same artist as the points and their colours always match.
# Code i corresponds to iris.target_names[i] because the categorical column
# was built from iris.target with iris.target_names as categories.
for code, name in enumerate(iris.target_names):
    mask = species_codes == code
    plt.scatter(
        df.loc[mask, petal_length_col],
        df.loc[mask, petal_width_col],
        label=name,
    )
plt.title('Petal length vs Petal width (colored by species)')
plt.xlabel(petal_length_col)
plt.ylabel(petal_width_col)
plt.legend()
plt.show()

# Box plot of sepal length, one box per species.
plt.figure(figsize=(6, 5))
# observed=False is the behaviour the implicit default used to provide;
# passing it explicitly avoids the pandas FutureWarning for categorical keys.
species_names = []
grouped = []
for name, group in df.groupby('species', observed=False):
    # Collect the label together with its data so ticks and boxes stay in sync.
    species_names.append(name)
    grouped.append(group['sepal length (cm)'].values)
plt.boxplot(grouped)
# Boxes are drawn at positions 1..N by default. Labelling via xticks avoids
# the `labels=` keyword, which Matplotlib 3.9 deprecated in favour of
# `tick_labels=`, and works on both older and newer releases.
plt.xticks(range(1, len(species_names) + 1), species_names)
plt.title('Sepal length distribution across species')
plt.xlabel('Species')
plt.ylabel('Sepal length (cm)')
plt.show()

# Score each feature against the species codes with f_classif, which returns
# one F statistic and one p-value per feature.
F, p = f_classif(df[features], df['species'].cat.codes)
separability = pd.DataFrame({'feature': features, 'F_value': F, 'p_value': p})
# Sort so the feature with the largest F value comes first.
separability = separability.sort_values(by='F_value', ascending=False)
display(separability)
print("Best separating feature:", separability['feature'].iloc[0])

# Average petal length per species.
# 'species' is categorical, so pass `observed` explicitly: observed=False
# keeps the behaviour of the old implicit default and avoids the pandas
# FutureWarning about that default changing.
mean_petal_by_species = (
    df.groupby('species', observed=False)[petal_length_col].mean()
)
display(mean_petal_by_species)
# idxmax returns the index label (the species) holding the largest mean.
print("Species with largest average petal length:", mean_petal_by_species.idxmax())

# Feature matrix and integer-coded species labels as NumPy arrays.
X, y = df[features].values, df['species'].cat.codes.values

# Hold out 30% of the rows for testing. The split is stratified on y and
# seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Logistic regression: fit on the training split, score on the held-out split.
lr = LogisticRegression(max_iter=200).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("\nLogistic Regression accuracy:", accuracy_score(y_test, y_pred_lr))

# Random forest with 100 trees and a fixed seed, evaluated the same way.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest accuracy:", accuracy_score(y_test, y_pred_rf))

# 5-fold cross-validation of a fresh logistic-regression model on the full
# dataset; report the mean of the five fold scores.
cv_scores = cross_val_score(
    estimator=LogisticRegression(max_iter=200),
    X=X,
    y=y,
    cv=5,
)
print("5-fold CV (Logistic Regression) mean accuracy:", cv_scores.mean())

You might also like