import pandas as pd
# Load the student-performance dataset into a DataFrame. The path is relative,
# so the CSV has to be in the notebook's working directory.
df = pd.read_csv("StudentsPerformance.csv")
# Preview the first five rows (the head() default) to check how the columns parsed.
df.head()
gender race/ethnicity parental level of education lunch \
0 female group B bachelor's degree standard
1 female group C some college standard
2 female group B master's degree standard
3 male group A associate's degree free/reduced
4 male group C some college standard
test preparation course math score reading score writing score
0 none 72 72 74
1 completed 69 90 88
2 none 90 95 93
3 none 47 57 44
4 none 76 78 75
# Show the last ten rows of the frame.
df.tail(10)
gender race/ethnicity parental level of education
lunch \
990 male group E high school free/reduced
991 female group B some high school standard
992 female group D associate's degree free/reduced
993 female group D bachelor's degree free/reduced
994 male group A high school standard
995 female group E master's degree standard
996 male group C high school free/reduced
997 female group C high school free/reduced
998 female group D some college standard
999 female group D some college free/reduced
test preparation course math score reading score writing score
990 completed 86 81 75
991 completed 65 82 78
992 none 55 76 76
993 none 62 72 74
994 none 63 63 62
995 completed 88 99 95
996 none 62 55 55
997 completed 59 71 65
998 completed 68 78 77
999 none 77 86 86
# Dimensions of the frame as a (rows, columns) tuple.
df.shape
(1000, 8)
# Storage type of each column; useful for telling numeric from text columns.
df.dtypes
gender object
race/ethnicity object
parental level of education object
lunch object
test preparation course object
math score int64
reading score int64
writing score int64
dtype: object
# Summary statistics for one numeric column: count, mean, std, min, quartiles, max.
df['math score'].describe()
count 1000.00000
mean 66.08900
std 15.16308
min 0.00000
25% 57.00000
50% 66.00000
75% 77.00000
max 100.00000
Name: math score, dtype: float64
# On a text column describe() reports count, number of distinct values,
# the most frequent value and how often it occurs.
df['lunch'].describe()
count 1000
unique 2
top standard
freq 645
Name: lunch, dtype: object
# Frame-wide describe() summarizes only the numeric columns by default.
df.describe()
math score reading score writing score
count 1000.00000 1000.000000 1000.000000
mean 66.08900 69.169000 68.054000
std 15.16308 14.600192 15.195657
min 0.00000 17.000000 10.000000
25% 57.00000 59.000000 57.750000
50% 66.00000 70.000000 69.000000
75% 77.00000 79.000000 79.000000
max 100.00000 100.000000 100.000000
# Select a single row by zero-based integer position; the result is a Series.
df.iloc[100]
gender male
race/ethnicity group B
parental level of education some college
lunch standard
test preparation course none
math score 79
reading score 67
writing score 67
Name: 100, dtype: object
# Label-based selection: every row (:) of the "lunch" column.
df.loc[:,"lunch"]
0 standard
1 standard
2 standard
3 free/reduced
4 standard
...
995 standard
996 free/reduced
997 free/reduced
998 standard
999 free/reduced
Name: lunch, Length: 1000, dtype: object
# Sort by math score, highest first, and show the first five rows of the result.
# sort_values returns a new frame; df itself is not reordered.
df.sort_values(by = "math score", ascending=False).head()
gender race/ethnicity parental level of education
lunch \
962 female group E associate's degree standard
458 female group E bachelor's degree standard
149 male group E associate's degree free/reduced
625 male group D some college standard
916 male group E bachelor's degree standard
test preparation course math score reading score writing score
962 none 100 100 100
458 none 100 100 100
149 completed 100 100 93
625 completed 100 97 99
916 completed 100 100 100
# Bracket indexing selects the "lunch" column as a Series; show its first ten values.
df["lunch"].head(10)
0 standard
1 standard
2 standard
3 free/reduced
4 standard
5 standard
6 standard
7 free/reduced
8 free/reduced
9 free/reduced
Name: lunch, dtype: object
# Boolean-mask filter: keep only the rows whose math score is exactly 99.
df[df["math score"]==99]
gender race/ethnicity parental level of education lunch \
114 female group E bachelor's degree standard
263 female group E high school standard
306 male group E some college standard
test preparation course math score reading score writing score
114 completed 99 100 100
263 none 99 93 90
306 completed 99 87 81
# Interquartile range of the math scores: the 75th percentile minus the
# 25th, i.e. the width of the middle half of the distribution.
Q1 = df['math score'].quantile(0.25)
Q3 = df['math score'].quantile(0.75)
IQR = Q3-Q1
print("The interquartile range is: ", IQR)
The interquartile range is: 20.0
# Element-wise missing-value mask: True wherever a cell is missing.
df.isnull()
gender race/ethnicity parental level of education lunch \
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False False False False
.. ... ... ... ...
995 False False False False
996 False False False False
997 False False False False
998 False False False False
999 False False False False
test preparation course math score reading score writing score
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False False False False
.. ... ... ... ...
995 False False False False
996 False False False False
997 False False False False
998 False False False False
999 False False False False
[1000 rows x 8 columns]
# Summing the mask column by column counts the missing values in each column.
df.isnull().sum()
gender 0
race/ethnicity 0
parental level of education 0
test preparation course 0
math score 0
reading score 0
writing score 0
dtype: int64
# Number of non-missing entries in each column.
df.count()
gender 1000
race/ethnicity 1000
parental level of education 1000
test preparation course 1000
math score 1000
reading score 1000
writing score 1000
dtype: int64
# Remove every row that contains at least one missing value. inplace=True
# mutates df directly and returns None, so the result is not assigned.
df.dropna(inplace=True)
# Students with high math scores: strictly above 90, so a score of exactly 90
# is not included.
df[df["math score"] > 90]
gender race/ethnicity parental level of education \
34 male group E some college
104 male group C some college
114 female group E bachelor's degree
121 male group B associate's degree
149 male group E associate's degree
165 female group C bachelor's degree
171 male group E some high school
179 female group D some high school
233 male group E some high school
263 female group E high school
286 male group E associate's degree
306 male group E some college
451 female group E some college
458 female group E bachelor's degree
469 male group C some college
501 female group B associate's degree
503 female group E associate's degree
521 female group C associate's degree
539 male group A associate's degree
546 female group A some high school
562 male group C bachelor's degree
566 female group E bachelor's degree
571 male group A bachelor's degree
594 female group C bachelor's degree
612 male group C bachelor's degree
618 male group D master's degree
623 male group A some college
625 male group D some college
685 female group E master's degree
689 male group E some college
710 male group C some college
712 female group D some college
717 female group C associate's degree
719 male group E associate's degree
736 male group C associate's degree
779 male group E associate's degree
784 male group C bachelor's degree
815 male group B some high school
846 male group C master's degree
855 female group B bachelor's degree
864 male group C associate's degree
886 female group E associate's degree
903 female group D bachelor's degree
916 male group E bachelor's degree
919 male group B some college
934 male group C associate's degree
950 male group E high school
957 female group D master's degree
962 female group E associate's degree
979 female group C associate's degree
test preparation course math score reading score writing score
34 none 97 87 82
104 completed 98 86 90
114 completed 99 100 100
121 completed 91 89 92
149 completed 100 100 93
165 completed 96 100 100
171 none 94 88 78
179 completed 97 100 100
233 none 92 87 78
263 none 99 93 90
286 completed 97 82 88
306 completed 99 87 81
451 none 100 92 97
458 none 100 100 100
469 none 91 74 76
501 completed 94 87 92
503 completed 95 89 92
521 none 91 86 84
539 completed 97 92 86
546 completed 92 100 97
562 completed 96 90 92
566 completed 92 100 100
571 none 91 96 92
594 completed 92 100 99
612 completed 94 90 91
618 none 95 81 84
623 completed 100 96 86
625 completed 100 97 99
685 completed 94 99 100
689 none 93 90 83
710 completed 93 84 90
712 none 98 100 99
717 completed 96 96 99
719 completed 91 73 80
736 none 92 79 84
779 completed 94 85 82
784 completed 91 81 79
815 completed 94 86 87
846 completed 91 85 85
855 none 97 97 96
864 none 97 93 91
886 completed 93 100 95
903 completed 93 100 100
916 completed 100 100 100
919 completed 91 96 91
934 completed 98 87 90
950 none 94 73 71
957 none 92 100 100
962 none 100 100 100
979 none 91 95 94
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the plotting and modelling libraries used below.
warnings.filterwarnings('ignore')
# Default figure size (width, height in inches) for figures created without
# an explicit figsize.
plt.rcParams["figure.figsize"] = [10, 5]
# Load the dataset from disk again and rebind df to the fresh frame.
df = pd.read_csv("StudentsPerformance.csv")
df.head()
gender race/ethnicity parental level of education lunch \
0 female group B bachelor's degree standard
1 female group C some college standard
2 female group B master's degree standard
3 male group A associate's degree free/reduced
4 male group C some college standard
test preparation course math score reading score writing score
0 none 72 72 74
1 completed 69 90 88
2 none 90 95 93
3 none 47 57 44
4 none 76 78 75
# Preview two seaborn palettes as swatch strips, one figure per palette.
for palette_name, heading in (("colorblind", "Colorblind"), ("Reds", "Reds")):
    swatches = sns.color_palette(palette_name)
    sns.palplot(swatches)
    plt.title("Color Palette: " + heading)
    plt.show()
# Histogram of math scores (counts per bin, no density curve).
sns.histplot(df['math score'], kde=False)
plt.title("Distribution of Math Scores")
plt.show()

# Kernel density estimate of reading scores. kdeplot is the replacement for
# the deprecated distplot(..., hist=False).
sns.kdeplot(df['reading score'])
plt.title("KDE of Reading Scores")
plt.show()

# Histogram with an overlaid density curve for writing scores. distplot is
# deprecated; stat="density" and cut=3 reproduce its density-normalized bars
# and its KDE extent.
plt.figure(figsize=(8, 8))
sns.histplot(df['writing score'], kde=True, stat="density",
             kde_kws={"cut": 3})
plt.title("Distribution of Writing Scores")
plt.show()
# Scatter of math against writing scores, one colour per gender.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=df, x="math score", y="writing score", hue="gender",
                ax=ax)
ax.set_title("Math vs Writing Scores by Gender")
plt.show()

# Bar plot of writing score per gender, split by lunch type. Bar heights use
# barplot's default estimator (the mean).
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=df, x="gender", y="writing score", hue="lunch", ax=ax)
ax.set_title("Average Writing Score by Gender & Lunch")
plt.show()
# Figure-level scatter of math against reading scores; gender drives both
# the colour and the marker style.
grid = sns.relplot(
    data=df,
    x="math score",
    y="reading score",
    hue="gender",
    style="gender",
    kind="scatter",
)
# No faceting variables are used, so the grid holds a single Axes.
grid.ax.set_title("Math vs Reading by Gender")
plt.show()
# Line plot of writing score against reading score, one line per gender.
fig, ax = plt.subplots(figsize=(7, 7))
sns.lineplot(data=df, x="reading score", y="writing score", hue="gender",
             ax=ax)
ax.set_title("Reading vs Writing by Gender (Lineplot)")
plt.show()

# Bar plot of math score per test-preparation status, split by gender.
fig, ax = plt.subplots(figsize=(7, 7))
sns.barplot(data=df, x="test preparation course", y="math score",
            hue="gender", ax=ax)
ax.set_title("Math Score by Test Prep and Gender")
plt.show()
# Box plot of reading score for each parental education level. The category
# labels are long, so the x tick labels are rotated.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x="parental level of education", y="reading score",
            ax=ax)
ax.set_title("Reading Score by Parental Education")
plt.xticks(rotation=45)
plt.show()

# Violin plot of writing score for each lunch type.
fig, ax = plt.subplots(figsize=(6, 6))
sns.violinplot(data=df, x="lunch", y="writing score", ax=ax)
ax.set_title("Writing Score by Lunch Type")
plt.show()

# Box plot of writing score for each gender, drawn at the default figure size.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="gender", y="writing score", ax=ax)
ax.set_title("Writing Score by Gender")
plt.show()

# Box plot of math score for each race/ethnicity group.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x="race/ethnicity", y="math score", ax=ax)
ax.set_title("Math Score by Race/Ethnicity Group")
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# Rebind df to a copy so the in-place column replacement below operates on
# its own object rather than on the frame loaded earlier.
df = df.copy()

# Text columns that must become numeric before they can be fed to the model.
cat_cols = ['gender', 'race/ethnicity', 'parental level of education',
            'lunch', 'test preparation course']

# Replace each text column with integer codes. A fresh encoder is fitted per
# column; codes are assigned in sorted order of that column's distinct values.
# NOTE(review): LabelEncoder is documented as a target encoder, and integer
# codes impose an ordering on nominal features such as race/ethnicity --
# consider one-hot encoding if that ordering should not influence the model.
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
# Binary target: 1 when the math score is at least 50, otherwise 0.
passed = df['math score'].ge(50)
df['pass_math'] = passed.astype(int)

# The target is the new column; the features are every other column except
# the math score it was derived from.
y = df['pass_math']
X = df.drop(columns=['math score', 'pass_math'])
# Train-test split: hold out 20% of the rows for evaluation. random_state=0
# fixes the shuffle so the same split is produced on every run.
# NOTE(review): no stratify argument is passed, so the pass/fail proportions
# in the two parts are left to chance.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=0)
# Train a logistic-regression classifier with the estimator's default
# hyperparameters on the training part only.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
# Evaluate on the held-out rows only.
from sklearn.metrics import accuracy_score, confusion_matrix
y_pred = model.predict(X_test)
# Accuracy is the fraction of test rows whose predicted label matches the
# true label; it is printed as a percentage with two decimals.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc * 100:.2f}%")
# Confusion matrix layout: rows are true classes, columns are predicted
# classes, both in sorted label order (0 = did not pass, 1 = passed).
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
Accuracy: 93.50%
Confusion Matrix:
[[ 20 7]
[ 6 167]]