import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the student performance dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="StudentsPerformance.csv")
# Preview the first five rows to sanity-check the load.
df.head(n=5)
gender race/ethnicity parental level of education lunch \
0 female group B bachelor's degree standard
1 female group C some college standard
2 female group B master's degree standard
3 male group A associate's degree free/reduced
4 male group C some college standard
test preparation course math score reading score writing score
0 none 72 72 74
1 completed 69 90 88
2 none 90 95 93
3 none 47 57 44
4 none 76 78 75
# Basic info about the dataset: its (rows, columns) dimensions.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
(1000, 8)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None".
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 1000 non-null object
1 race/ethnicity 1000 non-null object
2 parental level of education 1000 non-null object
3 lunch 1000 non-null object
4 test preparation course 1000 non-null object
5 math score 1000 non-null int64
6 reading score 1000 non-null int64
7 writing score 1000 non-null int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = df.describe()
print(summary_stats)
math score reading score writing score
count 1000.00000 1000.000000 1000.000000
mean 66.08900 69.169000 68.054000
std 15.16308 14.600192 15.195657
min 0.00000 17.000000 10.000000
25% 57.00000 59.000000 57.750000
50% 66.00000 70.000000 69.000000
75% 77.00000 79.000000 79.000000
max 100.00000 100.000000 100.000000
# Count the missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)
gender 0
race/ethnicity 0
parental level of education 0
lunch 0
test preparation course 0
math score 0
reading score 0
writing score 0
dtype: int64
# Distinct values of a single column. 'column_name' was a template
# placeholder that is not a column of this dataset (it would raise KeyError),
# so an actual column is inspected instead.
df['gender'].unique()
----------------------------------------------------------------------
-----
NameError Traceback (most recent call
last)
Cell In[1], line 1
----> 1 df['column_name'].unique()
NameError: name 'df' is not defined
# Print the distinct values of every column to spot unexpected categories.
# The loop body must be indented; at column 0 it is an IndentationError.
for col in df.columns:
    print(f"{col}: {df[col].unique()}")
gender: ['female' 'male']
race/ethnicity: ['group B' 'group C' 'group A' 'group D' 'group E']
parental level of education: ["bachelor's degree" 'some college'
"master's degree" "associate's degree"
'high school' 'some high school']
lunch: ['standard' 'free/reduced']
test preparation course: ['none' 'completed']
math score: [ 72 69 90 47 76 71 88 40 64 38 58 65 78 50
18 46 54 66
44 74 73 67 70 62 63 56 97 81 75 57 55 53 59 82 77
33
52 0 79 39 45 60 61 41 49 30 80 42 27 43 68 85 98
87
51 99 84 91 83 89 22 100 96 94 48 35 34 86 92 37 28
24
26 95 36 29 32 93 19 23 8]
reading score: [ 72 90 95 57 78 83 43 64 60 54 52 81 53
75 89 32 42 58
69 73 71 74 70 65 87 56 61 84 55 44 41 85 59 17 39
80
37 63 51 49 26 68 45 47 86 34 79 66 67 91 100 76 77
82
92 93 62 88 50 28 48 46 23 38 94 97 99 31 96 24 29
40]
writing score: [ 74 88 93 44 75 78 92 39 67 50 52 43 73
70 58 86 28 46
61 63 53 80 72 55 65 38 82 79 83 59 57 54 68 66 62
76
48 42 87 49 10 34 71 37 56 41 22 81 45 36 89 47 90
100
64 98 51 40 84 69 33 60 85 91 77 27 94 95 19 35 32
96
97 99 15 30 23]
# Rows whose math score is strictly above 70. The result is bound to a name:
# a bare expression that is not the final statement is evaluated and then
# discarded, so the original filter had no visible effect.
high_math_scorers = df[df["math score"] > 70]
# Rows for female students only; kept as the final expression so it is
# still displayed when run in a notebook.
female_students = df[df["gender"] == 'female']
female_students
gender race/ethnicity parental level of education
lunch \
0 female group B bachelor's degree standard
1 female group C some college standard
2 female group B master's degree standard
5 female group B associate's degree standard
6 female group B some college standard
.. ... ... ... ...
993 female group D bachelor's degree free/reduced
995 female group E master's degree standard
997 female group C high school free/reduced
998 female group D some college standard
999 female group D some college free/reduced
test preparation course math score reading score writing score
0 none 72 72 74
1 completed 69 90 88
2 none 90 95 93
5 none 71 83 78
6 completed 88 95 92
.. ... ... ... ...
993 none 62 72 74
995 completed 88 99 95
997 completed 59 71 65
998 completed 68 78 77
999 none 77 86 86
[518 rows x 8 columns]
# Average math score per gender.
math_by_gender = df.groupby('gender')['math score']
math_by_gender.mean()
gender
female 63.633205
male 68.728216
Name: math score, dtype: float64
# Quick default histogram of the math score column.
sns.histplot(data=df, x='math score')
<Axes: xlabel='math score', ylabel='Count'>
# Histogram of math scores (10 bins) with a KDE overlay, drawn on an
# explicitly created 8x5 Axes and labelled through the Axes API.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['math score'], kde=True, bins=10, ax=ax)
ax.set_title("Distribution of Math Scores")
ax.set_xlabel("Math Score")
ax.set_ylabel("Frequency")
plt.show()
# Box plots of math score against each categorical factor, each shown
# separately. The (column, title) pairs drive one shared plotting routine.
boxplot_specs = [
    ('gender', "Gender vs Math Score"),
    ('lunch', "Lunch Type vs Math Score"),
    ('test preparation course', "Test Prep vs Math Score"),
]
for category, title in boxplot_specs:
    sns.boxplot(x=category, y='math score', data=df)
    plt.title(title)
    plt.show()
# Mean of each subject score per gender. The column list is built on its own
# line so no string literal is split across lines (a line break inside
# 'writing score' is a SyntaxError).
score_columns = ['math score', 'reading score', 'writing score']
df.groupby('gender')[score_columns].mean()
math score reading score writing score
gender
female 63.633205 72.608108 72.467181
male 68.728216 65.473029 63.311203
# Write the DataFrame to CSV, omitting the index column.
# NOTE(review): no cleaning step is visible in this section before the save;
# confirm that the "cleaned" file name is intended.
output_path = "cleaned_dataset.csv"
df.to_csv(output_path, index=False)