Lab 1: Numpy Fundamentals and Array
Operations
-> To introduce students to the foundational concepts of NumPy, focusing on array creation,
manipulation, and mathematical operations.
-> This lab aims to build essential skills for performing efficient numerical computations and preparing
data for machine learning and data analysis tasks.
In [1]: # Import necessary libraries
import numpy as np
In [2]: # 1. Array Creation
a1 = np.array([1,2,3,4,5,6])
a2 = np.array([[9,8,7],[6,5,4]])
print("1D array: ", a1)
print("2D array:\n ", a2)
1D array: [1 2 3 4 5 6]
2D array:
[[9 8 7]
[6 5 4]]
In [3]: # 2. Creaating array with zeros
zeros = np.zeros((3,3))
print("Array of zeros:\n ", zeros)
Array of zeros:
[[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]]
In [4]: # 3. Creating array with ones
ones = np.ones((4,4))
print("Array of ones:\n", ones)
Array of ones:
[[1. 1. 1. 1.]
[1. 1. 1. 1.]
[1. 1. 1. 1.]
[1. 1. 1. 1.]]
In [5]: # 4. Using arange to create an array at equal intervals
arrange = np.arange(0, 10, 2)
print("Array with arrange:\n", arrange)
Array with arrange:
[0 2 4 6 8]
In [6]: # 5. Using linspace to create an array with specified number of points
linespace = np.linspace(0,10,5)
print("Array with linespace:\n", linespace)
Array with linespace:
[ 0. 2.5 5. 7.5 10. ]
In [7]: # 6. Using eye to create an identity matrix
identity = np.eye(N = 3, dtype=int)
print("Identity matrix:\n", identity)
Identity matrix:
[[1 0 0]
[0 1 0]
[0 0 1]]
In [8]: # 7. Knowing the shape of an array
a2_shape = a2.shape
print("Shape of a2: ", a2_shape)
Shape of a2: (2, 3)
In [9]: # 8. Reshaping an array
reshape = np.reshape(a1, (3,2))
print("Reshapened:\n", reshape)
Reshapened:
[[1 2]
[3 4]
[5 6]]
In [10]: # 9. Transposing an array
flatten = a2.flatten()
print("Flatten:", flatten)
Flatten: [9 8 7 6 5 4]
In [11]: # 10. Indexing and slicing
a3 = np.array([[10,20,30],[40,50,60]])
print("Element at position (1,2):", a3[1,2])
print("First row:", a3[0])
print("Second column:", a3[1])
print("Slice (0:2, 1:3):\n", a3[0:2,1:3])
Element at position (1,2): 60
First row: [10 20 30]
Second column: [40 50 60]
Slice (0:2, 1:3):
[[20 30]
[50 60]]
In [12]: # 11. Simple Mathematical operations
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])
print("Addition:", x + y)
print("Subtraction:", x - y)
print("Multiplication:", x * y)
print("Division:", x / y)
print("Exponent:", x ** 2)
Addition: [5 7 9]
Subtraction: [-3 -3 -3]
Multiplication: [ 4 10 18]
Division: [0.25 0.4 0.5 ]
Exponent: [1 4 9]
In [13]: # 12. Statistical operations
a4 = np.array([[1, 2, 3], [4, 5, 6]])
print("Sum:", np.sum(a4))
print("Sum (axis=0):", np.sum(a4, axis=0))
print("Mean:", np.mean(a4))
print("Max:", np.max(a4))
print("Min:", np.min(a4))
print("Standard Deviation:", np.std(a4))
Sum: 21
Sum (axis=0): [5 7 9]
Mean: 3.5
Max: 6
Min: 1
Standard Deviation: 1.707825127659933
In [14]: # 13. Broadcasting operation
arr = np.array([[1, 2], [3, 4]])
print("Broadcasted +5:\n", arr + 5)
Broadcasted +5:
[[6 7]
[8 9]]
In [15]: # 14. Vertical and Horizontal Stacking
v1 = np.array([1, 2, 3])
v2 = np.array([4, 5, 6])
print("Vertical Stack:\n", np.vstack((v1, v2)))
print("Horizontal Stack:\n", np.hstack((v1, v2)))
Vertical Stack:
[[1 2 3]
[4 5 6]]
Horizontal Stack:
[1 2 3 4 5 6]
In [16]: # 15. Splitting arrays
split_arr = np.array([10, 20, 30, 40])
print("Split:\n", np.split(split_arr, 2))
Split:
[array([10, 20]), array([30, 40])]
In [17]: # 16. Coditioning arrays
nums = np.array([5, 10, 15, 20])
print("Greater than 10:", nums[nums > 10])
print("Even numbers:", nums[nums % 2 == 0])
Greater than 10: [15 20]
Even numbers: [10 20]
In [18]: # 17. Copy vs View
original = np.array([1, 2, 3])
copy_arr = original.copy()
view_arr = original.view()
copy_arr[0] = 99
view_arr[1] = 77
print("Original:", original)
print("Copy:", copy_arr)
print("View:", view_arr)
Original: [ 1 77 3]
Copy: [99 2 3]
View: [ 1 77 3]
In [19]: # 18. Random number generation
rand_uniform = np.random.rand(3, 2)
rand_int = np.random.randint(1, 10, size=(2, 3))
np.random.seed(0) # For reproducibility
rand_fixed = np.random.rand(2, 2)
print("Random Uniform:\n", rand_uniform)
print("Random Integers:\n", rand_int)
print("Random with Seed:\n", rand_fixed)
Random Uniform:
[[0.79847269 0.66721666]
[0.30928783 0.55393347]
[0.39006479 0.73750991]]
Random Integers:
[[8 7 6]
[6 7 5]]
Random with Seed:
[[0.5488135 0.71518937]
[0.60276338 0.54488318]]
Lab 2: Introduction to Pandas
This lab introduces Pandas, a powerful library for data manipulation and analysis.
# Importing necessary libraries
import numpy as np
import pandas as pd
# 1. Series Creation from list
data_list = [10, 20, 30, 40, 50]
series1 = pd.Series(data_list)
print("Series from list:\n", series1)
Series from list:
0 10
1 20
2 30
3 40
4 50
dtype: int64
# 2. Series Creation from dictionary
dict1 = {"a" : 100, "b" : 200, "c" : 300}
series2 = pd.Series(dict1)
print("Series from dictionary:\n", series2)
Series from dictionary:
a 100
b 200
c 300
dtype: int64
# 3. Creating dataframe from dictionary
dict2 = {
"Name" : ["Alice", "Bob", "Charlie"],
"Age" : [25, 30, 35],
"City" : ["New York", "Los Angeles", "Chicago"]
}
df = pd.DataFrame(dict2)
print("DataFrame from dictionary:\n", df)
DataFrame from dictionary:
Name Age City
0 Alice 25 New York
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
# 4.1 Writing DataFrame to CSV
df.to_csv("df.csv", index=False)
# 4.2 Writing DataFrame to Excel
df.to_excel("df.xlsx", index=False)
# 4.3 Writing DataFrame to JSON
df.to_json("df.json", orient="records", lines=True)
# 5.1 Reading CSV file
df_csv = pd.read_csv("df.csv")
# 5.2 Reading Excel file
df_excel = pd.read_excel("df.xlsx")
# 5.3 Reading JSON file
df_json = pd.read_json("df.json", lines=True)
# 6. Dataframe shape
df_shape = df.shape
print("DataFrame shape: ", df_shape)
DataFrame shape: (3, 3)
# 7. Dataframe columns
df_columns = df.columns
print("Dataframe columns: ", df_columns)
Dataframe columns: Index(['Name', 'Age', 'City'], dtype='object')
# 8. Dataframe Information
df_info = df.info()
print("Dataframe info:\n", df_info)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 3 non-null object
1 Age 3 non-null int64
2 City 3 non-null object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes
Dataframe info:
None
# 9. Dataframe Summary
df_descibe = df.describe(include='all')
print("Dataframe describe:\n", df_descibe)
Dataframe describe:
Name Age City
count 3 3.0 3
unique 3 NaN 3
top Alice NaN New York
freq 1 NaN 1
mean NaN 30.0 NaN
std NaN 5.0 NaN
min NaN 25.0 NaN
25% NaN 27.5 NaN
50% NaN 30.0 NaN
75% NaN 32.5 NaN
max NaN 35.0 NaN
# 10. Finding missing values
df_missing = df.isnull().sum()
print("Missing values in each column:\n", df_missing)
Missing values in each column:
Name 0
Age 0
City 0
dtype: int64
# 11. Selecting and filtering data
print("Selecting specific columns:\n", df[["Name", "City"]])
print("\nSelect rows using loc:\n", df.loc[0])
print("\nFilter age > 25:\n", df[df['Age'] > 25])
Selecting specific columns:
Name City
0 Alice New York
1 Bob Los Angeles
2 Charlie Chicago
Select rows using loc:
Name Alice
Age 25
City New York
Name: 0, dtype: object
Filter age > 25:
Name Age City
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
# 12. Adding new column
df["Age + 5"] = df["Age"] + 5
df
Name Age City Age + 5
0 Alice 25 New York 30
1 Bob 30 Los Angeles 35
2 Charlie 35 Chicago 40
# 13. Renaming columns
df.rename(columns={"Name": "First Name", "City": "Location"}, inplace=True)
print("Renamed DataFrame:\n", df)
Renamed DataFrame:
First Name Age Location Age + 5
0 Alice 25 New York 30
1 Bob 30 Los Angeles 35
2 Charlie 35 Chicago 40
# 14. Dropping columns
df.drop(columns=["Age + 5"], inplace=True)
print("DataFrame after dropping column:\n", df)
DataFrame after dropping column:
First Name Age Location
0 Alice 25 New York
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js
Lab 3: Pandas Fundamentals Practice
In this lab, you will practice fundamental Pandas operations using a single dataset. Dataset: students.csv (assumed to be in the
same directory)
Tasks include:
Exploring the dataset
Selecting/filtering data
Performing basic operations
Aggregating and grouping
Saving the updated dataset
# 1. Import the necessary libraries
import numpy as np
import pandas as pd
# 2. Load the dataset and print the first 5 rows
df = pd.read_csv(r"data\3_student_data.csv")
df.head()
school gender age address famsize Pstatus Medu Fedu Mjob Fjob ... famrel freetime goout Dalc Walc health abs
0 GP F 18 U GT3 A 4 4 at_home teacher ... 4 3 4 1 1 3
1 GP F 17 U GT3 T 1 1 at_home other ... 5 3 3 1 1 3
2 GP F 15 U LE3 T 1 1 at_home other ... 4 3 2 2 3 3
3 GP F 15 U GT3 T 4 2 health services ... 3 2 2 1 1 5
4 GP F 16 U GT3 T 3 3 other other ... 4 3 2 1 2 5
5 rows × 33 columns
# 3. Print the shape of the dataset and the column names
df_shape = df.shape
print(f"The sape of the dataset is: {df_shape}")
df_columns = df.columns
print(f"\nThe columns in the dataset are: {df_columns}")
The sape of the dataset is: (395, 33)
The columns in the dataset are: Index(['school', 'gender', 'age', 'address', 'famsize', 'Pstatus', 'Medu',
'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
dtype='object')
# 4. Display the inofrmation about the dataset
df_info = df.info()
print(f"\nThe information about the dataset is: {df_info}")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 school 395 non-null object
1 gender 395 non-null object
2 age 395 non-null int64
3 address 395 non-null object
4 famsize 395 non-null object
5 Pstatus 395 non-null object
6 Medu 395 non-null int64
7 Fedu 395 non-null int64
8 Mjob 395 non-null object Do not write this output
9 Fjob 395 non-null object
10 reason 395 non-null object
11 guardian 395 non-null object
12 traveltime 395 non-null int64
13 studytime 395 non-null int64
14 failures 395 non-null int64
15 schoolsup 395 non-null object
16 famsup 395 non-null object
17 paid 395 non-null object
18 activities 395 non-null object
19 nursery 395 non-null object
20 higher 395 non-null object
21 internet 395 non-null object
22 romantic 395 non-null object
23 famrel 395 non-null int64
24 freetime 395 non-null int64
25 goout 395 non-null int64
26 Dalc 395 non-null int64
27 Walc 395 non-null int64
28 health 395 non-null int64
29 absences 395 non-null int64
30 G1 395 non-null int64
31 G2 395 non-null int64
32 G3 395 non-null int64
dtypes: int64(16), object(17)
memory usage: 102.0+ KB
The information about the dataset is: None
# 5. Show only gender, age and school columns
custom_columns = df[["gender", "age", "school"]]
print(f"The custom columns are: \n{custom_columns}")
The custom columns are:
gender age school
0 F 18 GP
1 F 17 GP
2 F 15 GP
3 F 15 GP
4 F 16 GP
.. ... ... ...
390 M 20 MS
391 M 17 MS
392 M 21 MS
393 M 18 MS
394 M 19 MS
[395 rows x 3 columns]
# 6. Filter and show all the rows where the age is greater than 20 in the filtered dataset
age_filter = custom_columns[custom_columns["age"] > 20]
print(f"\nThe rows where the age is greater than 20 are: \n{age_filter}")
The rows where the age is greater than 20 are:
gender age school
247 M 22 GP
392 M 21 MS
# 7. Find the number of student who has a reading time above 2 hours
reading_time_filter = df[df["studytime"] > 2]
print(f"\nThe number of students who has a reading time above 2 hours is: {reading_time_filter.shape[0]}")
The number of students who has a reading time above 2 hours is: 92
# 8. Create a new column called "averge" (mean of G!, G2, G3)
df["average"] = df[["G1", "G2", "G3"]].mean(axis=1)
print(f"\nThe new column 'average' is added to the dataset: \n{df[['G1', 'G2', 'G3', 'average']].head()}")
The new column 'average' is added to the dataset:
G1 G2 G3 average
0 5 6 6 5.666667
1 5 5 6 5.333333
2 7 8 10 8.333333
3 15 14 15 14.666667
4 6 10 10 8.666667
# 9. Add a new column called "pass" which is True if the average is greater than 8 and False otherwise
df['result'] = df['average'].apply(lambda x: 'Pass' if x >= 8 else 'Fail')
print(df[['average', 'result']].head())
average result
0 5.666667 Fail
1 5.333333 Fail
2 8.333333 Pass
3 14.666667 Pass
4 8.666667 Pass
# 10. Count the number of students who passed and failed
pass_count = df['result'].value_counts()
print(f"\nThe number of students who passed and failed are: \n{pass_count}")
The number of students who passed and failed are:
result
Pass 313
Fail 82
Name: count, dtype: int64
# 11. Find the aveerage score grouped by gender
average_score = df.groupby('gender')[['G1', 'G2', 'G3']].mean()
print(f"\nThe average score grouped by gender is: \n{average_score}")
The average score grouped by gender is:
G1 G2 G3
gender
F 10.620192 10.389423 9.966346
M 11.229947 11.074866 10.914439
# 12. Rename the columns "G1", "G2", "G3" to "Maths", "English", "Science"
df.rename(columns={'G1': 'Maths', 'G2': 'English', 'G3': 'Science'}, inplace=True)
print(f"\nThe new columns names are: \n{df.head()}")
The new columns names are:
school gender age address famsize Pstatus Medu Fedu Mjob Fjob \
0 GP F 18 U GT3 A 4 4 at_home teacher
1 GP F 17 U GT3 T 1 1 at_home other
2 GP F 15 U LE3 T 1 1 at_home other
3 GP F 15 U GT3 T 4 2 health services
4 GP F 16 U GT3 T 3 3 other other
... Dalc Walc health absences Maths English Science average pass \
Do not write this output
0 ... 1 1 3 6 5 6 6 5.666667 False
1 ... 1 1 3 4 5 5 6 5.333333 False
2 ... 2 3 3 10 7 8 10 8.333333 False
3 ... 1 1 5 2 15 14 15 14.666667 True
4 ... 1 2 5 4 6 10 10 8.666667 False
result
0 Fail
1 Fail
2 Pass
3 Pass
4 Pass
[5 rows x 36 columns]
# 13. Sort the dataset by age in descending order
sorted_df = df.sort_values(by='age', ascending=False)
print(f"\nThe dataset sorted by age in descending order is: \n{sorted_df.head()}")
The dataset sorted by age in descending order is:
school gender age address famsize Pstatus Medu Fedu Mjob \
247 GP M 22 U GT3 T 3 1 services
392 MS M 21 R GT3 T 1 1 other
390 MS M 20 U LE3 A 2 2 services
306 GP M 20 U GT3 A 3 2 services
376 MS F 20 U GT3 T 4 2 health
Fjob ... Dalc Walc health absences Maths English Science \
247 services ... 5 5 1 16 6 8 8
392 other ... 3 3 3 3 10 8 7
390 services ... 4 5 4 11 9 9 9
306 other ... 1 1 5 0 17 18 18
376 other ... 1 1 3 4 15 14 15
average pass result
247 7.333333 False Fail
392 8.333333 False Pass
390 9.000000 False Pass
306 17.666667 True Pass
376 14.666667 True Pass
[5 rows x 36 columns]
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js
Lab 4: Introduction to Matplotlib
To introduce students to data visualization using the Matplotlib library. This lab focuses on developing skills to create, customize, and
interpret a wide variety of plots including line plots, bar charts, histograms, pie charts, scatter plots, subplots, box plots, and more.
Students will learn how to use these plots to explore and present insights from real-world datasets effectively.
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
# Loading the dataset
df = pd.read_csv(r"data\4_student_habits_performance.csv")
df
Study Social Media Netflix Part Time Sleep Exercise Mental
Age Gender Attendance Marks
Hours Hours Hours Job Hours Hours Health
0 19 0 3.1 0.7 1.3 1 88.0 7.6 2 6 81.5
1 20 0 2.1 0.5 2.0 0 80.9 7.3 3 6 62.3
2 18 0 5.9 1.3 1.7 0 80.6 6.2 5 1 87.1
3 20 0 0.9 4.3 4.1 0 90.1 6.1 6 4 27.6
4 19 0 5.2 4.2 2.8 1 93.0 7.5 2 7 77.9
... ... ... ... ... ... ... ... ... ... ... ...
195 24 1 2.4 4.3 1.0 0 87.9 8.9 5 9 71.0
196 18 1 1.2 3.6 0.0 0 72.7 6.6 4 4 41.1
197 21 1 1.9 2.3 1.4 0 84.0 5.1 4 4 51.8
198 17 1 5.0 1.3 0.8 0 95.4 5.4 6 7 96.2
199 19 1 2.6 3.1 0.3 0 84.6 7.1 6 5 77.5
200 rows × 11 columns
df_sort = df.sort_values(by = ["Study Hours"], ascending = True)
# 1. Displaing line plot of Study Hours vs Marks
plt.figure(figsize=(20, 8))
plt.plot(df_sort['Study Hours'], df['Marks'], marker='o', linestyle='--', color='b')
plt.title("Study Hours vs Marks")
plt.xlabel("Study Hours")
plt.ylabel("Marks")
plt.grid(True)
plt.show()
# 2. Displaying bar plot of Social Media Hours
plt.figure(figsize=(15, 5))
df['Social Media Hours'].value_counts().plot(kind='bar', color='skyblue')
plt.title("Social Media Hours Distribution")
plt.xlabel("Category")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.grid()
plt.show()
# 3. Displaying histogram of Sleep Hours
plt.figure(figsize=(8, 5))
plt.hist(df['Sleep Hours'], bins=20, color='lightgreen', edgecolor='black')
plt.title("Distribution of Sleep Hours")
plt.xlabel("Sleep Hours")
plt.ylabel("Number of Students")
plt.grid(axis='y')
plt.show()
# 4. Displaying pie chart of Gender Distribution
plt.figure(figsize=(6, 6))
df['Age'].value_counts().plot(kind='pie', autopct='%1.1f%%', colors=['pink', 'lightblue'])
plt.title("Gender Distribution")
plt.ylabel('')
plt.show()
# 5. Displaying scatter plot of Social Media Hours vs Netflix Hours
plt.figure(figsize=(8, 5))
plt.scatter(df['Social Media Hours'], df['Netflix Hours'], color='purple')
plt.title("Sleep Hours vs Marks")
plt.xlabel("Sleep Hours")
plt.ylabel("Marks")
plt.grid(True)
plt.show()
# 6. Subplots examples
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
# Subplot 1: Line plot of Study Hours and Marks
axs[0].plot(df_sort['Study Hours'], df['Marks'], marker='o')
axs[0].set_title("Study Hours vs Marks")
axs[0].set_xlabel("Study Hours")
axs[0].set_ylabel("Marks")
# Subplot 2: Scatter plot of Sleep Hours and Marks
axs[1].scatter(df['Sleep Hours'], df['Marks'], color='orange')
axs[1].set_title("Sleep Hours vs Marks")
axs[1].set_xlabel("Sleep Hours")
axs[1].set_ylabel("Marks")
plt.tight_layout()
plt.show()
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js