2/6/23, 5:11 PM Untitled10 - Jupyter Notebook
In [7]:
# Data exploration, here we assign the data, and then we visualize the data in a tabular format.
# Import pandas package
import pandas as pd
# Assign data
data = {'Name': ['Jai', 'Princi', 'Gaurav',
'Anuj', 'Ravi', 'Natasha', 'Riya'],
'Age': [17, 17, 18, 17, 18, 17, 17],
'Gender': ['M', 'F', 'M', 'M', 'M', 'F', 'F'],
'Marks': [90, 76, 'NaN', 74, 65, 'NaN', 71]}
# Convert into DataFrame
df = pd.DataFrame(data)
# Display data
print(df)
Name Age Gender Marks
0 Jai 17 M 90
1 Princi 17 F 76
2 Gaurav 18 M NaN
3 Anuj 17 M 74
4 Ravi 18 M 65
5 Natasha 17 F NaN
6 Riya 17 F 71
localhost:8892/notebooks/Untitled10.ipynb?kernel_name=python3 1/5
2/6/23, 5:11 PM Untitled10 - Jupyter Notebook
In [23]:
# Compute average
c = avg = 0
for ele in df["Marks"]:
if str(ele).isnumeric():
c += 1
avg += ele
avg /= c
# Replace missing values
df = df.replace(to_replace="NaN",
value=avg)
# Display data
print(df)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3628 try:
-> 3629 return self._engine.get_loc(casted_key)
3630 except KeyError as err:
~\anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
~\anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Marks'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_103800\4025425810.py in <module>
1 # Compute average
2 #c = avg = 0
----> 3 for ele in df["Marks"]:
4 if str(ele).isnumeric():
5 c += 1
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
3503 if self.columns.nlevels > 1:
3504 return self._getitem_multilevel(key)
-> 3505 indexer = self.columns.get_loc(key)
3506 if is_integer(indexer):
3507 indexer = [indexer]
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3629 return self._engine.get_loc(casted_key)
3630 except KeyError as err:
-> 3631 raise KeyError(key) from err
3632 except TypeError:
3633 # If we have a listlike key, _check_indexing_error will raise
KeyError: 'Marks'
In [13]:
# Categorize gender
df['Gender'] = df['Gender'].map({'M': 0,
'F': 1, }).astype(float)
# Display data
print(df)
Name Age Gender Marks
0 Jai 17 NaN 90.0
1 Princi 17 NaN 76.0
2 Gaurav 18 NaN 75.2
3 Anuj 17 NaN 74.0
4 Ravi 18 NaN 65.0
5 Natasha 17 NaN 75.2
6 Riya 17 NaN 71.0
localhost:8892/notebooks/Untitled10.ipynb?kernel_name=python3 2/5
2/6/23, 5:11 PM Untitled10 - Jupyter Notebook
In [14]:
# Filter top scoring students
df = df[df['Marks'] >= 75]
# Remove age row
df = df.drop(['Age'], axis=1)
# Display data
print(df)
Name Gender Marks
0 Jai NaN 90.0
1 Princi NaN 76.0
2 Gaurav NaN 75.2
5 Natasha NaN 75.2
In [15]:
# Wrangling Data Using Merge Operation
# Merge operation is used to merge raw data and into the desired format.
# Syntax for merging pd.merge( data_frame1,data_frame2, on="field ")
# import module
import pandas as pd
# creating DataFrame for Student Details
details = pd.DataFrame({
'ID': [101, 102, 103, 104, 105, 106,
107, 108, 109, 110],
'NAME': ['Jagroop', 'Praveen', 'Harjot',
'Pooja', 'Rahul', 'Nikita',
'Saurabh', 'Ayush', 'Dolly', "Mohit"],
'BRANCH': ['CSE', 'CSE', 'CSE', 'CSE', 'CSE',
'CSE', 'CSE', 'CSE', 'CSE', 'CSE']})
# printing details
print(details)
ID NAME BRANCH
0 101 Jagroop CSE
1 102 Praveen CSE
2 103 Harjot CSE
3 104 Pooja CSE
4 105 Rahul CSE
5 106 Nikita CSE
6 107 Saurabh CSE
7 108 Ayush CSE
8 109 Dolly CSE
9 110 Mohit CSE
In [16]:
# Import module
import pandas as pd
# Creating Dataframe for Fees_Status
fees_status = pd.DataFrame(
{'ID': [101, 102, 103, 104, 105,
106, 107, 108, 109, 110],
'PENDING': ['5000', '250', 'NIL',
'9000', '15000', 'NIL',
'4500', '1800', '250', 'NIL']})
# Printing fees_status
print(fees_status)
ID PENDING
0 101 5000
1 102 250
2 103 NIL
3 104 9000
4 105 15000
5 106 NIL
6 107 4500
7 108 1800
8 109 250
9 110 NIL
localhost:8892/notebooks/Untitled10.ipynb?kernel_name=python3 3/5
2/6/23, 5:11 PM Untitled10 - Jupyter Notebook
In [17]:
# WRANGLING DATA USING MERGE OPERATION:
# Creating Dataframe
details = pd.DataFrame({
'ID': [101, 102, 103, 104, 105,
106, 107, 108, 109, 110],
'NAME': ['Jagroop', 'Praveen', 'Harjot',
'Pooja', 'Rahul', 'Nikita',
'Saurabh', 'Ayush', 'Dolly', "Mohit"],
'BRANCH': ['CSE', 'CSE', 'CSE', 'CSE', 'CSE',
'CSE', 'CSE', 'CSE', 'CSE', 'CSE']})
# Creating Dataframe
fees_status = pd.DataFrame(
{'ID': [101, 102, 103, 104, 105,
106, 107, 108, 109, 110],
'PENDING': ['5000', '250', 'NIL',
'9000', '15000', 'NIL',
'4500', '1800', '250', 'NIL']})
# Merging Dataframe
print(pd.merge(details, fees_status, on='ID'))
ID NAME BRANCH PENDING
0 101 Jagroop CSE 5000
1 102 Praveen CSE 250
2 103 Harjot CSE NIL
3 104 Pooja CSE 9000
4 105 Rahul CSE 15000
5 106 Nikita CSE NIL
6 107 Saurabh CSE 4500
7 108 Ayush CSE 1800
8 109 Dolly CSE 250
9 110 Mohit CSE NIL
In [18]:
# wrangling data using grouping method
# Creating Data
car_selling_data = {'Brand': ['Maruti', 'Maruti', 'Maruti',
'Maruti', 'Hyundai', 'Hyundai',
'Toyota', 'Mahindra', 'Mahindra',
'Ford', 'Toyota', 'Ford'],
'Year': [2010, 2011, 2009, 2013,
2010, 2011, 2011, 2010,
2013, 2010, 2010, 2011],
'Sold': [6, 7, 9, 8, 3, 5,
2, 8, 7, 2, 4, 2]}
# Creating Dataframe of car_selling_data
df = pd.DataFrame(car_selling_data)
print(df)
Brand Year Sold
0 Maruti 2010 6
1 Maruti 2011 7
2 Maruti 2009 9
3 Maruti 2013 8
4 Hyundai 2010 3
5 Hyundai 2011 5
6 Toyota 2011 2
7 Mahindra 2010 8
8 Mahindra 2013 7
9 Ford 2010 2
10 Toyota 2010 4
11 Ford 2011 2
In [19]:
# Group the data when year = 2010
grouped = df.groupby('Year')
print(grouped.get_group(2010))
Brand Year Sold
0 Maruti 2010 6
4 Hyundai 2010 3
7 Mahindra 2010 8
9 Ford 2010 2
10 Toyota 2010 4
localhost:8892/notebooks/Untitled10.ipynb?kernel_name=python3 4/5
2/6/23, 5:11 PM Untitled10 - Jupyter Notebook
In [20]:
# Wrangling data by removing Duplication
# DataFrame.duplicated(subset=None, keep='first')
# Initializing Data
student_data = {'Name': ['Amit', 'Praveen', 'Jagroop',
'Rahul', 'Vishal', 'Suraj',
'Rishab', 'Satyapal', 'Amit',
'Rahul', 'Praveen', 'Amit'],
'Roll_no': [23, 54, 29, 36, 59, 38,
12, 45, 34, 36, 54, 23],
'
[email protected]', '
[email protected]',
'
[email protected]', '
[email protected]',
'
[email protected]', '
[email protected]',
'
[email protected]', '
[email protected]',
'
[email protected]', '
[email protected]']}
# Creating Dataframe of Data
df = pd.DataFrame(student_data)
# Printing Dataframe
print(df)
Name Roll_no Email
0 Amit 23
[email protected] 1 Praveen 54
[email protected] 2 Jagroop 29
[email protected] 3 Rahul 36
[email protected] 4 Vishal 59
[email protected] 5 Suraj 38
[email protected] 6 Rishab 12
[email protected] 7 Satyapal 45
[email protected] 8 Amit 34
[email protected] 9 Rahul 36
[email protected] 10 Praveen 54
[email protected] 11 Amit 23
[email protected] In [21]:
# Here df.duplicated() list duplicate Entries in ROllno.
# So that ~(NOT) is placed in order to get non duplicate values.
non_duplicate = df[~df.duplicated('Roll_no')]
# printing non-duplicate values
print(non_duplicate)
Name Roll_no Email
0 Amit 23
[email protected] 1 Praveen 54
[email protected] 2 Jagroop 29
[email protected] 3 Rahul 36
[email protected] 4 Vishal 59
[email protected] 5 Suraj 38
[email protected] 6 Rishab 12
[email protected] 7 Satyapal 45
[email protected] 8 Amit 34
[email protected] In [ ]:
localhost:8892/notebooks/Untitled10.ipynb?kernel_name=python3 5/5