10/6/24, 7:51 PM DMV_4 - Jupyter Notebook
In [1]: import pandas as pd
In [5]: data = pd.read_csv('city_day.csv')
In [6]: data.head()
Out[6]:
City Date PM2.5 PM10 NO NO2 NOx NH3 CO SO2 O3 Benzene
2015-
0 Ahmedabad NaN NaN 0.92 18.22 17.15 NaN 0.92 27.64 133.36 0.00
01-01
2015-
1 Ahmedabad NaN NaN 0.97 15.69 16.46 NaN 0.97 24.55 34.06 3.68
01-02
2015-
2 Ahmedabad NaN NaN 17.40 19.30 29.70 NaN 17.40 29.07 30.70 6.80
01-03
2015-
3 Ahmedabad NaN NaN 1.70 18.48 17.97 NaN 1.70 18.59 36.08 4.43
01-04
2015-
4 Ahmedabad NaN NaN 22.10 21.42 37.76 NaN 22.10 39.33 39.31 7.01
01-05
In [7]: data.columns
Out[7]: Index(['City', 'Date', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO',
'SO2',
'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket'],
dtype='object')
In [8]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 City 29531 non-null object
1 Date 29531 non-null object
2 PM2.5 24933 non-null float64
3 PM10 18391 non-null float64
4 NO 25949 non-null float64
5 NO2 25946 non-null float64
6 NOx 25346 non-null float64
7 NH3 19203 non-null float64
8 CO 27472 non-null float64
9 SO2 25677 non-null float64
10 O3 25509 non-null float64
11 Benzene 23908 non-null float64
12 Toluene 21490 non-null float64
13 Xylene 11422 non-null float64
14 AQI 24850 non-null float64
15 AQI_Bucket 24850 non-null object
dtypes: float64(13), object(3)
memory usage: 3.6+ MB
localhost:8888/notebooks/BE_PRACTICALS/DMV_4.ipynb 1/8
10/6/24, 7:51 PM DMV_4 - Jupyter Notebook
In [9]: data.describe()
Out[9]:
PM2.5 PM10 NO NO2 NOx NH3
count 24933.000000 18391.000000 25949.000000 25946.000000 25346.000000 19203.000000
mean 67.450578 118.127103 17.574730 28.560659 32.309123 23.483476
std 64.661449 90.605110 22.785846 24.474746 31.646011 25.684275
min 0.040000 0.010000 0.020000 0.010000 0.000000 0.010000
25% 28.820000 56.255000 5.630000 11.750000 12.820000 8.580000
50% 48.570000 95.680000 9.890000 21.690000 23.520000 15.850000
75% 80.590000 149.745000 19.950000 37.620000 40.127500 30.020000
max 949.990000 1000.000000 390.680000 362.210000 467.630000 352.890000
In [10]: data.isnull().sum()
Out[10]: City 0
Date 0
PM2.5 4598
PM10 11140
NO 3582
NO2 3585
NOx 4185
NH3 10328
CO 2059
SO2 3854
O3 4022
Benzene 5623
Toluene 8041
Xylene 18109
AQI 4681
AQI_Bucket 4681
dtype: int64
localhost:8888/notebooks/BE_PRACTICALS/DMV_4.ipynb 2/8
10/6/24, 7:51 PM DMV_4 - Jupyter Notebook
In [22]: data['PM2.5'].fillna(data['PM2.5'].mean(), inplace=True)
data['PM10'].fillna(data['PM10'].mean(), inplace=True)
data['NO'].fillna(data['NO'].mean(), inplace=True)
data['NO2'].fillna(data['NO2'].mean(), inplace=True)
data['NOx'].fillna(data['NOx'].mean(), inplace=True)
data['NH3'].fillna(data['NH3'].mean(), inplace=True)
data['CO'].fillna(data['CO'].mean(), inplace=True)
data['SO2'].fillna(data['SO2'].mean(), inplace=True)
data['O3'].fillna(data['O3'].mean(), inplace=True)
data['Benzene'].fillna(data['Benzene'].mean(), inplace=True)
data['Toluene'].fillna(data['Toluene'].mean(), inplace=True)
data['Xylene'].fillna(data['Xylene'].mean(), inplace=True)
data['AQI'].fillna(data['AQI'].mean(), inplace=True)
data['AQI_Bucket'].fillna(data['AQI_Bucket'].mean(), inplace=True)
localhost:8888/notebooks/BE_PRACTICALS/DMV_4.ipynb 3/8
10/6/24, 7:51 PM DMV_4 - Jupyter Notebook
--------------------------------------------------------------------------
-
TypeError Traceback (most recent call las
t)
Cell In[22], line 14
12 data['Xylene'].fillna(data['Xylene'].mean(), inplace=True)
13 data['AQI'].fillna(data['AQI'].mean(), inplace=True)
---> 14 data['AQI_Bucket'].fillna(data['AQI_Bucket'].mean(), inplace=True)
File C:\Python310\lib\site-packages\pandas\core\series.py:6221, in Series.
mean(self, axis, skipna, numeric_only, **kwargs)
6213 @doc(make_doc("mean", ndim=1))
6214 def mean(
6215 self,
(...)
6219 **kwargs,
6220 ):
-> 6221 return NDFrame.mean(self, axis, skipna, numeric_only, **kwarg
s)
File C:\Python310\lib\site-packages\pandas\core\generic.py:11978, in NDFra
me.mean(self, axis, skipna, numeric_only, **kwargs)
11971 def mean(
11972 self,
11973 axis: Axis | None = 0,
(...)
11976 **kwargs,
11977 ) -> Series | float:
> 11978 return self._stat_function(
11979 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwar
gs
11980 )
File C:\Python310\lib\site-packages\pandas\core\generic.py:11935, in NDFra
me._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
11931 nv.validate_func(name, (), kwargs)
11933 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11935 return self._reduce(
11936 func, name=name, axis=axis, skipna=skipna, numeric_only=numeri
c_only
11937 )
File C:\Python310\lib\site-packages\pandas\core\series.py:6129, in Series.
_reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
6124 # GH#47500 - change to TypeError to match other methods
6125 raise TypeError(
6126 f"Series.{name} does not allow {kwd_name}={numeric_only} "
6127 "with non-numeric dtypes."
6128 )
-> 6129 return op(delegate, skipna=skipna, **kwds)
File C:\Python310\lib\site-packages\pandas\core\nanops.py:147, in bottlene
ck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)
145 result = alt(values, axis=axis, skipna=skipna, **kwds)
146 else:
--> 147 result = alt(values, axis=axis, skipna=skipna, **kwds)
149 return result
File C:\Python310\lib\site-packages\pandas\core\nanops.py:404, in _datetim
elike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs)
401 if datetimelike and mask is None:
localhost:8888/notebooks/BE_PRACTICALS/DMV_4.ipynb 4/8
10/6/24, 7:51 PM DMV_4 - Jupyter Notebook
402 mask = isna(values)
--> 404 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwarg
s)
406 if datetimelike:
407 result = _wrap_results(result, orig_values.dtype, fill_value=i
NaT)
File C:\Python310\lib\site-packages\pandas\core\nanops.py:719, in nanmean
(values, axis, skipna, mask)
716 dtype_count = dtype
718 count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
--> 719 the_sum = values.sum(axis, dtype=dtype_sum)
720 the_sum = _ensure_numeric(the_sum)
722 if axis is not None and getattr(the_sum, "ndim", False):
File C:\Python310\lib\site-packages\numpy\core\_methods.py:48, in _sum(a,
axis, dtype, out, keepdims, initial, where)
46 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
47 initial=_NoValue, where=True):
---> 48 return umr_sum(a, axis, dtype, out, keepdims, initial, where)
TypeError: unsupported operand type(s) for +: 'int' and 'str'
In [ ]: import matplotlib.pyplot as plt
In [23]: data['Date'] = pd.to_datetime(data['Date'])
In [24]: plt.figure(figsize=(12, 6))
plt.plot(data['Date'], data['AQI'], label='AQI', color='blue', linewidth=2)
plt.title('Overall AQI Trend Over Time')
plt.xlabel('Date')
plt.ylabel('AQI Value')
plt.xticks(rotation=45)
plt.legend()
plt.grid()
plt.tight_layout()
plt.show()
localhost:8888/notebooks/BE_PRACTICALS/DMV_4.ipynb 5/8
10/6/24, 7:51 PM DMV_4 - Jupyter Notebook
In [25]: pollutants = ['PM2.5', 'PM10', 'CO']
for pollutant in pollutants:
plt.figure(figsize=(12, 6))
plt.plot(data['Date'], data[pollutant], label=pollutant, linewidth=2)
plt.title(f'{pollutant} Levels Over Time')
plt.xlabel('Date')
plt.ylabel(f'{pollutant} Concentration')
plt.xticks(rotation=45)
plt.legend()
plt.grid()
plt.tight_layout()
plt.show()
localhost:8888/notebooks/BE_PRACTICALS/DMV_4.ipynb 6/8
10/6/24, 7:51 PM DMV_4 - Jupyter Notebook
In [26]: daily_avg_aqi = data.groupby('Date')['AQI'].mean().reset_index()
plt.figure(figsize=(12, 6))
plt.bar(daily_avg_aqi['Date'], daily_avg_aqi['AQI'], color='orange')
plt.title('Average AQI Values Across Dates')
plt.xlabel('Date')
plt.ylabel('Average AQI Value')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
localhost:8888/notebooks/BE_PRACTICALS/DMV_4.ipynb 7/8
10/6/24, 7:51 PM DMV_4 - Jupyter Notebook
In [27]: plt.figure(figsize=(12, 6))
plt.scatter(data['PM2.5'], data['AQI'], alpha=0.5)
plt.title('Relationship Between PM2.5 and AQI')
plt.xlabel('PM2.5 Concentration')
plt.ylabel('AQI Value')
plt.grid()
plt.show()
In [ ]:
In [ ]:
localhost:8888/notebooks/BE_PRACTICALS/DMV_4.ipynb 8/8