# Solutions for Data Analysis and Visualization (UPC: 2343012002)
# S.No. 1673
# SECTION A
# Q1 (a)
import matplotlib.pyplot as plt
import numpy as np
# Rainfall readings and the day value paired with each reading.
rainfall = [5, 2, 7, 8, 2]
days = [1, 3, 5, 1, 9]

# Draw the readings as large red circle markers (no connecting line),
# using the object-oriented Axes interface.
fig, ax = plt.subplots()
ax.plot(days, rainfall, 'ro', markersize=10)
ax.set_title("Rainfall over Days")
ax.set_xlabel("Days")
ax.set_ylabel("Rainfall")
plt.show()
# Q1 (b)
import pandas as pd
# Names (with repeats) and the age recorded against each entry.
company = pd.DataFrame({
    'Name': ['Sangeeta', 'Sarika', 'Sangeeta', 'Babita', 'Sarika'],
    'Age': [18, 30, 45, 32, 25],
})
# (i) Distinct names, in order of first appearance.
# The result is bound and printed: a bare expression shows nothing when this
# file is run as a script.
unique_names = company['Name'].unique()
print(unique_names)
# (ii) Mean age for each distinct name.
mean_age_by_name = company.groupby('Name')['Age'].mean()
print(mean_age_by_name)
# Q1 (c)
# Roll lists for the two sections.
section1 = pd.DataFrame({
    'RollNo': [1, 2, 3, 4],
    'Name': ['Abhav', 'Vihaan', 'Chitra', 'Devansh'],
})
section2 = pd.DataFrame({
    'RollNo': [1, 5, 3, 2],
    'Name': ['Roni', 'Kabeer', 'Ishani', 'Vihaan'],
})
# (i) Show the first section.
print(section1)
# (ii) Inner join on Name only, with section2 as the left frame; the two
# RollNo columns come back suffixed _x (section2) and _y (section1).
merged = section2.merge(section1, how='inner', on='Name')
print(merged)
# (iii) Rows that agree on both Name and RollNo in the two sections.
common = section1.merge(section2, on=['Name', 'RollNo'])
print(common)
# Q1 (d)
# The array is named a1 throughout; the original first assignment used `al`
# (letter l), which left `a1` undefined at the np.append call below.
a1 = np.zeros((2, 3))
# a1 is now:
# [[0. 0. 0.]
#  [0. 0. 0.]]
a2 = [[3, 4, 5], [7, 8, 9]]
print(np.add(a1, a2))
# Output:
# [[3. 4. 5.]
#  [7. 8. 9.]]
# Stack the rows of a2 underneath a1 (axis=0 appends along rows).
a1 = np.append(a1, a2, axis=0)
print(a1)
# Output:
# [[0. 0. 0.]
#  [0. 0. 0.]
#  [3. 4. 5.]
#  [7. 8. 9.]]
print('shape of array', a1.shape)
# Output:
# shape of array (4, 3)
# Q1 (e)
empSalary = np.array([4000, 5200, 6100, 7000, 4900, 8000, 3000, 9200, 6300, 4800])
# (i) Number of salaries strictly greater than 5000 (boolean-mask selection).
# The count is bound and printed so it is visible when run as a script.
high_salary_count = len(empSalary[empSalary > 5000])
print(high_salary_count)
# (ii) Incentive computed as 10% of each salary.
incentive = empSalary * 0.1
print("Incentives:", incentive)
# Q1 (f)
# np.nan is used instead of np.NaN: the capitalised alias was removed in
# NumPy 2.0 and raises AttributeError there.
data = pd.DataFrame([
    [2, 4, 6],
    [np.nan, 8, 10],
    [np.nan, 12, np.nan],
    [np.nan, np.nan, np.nan],
])
print(data)
# Output:
#      0     1     2
# 0  2.0   4.0   6.0
# 1  NaN   8.0  10.0
# 2  NaN  12.0   NaN
# 3  NaN   NaN   NaN

# Keep only rows that have at least 2 non-missing values.
kept_rows = data.dropna(thresh=2)
print(kept_rows)
# Output:
#      0    1     2
# 0  2.0  4.0   6.0
# 1  NaN  8.0  10.0

# Forward-fill each column, filling at most 2 consecutive missing values.
# DataFrame.ffill replaces the deprecated fillna(method="ffill").
forward_filled = data.ffill(limit=2)
print(forward_filled)
# Output:
#      0     1     2
# 0  2.0   4.0   6.0
# 1  2.0   8.0  10.0
# 2  2.0  12.0  10.0
# 3  NaN  12.0  10.0
# SECTION B
# Q2 (a)
# Two-level labels for the rows and for the columns.
row_levels = [['North', 'North', 'South', 'South'], [1, 2, 1, 2]]
column_levels = [['Delhi', 'Delhi', 'Chandigarh'], ['Green', 'Red', 'Green']]

# 4x3 frame holding 0..11, indexed hierarchically on both axes.
df = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=row_levels,
    columns=column_levels,
)
df.index = df.index.set_names(['key1', 'key2'])
print(df)

# Exchange the two row-index levels so key2 becomes the outer level.
df1 = df.swaplevel('key1', 'key2')
print(df1)

# Order the rows by the (new) outermost level.
df2 = df1.sort_index(level=0)
print(df2)
# Q2 (b)
# 2x3 array of random integers in 60..100 (the upper bound 101 is exclusive).
markSheet = np.random.randint(low=60, high=101, size=(2, 3))
print(markSheet)
# Report the array's element type, shape and number of dimensions.
for label, value in (
    ("Datatype:", markSheet.dtype),
    ("Shape:", markSheet.shape),
    ("Dimension:", markSheet.ndim),
):
    print(label, value)
# Q2 (c)
itemRate = pd.DataFrame({
    'Item': ['Apples', 'Oranges'],
    'Rate': [220, 90],
})
# Double every rate.
itemRate['Rate'] = itemRate['Rate'] * 2
print(itemRate)
# Locate the row label holding the smallest rate, then show that whole row.
cheapest_label = itemRate['Rate'].idxmin()
print("Item with Min Rate:", itemRate.loc[cheapest_label])
# Q3
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# DataFrame
# Per-student study hours and marks. The dict literal is now closed with `}`;
# the missing brace made the whole file a SyntaxError.
data = {
    'Name': ['Mohan', 'Sohan', 'Jeevan', 'Gita', 'Meenu', 'Gopal', 'Rajeev'],
    'Hours_studied': [2.5, 4.0, 6.0, 8.0, 10.0, 1.0, 5.0],
    'Marks_obtained': [40, 52, 64, 70, 90, 10, 60],
}
df_Student = pd.DataFrame(data)
# 1. Students with maximum marks (a list, so ties would all be reported)
max_marks = df_Student['Marks_obtained'].max()
top_students = df_Student[df_Student['Marks_obtained'] == max_marks]['Name'].tolist()
print("Students with maximum marks:", top_students)
# 2. Average hours studied
avg_hours = df_Student['Hours_studied'].mean()
print("Average hours studied:", avg_hours)
# 3. Correlation and Covariance (2x2 matrices over the two numeric columns)
correlation = df_Student[['Hours_studied', 'Marks_obtained']].corr()
covariance = df_Student[['Hours_studied', 'Marks_obtained']].cov()
print("Correlation:\n", correlation)
print("Covariance:\n", covariance)
# 4. Heatmap
# Annotated heatmap of the correlation matrix between the two numeric columns.
pair_corr = df_Student[['Hours_studied', 'Marks_obtained']].corr()
heatmap_ax = sns.heatmap(pair_corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Heatmap: Hours Studied vs Marks Obtained')
plt.show()
# i.
# [0 1 2 3 4 5]
# ii.
# [[1 2 3]
#  [4 6 8]]
# iii.
# [[2.         1.         0.66666667]
#  [0.5        0.33333333 0.25      ]]
# iv.
# 1 [4 6 8] [[1 2 3]]
# v.
# [0]
# Q4 (a)
# Q4 (b)
# Sales records by person, quarter and country.
df = pd.DataFrame({
    'person': ['A', 'B', 'C', 'D', 'E', 'A', 'B', 'C', 'D'],
    'sales': [1000, 300, 400, 500, 800, 1000, 500, 700, 50],
    'quarter': [1, 1, 1, 1, 1, 2, 2, 2, 2],
    # 'Japna' was a typo for 'Japan'; left as-is it formed a spurious extra
    # country group and split Japan's total.
    'country': ['US', 'Japan', 'Brazil', 'UK', 'US', 'Brazil', 'Japan', 'Brazil', 'US'],
})
# NOTE: the stray `sns.boxplot(x='sales', data=data)` that was fused onto the
# closing bracket has been dropped. It was a syntax error, it pointed at the
# Q3 dict `data` (which has no 'sales' key) rather than `df`, and the box plot
# of sales is already drawn by `df.boxplot(column='sales')` further down.

# Highest and lowest single sale recorded for Brazil.
brazil_sales = df.loc[df['country'] == 'Brazil', 'sales']
max_sales = brazil_sales.max()
min_sales = brazil_sales.min()
print("Max sales in Brazil:", max_sales)
print("Min sales in Brazil:", min_sales)

# Total sales per country.
sales_by_country = df.groupby('country')['sales'].sum()
print(sales_by_country)

# Person(s) with the highest average sales. The lookup is done on the
# per-person averages; comparing raw `sales` rows against the maximum average
# only matches when some individual sale happens to equal that average.
avg_sales_by_person = df.groupby('person')['sales'].mean()
max_avg_sales = avg_sales_by_person.max()
top_persons = avg_sales_by_person[avg_sales_by_person == max_avg_sales].index.tolist()
print("Person(s) with maximum average sales:", top_persons)

# Summary statistics of the sales column.
print(df['sales'].describe())
# Box plot of the sales column, drawn through pandas' matplotlib wrapper.
boxplot = df.boxplot(column=['sales'])
plt.show()
# Q5 (a)
# Integers 0..23 in a flat array.
c1 = np.arange(24)
# c2 is a 2x12 reshape of c1; for this contiguous array it shares c1's memory,
# so the assignment below is also visible through c1.
c2 = c1.reshape(2, 12)
# Zero every column from index 3 onward in both rows.
c2[:, 3:] = 0
print(c1)
print(c2)
# Element-wise doubling produces a new array; c1 itself is unchanged by it.
print(2 * c1)
print(c2.reshape(3, 8))
# Q5 (b)
# Employee records to be round-tripped through an Excel workbook.
excel_data = pd.DataFrame({
    'Employee id': [101, 102, 103, 104, 105, 106],
    'Department': ['CS', 'CS', 'CS', 'English', 'English', 'English'],
    'Salary': [2000, 2002, 2040, 2045, 2030, 2006],
    'Age': [24, 23, 34, 39, 43, 34],
})
# Write the workbook without the row index, then read it back using
# 'Employee id' as the index.
excel_data.to_excel("data.xlsx", index=False)
df1 = pd.read_excel("data.xlsx", index_col='Employee id')

# One figure with two side-by-side axes: scatter on the left, bars on the right.
fig, axes = plt.subplots(nrows=1, ncols=2)
scatter_ax, bar_ax = axes
df1.plot(kind='scatter', x='Salary', y='Age', ax=scatter_ax, title='Salary vs Age')

# Split Salary into 3 equal-width bins and chart how many rows land in each.
df1['Salary_bins'] = pd.cut(df1['Salary'], bins=3)
bin_counts = df1['Salary_bins'].value_counts()
bin_counts.plot(kind='bar', ax=bar_ax)

fig.savefig("Employees.png")
# Q6 (a)
s1 = pd.Series([5, 0, -4, 8])
print(s1)
# Rank of each value in ascending order (smallest value gets rank 1).
print(s1.rank())
data1 = pd.DataFrame({'One': ['a', 'b'] * 2 + ['b'], 'Two': [21, 22, 21, 23, 24]})
print(data1)
# Drop rows that repeat an earlier (One, Two) pair, keeping the last occurrence.
data2 = data1.drop_duplicates(['One', 'Two'], keep='last')
print(data2)
df1 = pd.DataFrame({'A': [21, 32], 'B': [27, 30]})
df2 = pd.DataFrame({'A': [23, 41]})
# Single .loc assignment instead of chained indexing (df2['A'][1] = ...):
# the chained form writes to a temporary and leaves df2 unchanged under
# pandas Copy-on-Write, and warns on earlier 2.x releases.
df2.loc[1, 'A'] = df2.loc[1, 'A'] + 10
print(df1)
print(df2)
# Element-wise comparison of df2 against the smallest value in df1's column B.
print(df2 > df1['B'].min())
# Q6 (b)
ages = np.array([20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32])

# Fixed-edge binning: each interval is open on the left and closed on the right.
age_edges = [18, 25, 35, 60, 100]
age_labels = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
categories = pd.cut(ages, bins=age_edges, labels=age_labels)
print(categories.value_counts())

# Quantile-based binning into four groups.
quantile_bins = pd.qcut(ages, q=4)
print(quantile_bins.value_counts())
# Q7
# Column values for the twelve employee records.
gender_column = ['Male'] * 4 + ['Female'] * 5 + ['Male'] * 3
role_column = (
    ['Data Analyst'] * 3
    + ['Data Scientist'] * 3
    + ['Manager'] * 3
    + ['Data Analyst', 'Data Scientist', 'Manager']
)
empData = pd.DataFrame({
    'Gender': gender_column,
    'Role': role_column,
    'Experience': [1, 1, 3, 5, 6, 1, 2, 3, 5, 6, 10, 11],
    'Salary': [48000, 42000, 51000, 62000, 71000, 73000, 82000, 87000, 91000, 45000, 56000, 66000],
})
# (a) Show the full table.
print(empData)
# (b) Total salary for each role.
salary_by_role = empData.groupby('Role')['Salary']
print(salary_by_role.sum())
# (c) Number of female employees in each role.
female_rows = empData.loc[empData['Gender'].eq('Female')]
print(female_rows.groupby('Role').size())
# (d) Highest and lowest salary for each gender.
salary_by_gender = empData.groupby('Gender')['Salary']
print(salary_by_gender.agg(['max', 'min']))
# (e) Keep only the rows whose salary is at least the overall average.
avg_salary = empData['Salary'].mean()
empData = empData.loc[empData['Salary'].ge(avg_salary)]
print(empData)