# -*- coding: utf-8 -*-
import os
import shutil
import pyspark
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import traceback
from datetime import datetime
#*********************************************************************//
# Note: Please refer to problem statements from challenge.htm file //
# found under instructions folder for detailed requirements. We have //
# defined placeholder functions where you need to add your code that //
# may solve one or more of the problem statements. Within the //
# functions we have added DEFAULT code which when executed without //
# any modification would create empty dataframe. This approach will //
# enable you to test your intermediate code without worrying about //
# syntax failure outside of your code. To SOLVE the problem, you are //
# expected to REPLACE the code which creates dummy dataframe with //
# your ACTUAL code. //
#*********************************************************************//
# Note: We have also added code to print sample data to console in //
# each of the functions to help you visualize your intermediate data //
#*********************************************************************//
def read_data(spark):
    '''
    spark : the active SparkSession used to read the input CSV from S3
    (the schema is inferred from the file via the inferSchema option)
    '''
print("-------------------")
print("Starting read_data")
print("-------------------")
# TODO: REPLACE WITH YOUR ACTUAL BUCKET NAME
# Your bucket name should start with "car-data" followed by random digits
# Example: "car-data123456789"
bucket_name = "YOUR_BUCKET_NAME_HERE" # Replace with actual bucket name like "car-data123456789"
s3_input_path = "s3://" + bucket_name + "/inputfile/car_data.csv"
# Read CSV file from S3 with header=true
df = spark.read.format("csv") \
.option("header", "true") \
.option("inferSchema", "true") \
.load(s3_input_path)
print("Data read successfully. Sample data:")
df.show(5)
print(f"Total rows: {df.count()}")
print(f"Schema: {df.printSchema()}")
return df
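# Optional sketch (not called by main): reading with an explicit schema instead of
# relying on inferSchema. The column names and types below are assumptions based on
# the fields referenced later in result_2; adjust them to match car_data.csv.
def read_data_with_schema(spark, s3_input_path):
    assumed_schema = StructType([
        StructField("car_name", StringType(), True),
        StructField("year", IntegerType(), True),
        StructField("selling_price", DoubleType(), True),
        StructField("km_driven", DoubleType(), True),
        StructField("fuel", StringType(), True),
        StructField("seller_type", StringType(), True),
        StructField("transmission", StringType(), True),
        StructField("owner", StringType(), True),
    ])
    # header=true skips the header row; the explicit schema avoids a second pass over the file
    return spark.read.format("csv") \
        .option("header", "true") \
        .schema(assumed_schema) \
        .load(s3_input_path)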
def clean_data(input_df):
'''
    input_df : output of the read_data function
'''
print("-------------------")
print("Starting clean_data")
print("-------------------")
# Start with input dataframe
df = input_df
print(f"Original data count: {df.count()}")
# 1. Drop rows with null values in any column
df = df.dropna()
print(f"After dropping null values: {df.count()}")
# 2. Drop duplicate rows
df = df.dropDuplicates()
print(f"After dropping duplicates: {df.count()}")
print("Clean data sample:")
df.show(5)
return df
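# Optional sketch (not called by main): per-column null counts, handy for checking how
# many rows dropna() is likely to remove before actually dropping them.
def null_counts(input_df):
    return input_df.select([count(when(col(c).isNull(), c)).alias(c) for c in input_df.columns])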
def s3_load_data(data, file_name):
'''
    data : the cleaned dataframe (output of the clean_data function)
    file_name : name of the output folder created under the bucket's output/ prefix in S3
'''
# TODO: REPLACE WITH YOUR ACTUAL BUCKET NAME
# Use the same bucket name as in read_data function
bucket_name = "YOUR_BUCKET_NAME_HERE" # Replace with actual bucket name like "car-data123456789"
output_path = "s3://" + bucket_name + "/output/" + file_name
if data.count() != 0:
print("Loading the data", output_path)
# Write data to S3 as single partition CSV with header
data.coalesce(1) \
.write \
.mode("overwrite") \
.option("header", "true") \
.csv(output_path)
print(f"Data successfully saved to {output_path}")
else:
print("Empty dataframe, hence cannot save the data", output_path)
def result_1(input_df):
'''
    input_df : output of the clean_data function
'''
print("-------------------------")
print("Starting result_1")
print("-------------------------")
# Group by car_name and calculate average selling price and count
df = input_df.groupBy("car_name") \
.agg(
avg("selling_price").alias("average_selling_price"),
count("*").alias("car_count")
) \
.filter(col("car_count") > 2) \
.select("car_name", "average_selling_price", "car_count")
print("Result 1 sample data:")
df.show(10)
print(f"Total records with car_count > 2: {df.count()}")
return df
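# Optional sketch (not called by main): the same aggregation with the average rounded
# and ordered by car_count, which can make the df.show() preview easier to scan.
def result_1_preview(input_df):
    return input_df.groupBy("car_name") \
        .agg(round(avg("selling_price"), 2).alias("average_selling_price"),
             count("*").alias("car_count")) \
        .filter(col("car_count") > 2) \
        .orderBy(col("car_count").desc())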
def result_2(input_df):
'''
    input_df : output of the clean_data function
'''
print("-------------------------")
print("Starting result_2")
print("-------------------------")
# Create price_per_km column, filter and round
df = input_df.withColumn("price_per_km",
round(col("selling_price") / col("km_driven"), 2)) \
.filter(col("price_per_km") < 10) \
.select("car_name", "year", "selling_price", "km_driven",
"fuel", "seller_type", "transmission", "owner", "price_per_km")
print("Result 2 sample data:")
df.show(10)
print(f"Total records with price_per_km < 10: {df.count()}")
return df
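# Optional sketch (not called by main): a defensive variant of result_2 that drops rows
# where km_driven is zero or negative before computing the ratio, so price_per_km is
# never produced from a zero divisor (Spark would otherwise return null for those rows).
def result_2_guarded(input_df):
    return input_df.filter(col("km_driven") > 0) \
        .withColumn("price_per_km", round(col("selling_price") / col("km_driven"), 2)) \
        .filter(col("price_per_km") < 10) \
        .select("car_name", "year", "selling_price", "km_driven",
                "fuel", "seller_type", "transmission", "owner", "price_per_km")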
def rds_mysql_load_data(data, table_name):
    '''
    data : the dataframe to be written to RDS (output of result_1 or result_2)
    table_name : the MySQL table in the dev database to create/overwrite
    '''
if data.count() != 0:
print(f"Loading the data into RDS table : {table_name}...")
# TODO: UPDATE THESE RDS CONNECTION DETAILS
# Replace with your actual RDS endpoint
jdbcUrl = "jdbc:mysql://YOUR_RDS_ENDPOINT:3306/dev" # Replace YOUR_RDS_ENDPOINT with actual RDS endpoint
username = "admin" # This should match what you set during RDS creation
password = "Awsuser1" # This should match what you set during RDS creation (case-sensitive)
data.write.mode('overwrite') \
.format('jdbc') \
.option('url', jdbcUrl) \
.option('dbtable', table_name) \
.option('user', username) \
.option('password', password) \
.option("driver", "com.mysql.cj.jdbc.Driver") \
.save()
print(f"Data successfully loaded into RDS table: {table_name}")
else:
print(f"Empty dataframe, hence cannot load the data into RDS table : {table_name}...")
def main():
""" Main driver program to control the flow of execution.
Please DO NOT change anything here.
"""
spark = (SparkSession.builder.appName("Car Data analysis").getOrCreate())
spark.sparkContext.setLogLevel("ERROR")
clean_data_path = "cleaned_data"
try:
task_1 = read_data(spark)
except Exception as e:
print("Getting error in the read_data function", e)
traceback.print_exc()
try:
task_2 = clean_data(task_1)
except Exception as e:
print("Getting error in the clean_data function", e)
traceback.print_exc()
try:
task_3 = result_1(task_2)
except Exception as e:
print("Getting error in the result_1 function", e)
traceback.print_exc()
try:
task_4 = result_2(task_2)
except Exception as e:
print("Getting error in the result_2 function", e)
traceback.print_exc()
try:
s3_load_data(task_2, clean_data_path)
except Exception as e:
print("Getting error while loading clean_data", e)
traceback.print_exc()
try:
rds_mysql_load_data(task_3, "average_selling_price")
except Exception as e:
print("Getting error while loading rds data", e)
traceback.print_exc()
try:
rds_mysql_load_data(task_4, "price_per_km")
except Exception as e:
print("Getting error while loading rds data", e)
traceback.print_exc()
spark.stop()
if __name__ == "__main__":
main()