# 1/18/2021 Lab 2 - Data Preparation 1.ipynb - Colaboratory
# Lab 2 - Data Preparation 1
#1. Install Apache Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.6.tgz
!tar xf spark-2.4.7-bin-hadoop2.6.tgz
!pip install -q findspark
#2. Set environment variables
import os

# Point Spark at the JDK and at the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.7-bin-hadoop2.6",
})

#3. Initialise Spark
import findspark

# Run findspark before importing pyspark so the import below can be resolved.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session; "local[*]" is the master URL passed to the builder.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
#4. Upload file
from google.colab import files
!rm data_telepon_seluler.csv
files.upload()
#5. Load data
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
# Read the uploaded CSV: the first line supplies the column names, column
# types are inferred from the data, and fields are comma-separated.
dataset = spark.read.csv(
    'data_telepon_seluler.csv',
    inferSchema=True,
    header=True,
    sep=",",
)
dataset.printSchema()
#6. Display data
dataset.show(20)  # first 20 rows (the row count show() uses by default)
# Alternatives for peeking at the data:
#   dataset.head()    -> the first row only
#   dataset.first()   -> the first row
#   dataset.head(10)  -> the first 10 rows
#7. Check the object's type
type(dataset)
#8. Display data
# collect(): bring every selected row back as a list of Row objects
dataset.select('*').collect()
dataset.select('provinsi', '2012').collect()
# show(): print the rows as a table only
dataset.select('*').show()
dataset.select('provinsi', '2012').show()
# take(n): like collect(), but limited to the first n rows
dataset.select('*').take(5)
dataset.select('provinsi', '2012').take(5)
#9. Check a column's data type
dataset.select('provinsi')
#10. Distinct
province_2012 = dataset.select('provinsi', '2012')
province_2012.distinct().show()
#11. List the column names
dataset.columns
#12. Display data (first three columns only)
leading_columns = dataset.columns[:3]
dataset.select(leading_columns).show()
#13. Display data
dataset.show(2, truncate=True)
# Read single cells from the first row. first() fetches just that row, rather
# than collecting the whole DataFrame twice only to index element 0.
first_row = dataset.first()
X = first_row['2014']  # lookup by column name
X = first_row[3]       # lookup by position; replaces the value assigned above
#14. Display a subset of the data
selected_columns = ["provinsi", "kode_wilayah", "2012"]
# Unpack the list so each name is passed to select() as its own argument.
subset_df_2 = dataset.select(*selected_columns)
subset_df_2.head()
#15. Filtering
# Both statements only build a filtered DataFrame; no action such as show()
# is called on them here.
dataset.where("provinsi = 'DI YOGYAKARTA'")
dataset.where("provinsi in ('DI YOGYAKARTA')")
#16. Display null / non-null data
missing_2012 = dataset["2012"].isNull()
dataset.filter(missing_2012).show()
present_2012 = dataset["2012"].isNotNull()
dataset.filter(present_2012).show(999)
#17. Display the shape of the data
# Printed as a (row count, column count) tuple.
print((dataset.count(), len(dataset.columns)))
#18. Display summary statistics
overall_summary = dataset.describe()
overall_summary.show()
# Same summary restricted to the "2012" column.
summary_2012 = dataset.describe("2012")
summary_2012.show()
#19. Change column types
dataset.createOrReplaceTempView("tmpprov")
# The year columns have numeric names, so they must be quoted with backticks.
# Single quotes would make them string literals, and every row would get the
# constant 2012/2013/2014 instead of the column's value. Aliases keep the
# original column names on the cast result.
df4 = spark.sql(
    "SELECT provinsi, "
    "CAST(`2012` AS INT) AS `2012`, "
    "CAST(`2013` AS INT) AS `2013`, "
    "CAST(`2014` AS INT) AS `2014` "
    "FROM tmpprov"
)
# Print both schemas to compare the column types before and after the cast.
dataset.printSchema()
df4.printSchema()
Copy protected with Online-PDF-No-Copy.com
https://colab.research.google.com/drive/1fJ3YNoDYuQvEV8bcdd5xTkacPysJvFXb#scrollTo=TLrIqNPgcz4B&printMode=true 3/3