diff --git a/02-HelloRDD/build.sbt b/02-HelloRDD/build.sbt
index 219b4f4..54486a2 100644
--- a/02-HelloRDD/build.sbt
+++ b/02-HelloRDD/build.sbt
@@ -1,8 +1,13 @@
-ThisBuild / version := "0.1.0-SNAPSHOT"
-
-ThisBuild / scalaVersion := "2.12.10"
-
-lazy val root = (project in file("."))
-  .settings(
-    name := "02-HelloRDD"
-  )
+// Project coordinates for the HelloRDD example.
+name := "HelloRDD"
+organization := "com.spark.learnings"
+version := "0.1.0-SNAPSHOT"
+scalaVersion := "2.12.10"
+
+// Keep sbt from adding scala-library to the dependencies automatically.
+autoScalaLibrary := false
+
+// Single place to change the Spark release used by this build.
+val sparkVersion = "3.0.0-preview2"
+
+libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion
diff --git a/02-HelloRDD/data/sample.csv b/02-HelloRDD/data/sample.csv
new file mode 100644
index 0000000..1fc06ad
--- /dev/null
+++ b/02-HelloRDD/data/sample.csv
@@ -0,0 +1,9 @@
+2014-08-27 11:29:31,37,"Female","United States","IL",NA,"No","Yes","Often","6-25","No","Yes","Yes","Not sure","No","Yes","Yes","Somewhat easy","No","No","Some of them","Yes","No","Maybe","Yes","No",NA
+2014-08-27 11:29:37,44,"M","United States","IN",NA,"No","No","Rarely","More than 1000","No","No","Don't know","No","Don't know","Don't know","Don't know","Don't know","Maybe","No","No","No","No","No","Don't know","No",NA
+2014-08-27 11:29:44,32,"Male","Canada",NA,NA,"No","No","Rarely","6-25","No","Yes","No","No","No","No","Don't know","Somewhat difficult","No","No","Yes","Yes","Yes","Yes","No","No",NA
+2014-08-27 11:29:46,31,"Male","United Kingdom",NA,NA,"Yes","Yes","Often","26-100","No","Yes","No","Yes","No","No","No","Somewhat difficult","Yes","Yes","Some of them","No","Maybe","Maybe","No","Yes",NA
+2014-08-27 11:30:22,31,"Male","United States","TX",NA,"No","No","Never","100-500","Yes","Yes","Yes","No","Don't know","Don't know","Don't know","Don't know","No","No","Some of them","Yes","Yes","Yes","Don't know","No",NA
+2014-08-27 11:31:22,33,"Male","United States","TN",NA,"Yes","No","Sometimes","6-25","No","Yes","Yes","Not sure","No","Don't know","Don't know","Don't 
know","No","No","Yes","Yes","No","Maybe","Don't know","No",NA
+2014-08-27 11:31:50,35,"Female","United States","MI",NA,"Yes","Yes","Sometimes","1-5","Yes","Yes","No","No","No","No","No","Somewhat difficult","Maybe","Maybe","Some of them","No","No","No","Don't know","No",NA
+2014-08-27 11:32:05,39,"M","Canada",NA,NA,"No","No","Never","1-5","Yes","Yes","No","Yes","No","No","Yes","Don't know","No","No","No","No","No","No","No","No",NA
+2014-08-27 11:32:39,42,"Female","United States","IL",NA,"Yes","Yes","Sometimes","100-500","No","Yes","Yes","Yes","No","No","No","Very difficult","Maybe","No","Yes","Yes","No","Maybe","No","No",NA
\ No newline at end of file
diff --git a/02-HelloRDD/src/main/scala/com/spark/scala/learning/examples/HelloRDD.scala b/02-HelloRDD/src/main/scala/com/spark/scala/learning/examples/HelloRDD.scala
new file mode 100644
index 0000000..50c7fe4
--- /dev/null
+++ b/02-HelloRDD/src/main/scala/com/spark/scala/learning/examples/HelloRDD.scala
@@ -0,0 +1,66 @@
+package com.spark.scala.learning.examples
+
+import org.apache.log4j.{Level, Logger}
+import org.apache.spark.{SparkConf, SparkContext}
+
+/**
+ * Minimal RDD walkthrough: reads a CSV file as text, keeps four columns of every
+ * line, filters the records by age and counts the remaining records per country.
+ *
+ * The path of the CSV file is expected as the first program argument.
+ */
+object HelloRDD extends Serializable {
+
+  /** Records whose age is at or above this value are dropped by the filter step. */
+  private final val AgeLimit = 40
+
+  /** The four columns kept from each input line. */
+  private final case class SurveyRecord(age: Int, gender: String, country: String, state: String)
+
+  /**
+   * Runs the example on the "local[2]" master and prints both result sets.
+   *
+   * @param args args(0) is the path of the CSV file to read
+   */
+  def main(args: Array[String]): Unit = {
+    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
+    if (args.isEmpty) {
+      System.err.println("Usage: HelloRDD <path-to-csv-file>")
+      sys.exit(1)
+    }
+
+    // Switch off the "org" and "akka" logger hierarchies.
+    Logger.getLogger("org").setLevel(Level.OFF)
+    Logger.getLogger("akka").setLevel(Level.OFF)
+
+    val sparkConf = new SparkConf().setAppName("HelloRDD").setMaster("local[2]")
+    val sparkContext = new SparkContext(sparkConf)
+    try {
+      // One RDD element per line of the input file.
+      val lineRDD = sparkContext.textFile(args(0))
+      val recordRDD = lineRDD.map(parseRecord)
+
+      val filteredRDD = recordRDD.filter(_.age < AgeLimit)
+      filteredRDD.collect().foreach(println)
+
+      // Manual group-by: pair every record with 1, then sum the ones per country.
+      val countRDD = filteredRDD.map(r => (r.country, 1)).reduceByKey(_ + _)
+      countRDD.collect().foreach(println)
+    } finally {
+      // Stop the context even when one of the steps above throws.
+      sparkContext.stop()
+    }
+  }
+
+  /**
+   * Builds a record from columns 1 to 4 (zero-based) of a comma-separated line.
+   *
+   * The line is split on every comma and each piece is trimmed; quote characters
+   * are kept as part of the values. Throws if the line has fewer than five columns
+   * or column 1 is not an integer.
+   */
+  private def parseRecord(line: String): SurveyRecord = {
+    val cols = line.split(",").map(_.trim)
+    SurveyRecord(cols(1).toInt, cols(2), cols(3), cols(4))
+  }
+}