val x = sc.parallelize(List("spark rdd example", "sample example"))
val x = sc.parallelize(List("spark rdd example", "sample example"), 2)
x.collect()
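To check how the list was split across partitions, a quick sketch (glom gathers each partition into an array):
x.getNumPartitions   // 2 when created with the second argument above
x.glom().collect()   // e.g. Array(Array(spark rdd example), Array(sample example))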
val textFileLocalTest = sc.textFile("/Users/syedrizvi/Desktop/HadoopExamples/file.txt")
val textFile = sc.textFile("hdfs://localhost:9000/test.txt")
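A quick sanity check that the file loads (textFile is lazy, so nothing is read until an action such as count runs):
textFile.count()   // number of lines in the file
textFile.first()   // first line of the file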
FlatMap
val x = sc.parallelize(List("spark rdd example", "sample example"))
val y = x.flatMap(x => x.split(" "))
Map
val z = y.map(word => (word, 1))
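Chaining the flatMap and map above with reduceByKey (covered below) completes the classic word count; the expected output assumes the input list above:
val counts = z.reduceByKey(_ + _)
counts.collect()   // Array((spark,1), (rdd,1), (example,2), (sample,1)), order may vary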
Filter
val x = sc.parallelize(1 to 10)
Or with the number of partitions specified
val x = sc.parallelize(1 to 10, 2)
val y = x.filter(num => num%2==0)
y.collect()
Reduce
val x = sc.parallelize(1 to 10, 2)
val y = x.reduce((a, b) => (a+b))
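fold is a close relative of reduce that takes an explicit zero value; a minimal sketch on the same RDD:
val sum = x.fold(0)((a, b) => a + b)   // 55, same result as the reduce above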
Pair RDD Operations
GroupBy
val x = sc.parallelize(Array("Joseph", "Jimmy", "Tina","Thomas", "James", "Cory","Christine", "Jackeline",
"Juan"))
val y = x.groupBy(word => word.charAt(0))
y.collect();
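groupBy returns one (key, Iterable) pair per first letter; converting the values to lists makes the output easier to read (a sketch):
y.mapValues(_.toList).collect()
// e.g. Array((T,List(Tina, Thomas)), (C,List(Cory, Christine)), (J,List(Joseph, Jimmy, James, Jackeline, Juan)))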
ReduceByKey
val x = sc.parallelize(Array(("a", 1), ("b", 1), ("a", 1),("a", 1), ("b", 1),("b", 1),("b", 1), ("b", 1)))
val y = x.reduceByKey((a, b) => a + b)
y.collect()
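Since every value here is 1, countByKey gives the same totals directly as a local Map (a sketch):
x.countByKey()   // Map(a -> 3, b -> 5); the reduceByKey above yields Array((a,3), (b,5))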
SortByKey
val y = x.sortByKey()
y.collect()
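sortByKey also takes an ascending flag; a sketch for descending order:
val desc = x.sortByKey(ascending = false)
desc.collect()   // keys from b down to a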
Joins
val salesprofit = sc.parallelize(Array(("Cadbury's", 3.5), ("Nestle", 2.8), ("Mars", 2.5), ("Thorton's", 2.2)))
val salesyear = sc.parallelize(Array(("Cadbury's", 2015), ("Nestle", 2014), ("Mars", 2014), ("Thorton's", 2013)))
val join = salesprofit.join(salesyear)
join.collect()
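join keeps only keys present in both RDDs, pairing the values as (profit, year) tuples. leftOuterJoin keeps every key from the left side, wrapping the right-hand value in an Option (a sketch):
val leftJoin = salesprofit.leftOuterJoin(salesyear)
leftJoin.collect()   // e.g. Array((Mars,(2.5,Some(2014))), ...)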
Spark SQL
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val df = sqlContext.read.json("/Users/syedrizvi/Desktop/HadoopExamples/Spark/sample.json")
df.show()
df.printSchema()
df.select("name").show()
df.select(df("name"), df("age") + 1).show()
df.filter(df("age") > 21).show()
df.groupBy("age").count().show()
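DataFrames can be persisted and re-read; a minimal sketch using Parquet (the output path is illustrative):
df.write.parquet("/tmp/people.parquet")
val dfBack = sqlContext.read.parquet("/tmp/people.parquet")
dfBack.show()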
Creating Temp Views
df.createOrReplaceTempView("people")
val sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()
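Temp views are scoped to the session; a global temp view is shared across sessions and is addressed through the global_temp database (a sketch):
df.createGlobalTempView("people_global")
spark.sql("SELECT * FROM global_temp.people_global").show()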
Creating Datasets on the Fly
case class Person(name: String, age: Long)
val caseClassDS = Seq(Person("Andy", 32)).toDS()
caseClassDS.show()
val primitiveDS = Seq(1, 2, 3).toDS()
primitiveDS.map(_ + 1).collect()
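A DataFrame can also be converted to a typed Dataset with as[T]; this sketch assumes the sample.json above has name and age fields matching Person:
val peopleDS = df.as[Person]
peopleDS.show()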
Creating Schemas with Reflection
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
case class Person(name: String, age: Long)
val peopleDF = spark.sparkContext.textFile("/Users/syedrizvi/Desktop/HadoopExamples/Spark/people.txt")
  .map(_.split(","))
  .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
  .toDF()
peopleDF.createOrReplaceTempView("people")
val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
teenagersDF.map(teenager => "Name: " + teenager(0)).show()
teenagersDF.map(teenager => "Name: " + teenager.getAs[String]("name")).show()
Interacting with Hive
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
val warehouseLocation = "spark-warehouse"
val spark = SparkSession.builder()
  .appName("Spark Hive Example")
  .config("spark.sql.warehouse.dir", warehouseLocation)
  .enableHiveSupport()
  .getOrCreate()
import spark.implicits._
import spark.sql
sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
sql("LOAD DATA LOCAL INPATH '/Users/syedrizvi/Desktop/HadoopExamples/Spark/kv1.txt' INTO TABLE
src")
sql("SELECT * FROM src").show()
sql("select current_database()").show(false)
Spark Streaming
To run the bundled example
First start netcat as a simple data server:
nc -lk 9999
Then, in a separate terminal, run the example:
/usr/local/Cellar/apache-spark/2.1.0/bin/run-example streaming.NetworkWordCount localhost 9999
Your own word count
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
val ssc = new StreamingContext(sc, Seconds(1))
val lines = ssc.socketTextStream("localhost", 9999)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
wordCounts.print()
ssc.start()
ssc.awaitTermination()
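To count over a sliding window instead of per batch, reduceByKeyAndWindow can replace the plain reduceByKey; a sketch (set up before ssc.start(), with a 30-second window sliding every 10 seconds, both multiples of the 1-second batch interval):
val windowedCounts = words.map(x => (x, 1))
  .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(30), Seconds(10))
windowedCounts.print()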