diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index c436f0a..0000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,8 +0,0 @@ -version: 2 -updates: - - package-ecosystem: github-actions - directory: "/" - schedule: - interval: daily - time: "04:00" - open-pull-requests-limit: 10 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index e713d2d..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: CI -on: [push, pull_request] - -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4.1.1 - - name: Cache sbt - uses: coursier/cache-action@v6 - - name: Java 8 setup - uses: olafurpg/setup-scala@v14 - with: - java-version: 1.8.0 - - run: sbt test diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 9484ce8..0000000 --- a/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -.bsp -.idea -target diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/.scalafmt.conf b/.scalafmt.conf deleted file mode 100644 index dea9466..0000000 --- a/.scalafmt.conf +++ /dev/null @@ -1,29 +0,0 @@ -version = "3.5.9" -maxColumn = 100 - -binPack.literalArgumentLists = true - -continuationIndent { - callSite = 2 - defnSite = 2 -} - -newlines { - alwaysBeforeMultilineDef = false - sometimesBeforeColonInMethodReturnType = true -} - -verticalMultiline { - newlineAfterImplicitKW = true - newlineBeforeImplicitKW = true -} - -docstrings = JavaDoc - -project.git = false - -rewrite { - rules = [PreferCurlyFors, RedundantBraces, RedundantParens, SortImports] - redundantBraces.generalExpressions = false - redundantBraces.maxLines = 1 -} diff --git a/AverageScorePerItem.scala.html b/AverageScorePerItem.scala.html new file mode 100644 index 0000000..ba15ce7 --- /dev/null +++ b/AverageScorePerItem.scala.html @@ -0,0 +1,2035 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.algebird.Semigroup +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object AverageScorePerItem { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .groupBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Sum both per key with an implicit Semigroup[(Double, Long)]

+
.sum 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + .toTypedPipe + } + 
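The (sum, count) pair works because Algebird derives a Semigroup for tuples from the Semigroups of the element types, combining pairs element-wise. A minimal local sketch of that behavior, with made-up values:

import com.twitter.algebird.Semigroup

val sg = implicitly[Semigroup[(Double, Long)]]
// Two partial results: (sum of scores, number of scores)
val combined = sg.plus((7.0, 2L), (3.0, 1L)) // (10.0, 3L)
// Final average
val avg = combined._1 / combined._2 // ~3.33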
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue + input + .groupBy(_.user
+
+ +
+

Map values into Double

+
.mapValues(_.score
+
+ +
+

Aggregate average per key

+
.aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .keyBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Sum both per key with an implicit Semigroup[(Double, Long)]

+
.sumByKey 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + } + 
+
+ +
+
+

Spark

+

Summon an Algebird Semigroup[(Double, Long)] via an implicit argument

+
def spark(input: RDD[Rating])(implicit sg: Semigroup[(Double, Long)]): RDD[(String, Double)] = { + input + .keyBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Reduce both per key with plus = (T, T) => T where T is (Double, Long)

+
.reduceByKey(sg.plus) // plus: (T, T) => T where T is (Double, Long) 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.AveragedValue + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .mapValues(_.score
+
+ +
+

Map values into Double

+
.algebird 
+
+ +
+

Aggregate average per key

+
.aggregateByKey(AveragedValue.aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/Count.scala.html b/Count.scala.html new file mode 100644 index 0000000..f726ff1 --- /dev/null +++ b/Count.scala.html @@ -0,0 +1,1989 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object Count { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { + import com.twitter.algebird.Aggregator.size + input + .aggregate(size) + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Long] = + input.count
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { + import com.twitter.algebird.Aggregator.size + input + .aggregate(size) + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): Long = { + input 
+
+ +
+

count is an action and collects data back to the driver node

+
.count + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Long = { + import com.twitter.algebird.Aggregator.size + import com.twitter.algebird.spark._ + input.algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(size) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/CountDistinctItems.scala.html b/CountDistinctItems.scala.html new file mode 100644 index 0000000..b796e5f --- /dev/null +++ b/CountDistinctItems.scala.html @@ -0,0 +1,2003 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.google.common.base.Charsets +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object CountDistinctItems { + 
+
+ +
+
+

Scalding Exact Approach

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .map(_.item
+
+ +
+

Remove duplicates, requires a shuffle

+
.distinct + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+
+

Scalding Approximate Approach

+
def scaldingApproxWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Double] = { + import com.twitter.algebird.HyperLogLogAggregator + val aggregator = HyperLogLogAggregator.sizeAggregator(bits = 12) + input 
+
+ +
+

HyperLogLog expects bytes as input

+
.map(_.item.getBytes(Charsets.UTF_8)) 
+
+ +
+

Aggregate globally into a Double

+
.aggregate(aggregator) + .toTypedPipe + } + 
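The bits parameter trades memory for accuracy: HyperLogLog's standard error is roughly 1.04 / sqrt(2^bits), so bits = 12 (4096 registers) gives about 1.6% error. A quick sanity check of that estimate in plain Scala:

// Approximate standard error for a given register-count exponent
def hllError(bits: Int): Double = 1.04 / math.sqrt(1 << bits)

hllError(12) // ~0.016, i.e. about 1.6%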
+
+ +
+
+

Scio Exact Approach

+
def scio(input: SCollection[Rating]): SCollection[Long] = { + input + .map(_.item) + .distinct + .count + } + 
+
+ +
+
+

Scio Approximate Approach

+
def scioApprox(input: SCollection[Rating]): SCollection[Long] = { + input + .map(_.item) + .countApproxDistinct() + } + 
+
+ +
+
+

Spark Exact Approach

+
def spark(input: RDD[Rating]): Long = { + input + .map(_.item) + .distinct() + .count() + } + 
+
+ +
+
+

Spark Approximate Approach

+
def sparkApprox(input: RDD[Rating]): Long = { + input + .map(_.item) + .countApproxDistinct() + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/CountUsers.scala.html b/CountUsers.scala.html new file mode 100644 index 0000000..2f18be6 --- /dev/null +++ b/CountUsers.scala.html @@ -0,0 +1,1997 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object CountUsers { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .filter(_.user == "Smith") + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { + import com.twitter.algebird.Aggregator.count + input 
+
+ +
+

Aggregate globally into a single Long

+
.aggregate(count(_.user == "Smith")) + .toTypedPipe + } + + def scio(input: SCollection[Rating]): SCollection[Long] = { + input + .filter(_.user == "Smith") + .count + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { + import com.twitter.algebird.Aggregator.count + input 
+
+ +
+

Aggregate globally into a single Long

+
.aggregate(count((_: Rating).user == "Smith")) + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): Long = { + input + .filter(_.user == "Smith"
+
+ +
+

count is an action and collects data back to the driver node

+
.count() + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Long = { + import com.twitter.algebird.Aggregator.count + import com.twitter.algebird.spark._ + input.algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(count(_.user == "Smith")) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/DistinctItems.scala.html b/DistinctItems.scala.html new file mode 100644 index 0000000..ca2a999 --- /dev/null +++ b/DistinctItems.scala.html @@ -0,0 +1,1945 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object DistinctItems { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[String] = { + input + .map(_.item) + .distinct + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[String] = { + input + .map(_.item) + .distinct + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): RDD[String] = { + input + .map(_.item) + .distinct() + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/FieldStatistics.scala.html b/FieldStatistics.scala.html new file mode 100644 index 0000000..a28b29b --- /dev/null +++ b/FieldStatistics.scala.html @@ -0,0 +1,2027 @@ + + + + + + + +
+

Input is a collection of case classes

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.coders.Coder +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object FieldStatistics { + + case class User(age: Int, income: Double, score: Double) + case class Stats(max: Double, min: Double, mean: Double, stddev: Double) + case class UserStats(age: Stats, income: Stats, score: Stats) + + import com.twitter.algebird._ + implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments] + 
+
+ +
+
+

Algebird Aggregator

+
def aggregator = { 
+
+ +
+

Create 3 Aggregators on the age field with different logic

+
 
+
+ +
+

The first 2 are of type Aggregator[User, _, Int], which means they take User as input and +generate Int as output. The last one is of type Aggregator[User, _, Moments], +where Moments includes count, mean, standard deviation, etc. The input User is prepared +with a User => Int function, _.age.

+
val maxAgeOp = Aggregator.max[Int].composePrepare[User](_.age) + val minAgeOp = Aggregator.min[Int].composePrepare[User](_.age) + val momentsAgeOp = Moments.aggregator.composePrepare[User](_.age) + 
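composePrepare simply pre-applies a function to each input element before aggregation, turning an Aggregator[Int, _, Int] into an Aggregator[User, _, Int]. A minimal standalone sketch (sample users made up):

val maxAge = Aggregator.max[Int].composePrepare[User](_.age)
maxAge(Seq(User(25, 50000.0, 0.8), User(35, 70000.0, 0.6))) // 35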
+
+ +
+

Create 3 Aggregators on the income field with different logic

+
val maxIncomeOp = Aggregator.max[Double].composePrepare[User](_.income) + val minIncomeOp = Aggregator.min[Double].composePrepare[User](_.income) + val momentsIncomeOp = Moments.aggregator.composePrepare[User](_.income) + 
+
+ +
+

Create 3 Aggregators on the score field with different logic

+
val maxScoreOp = Aggregator.max[Double].composePrepare[User](_.score) + val minScoreOp = Aggregator.min[Double].composePrepare[User](_.score) + val momentsScoreOp = Moments.aggregator.composePrepare[User](_.score) + 
+
+ +
+

Apply the 9 Aggregators on the same input, present the result Tuple9 as UserStats.

+
MultiAggregator( + maxAgeOp, + minAgeOp, + momentsAgeOp, + maxIncomeOp, + minIncomeOp, + momentsIncomeOp, + maxScoreOp, + minScoreOp, + momentsScoreOp + ).andThenPresent { t => + val (maxAge, minAge, mAge, maxIncome, minIncome, mIncome, maxScore, minScore, mScore) = t + UserStats( + age = Stats(maxAge, minAge, mAge.mean, mAge.stddev), + income = Stats(maxIncome, minIncome, mIncome.mean, mIncome.stddev), + score = Stats(maxScore, minScore, mScore.mean, mScore.stddev) + ) + } + } + 
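An Aggregator is also a function from a collection of inputs to a single output, so the composite above can be sanity-checked on local data before running a pipeline. A sketch with made-up users:

val users = Seq(
  User(age = 25, income = 50000.0, score = 0.8),
  User(age = 35, income = 70000.0, score = 0.6)
)
// Apply the Aggregator[User, _, UserStats] directly to a local collection
val stats: UserStats = aggregator(users)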
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[User]): TypedPipe[UserStats] = + input.aggregate(aggregator) + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[User]): SCollection[UserStats] = + input.aggregate(aggregator) + 
+
+ +
+
+

Spark

+
def spark(input: RDD[User]): UserStats = { 
+
+ +
+

Compute each field separately, potentially inefficient if the input is not cached

+
val s1 = input.map(_.age).stats() + val s2 = input.map(_.income).stats() + val s3 = input.map(_.score).stats() + UserStats( + age = Stats(s1.max, s1.min, s1.mean, s1.stdev), + income = Stats(s2.max, s2.min, s2.mean, s2.stdev), + score = Stats(s3.max, s3.min, s3.mean, s3.stdev) + ) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkAlgebird(input: RDD[User]): UserStats = { + import com.twitter.algebird.spark._ + input.algebird.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/InvertedIndex.scala.html b/InvertedIndex.scala.html new file mode 100644 index 0000000..d5318ea --- /dev/null +++ b/InvertedIndex.scala.html @@ -0,0 +1,1980 @@ + + + + + + + +
+

Build an inverted index from a corpus of text documents

+
 
+
+ +
+

Input is a collection of (id, text)

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object InvertedIndex { + + case class Document(id: Int, text: String) + case class Posting(word: String, ids: Seq[Int]) + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Document]): TypedPipe[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group and convert document IDs per key to List[Int]

+
.group + .toList + .map(Posting.tupled) + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Document]): SCollection[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group document IDs per key into Iterable[Int]

+
.groupByKey + .map(kv => Posting(kv._1, kv._2.toSeq)) + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Document]): RDD[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group document IDs per key into Iterable[Int]

+
.groupByKey() + .map(kv => Posting(kv._1, kv._2.toSeq)) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/JoinLogAndMetadata.scala.html b/JoinLogAndMetadata.scala.html new file mode 100644 index 0000000..2ab9877 --- /dev/null +++ b/JoinLogAndMetadata.scala.html @@ -0,0 +1,2132 @@ + + + + + + + +
+

Compute the average age of users who listened to a track by joining log events with user metadata.

+

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.{LogEvent, UserMeta} +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object JoinLogAndMetadata { + 
+
+ +
+
+

Scalding Naive Approach

+
def scaldingNaive( + left: TypedPipe[LogEvent], + right: TypedPipe[UserMeta] + ): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue + left + .groupBy(_.user
+
+ +
+

Join as (user, (LogEvent, UserMeta))

+
.join(right.groupBy(_.user)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Map into (track, age)

+
.map { case (logEvent, userMeta) => + (logEvent.track, userMeta.age.toDouble) + } + .group 
+
+ +
+

Aggregate average age per track

+
.aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+
+ +
+
+

Scalding with Hash Join

+

hashJoin replicates the smaller RHS to all mappers on the LHS

+
def scaldingHashJoin( + left: TypedPipe[LogEvent], + right: TypedPipe[UserMeta] + ): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue
+
+ +
+

Map out fields to avoid shuffling large objects

+
val lhs = left.map(e => (e.user, e.track)) 
+
+ +
+

Force to disk to avoid repeating the same computation on each mapper on the LHS

+
val rhs = right.map(u => (u.user, u.age.toDouble)).forceToDisk + + lhs + .hashJoin(rhs) + .values + .group + .aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+
+ +
+
+

Scio Naive Approach

+
def scioNaive( + left: SCollection[LogEvent], + right: SCollection[UserMeta] + ): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 
+
+ +
+

Join as (user, (track, age))

+
lhs + .join(rhs
+
+ +
+

Drop user key to make track the new key in (track, age)

+
.values 
+
+ +
+

Aggregate average age per track

+
.aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+
+

Scio with Side Input

+

Side input makes RHS available on all workers

+
def scioSideInput( + left: SCollection[LogEvent], + right: SCollection[UserMeta] + ): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue
+
+ +
+

Convert RHS to a side input of Map[String, Double]

+
val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput
+
+ +
+

Replicate RHS to each worker

+
left + .withSideInputs(rhs
+
+ +
+

Access side input via the context

+
.map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } 
+
+ +
+

Convert back to regular SCollection

+
.toSCollection + .aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+
+

Scio with Hash Join

+

hashJoin is a shortcut for the side input approach

+
def scioHashJoin( + left: SCollection[LogEvent], + right: SCollection[UserMeta] + ): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) + lhs + .hashJoin(rhs) + .values + .aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+
+

Spark Naive Approach

+
def sparkNaive(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 
+
+ +
+

Join as (user, (track, age))

+
lhs + .join(rhs
+
+ +
+

Drop user key to make track the new key in (track, age)

+
.values + .algebird 
+
+ +
+

Aggregate average age per track

+
.aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+
+

Spark with Broadcast Variable

+
def sparkBroadcast(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + import com.twitter.algebird.AveragedValue
+
+ +
+

Retrieve SparkContext for creating broadcast variable

+
val sc = left.context
+
+ +
+

Collect RHS to driver memory and broadcast back to workers

+
val map = right.map(u => (u.user, u.age.toDouble)).collectAsMap() + val b = sc.broadcast(map) + + left 
+
+ +
+

In-memory lookup on each worker

+
.map(e => (e.track, b.value.getOrElse(e.user, 0.0))) + .algebird + .aggregateByKey(AveragedValue.aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/JoinLogs.scala.html b/JoinLogs.scala.html new file mode 100644 index 0000000..0c801c4 --- /dev/null +++ b/JoinLogs.scala.html @@ -0,0 +1,2035 @@ + + + + + + + +
+

Given two log datasets of play-track and save-track events, compute the tracks that a user saved +after playing them within a session.

+
 
+
+ +
+

Inputs are collections of (user, item, timestamp).

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.LogEvent +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object JoinLogs { + + val gapDuration = 3600000
+
+ +
+

Detect whether a pair of (event type, LogEvent) tuples matches a play-then-save sequence

+
def detectPlaySaveSequence(pair: Seq[(String, LogEvent)]): Option[String] = { + val Seq(first, second) = pair + if ( + first._1 == "play" && second._1 == "save" && first._2.track == second._2.track && + second._2.timestamp - first._2.timestamp <= gapDuration + ) { + Some(first._2.track) + } else { + None + } + } + 
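A quick local check of the matcher on hand-built, illustrative events, assuming the record shape LogEvent(user, track, timestamp):

val play = LogEvent("alice", "track1", 1000L)
val save = LogEvent("alice", "track1", 2000L)
detectPlaySaveSequence(Seq(("play", play), ("save", save))) // Some("track1")
detectPlaySaveSequence(Seq(("save", save), ("play", play))) // None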
+
+ +
+
+

Scalding

+
def scalding( + playEvents: TypedPipe[LogEvent], + saveEvents: TypedPipe[LogEvent] + ): TypedPipe[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))).group + val saves = saveEvents.map(e => (e.user, ("save", e))).group + + plays + .cogroup(saves) { (user, p, s) => 
+
+ +
+

Iterables of play and save events for the user

+
(p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio( + playEvents: SCollection[LogEvent], + saveEvents: SCollection[LogEvent] + ): SCollection[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) + + plays + .cogroup(saves
+
+ +
+

Iterables of play and save events for the user

+
.flatMapValues { case (p, s) => + (p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + } + 
+
+ +
+
+

Spark

+
def spark(playEvents: RDD[LogEvent], saveEvents: RDD[LogEvent]): RDD[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) + + plays + .cogroup(saves) + .flatMapValues { case (p, s) => 
+
+ +
+

Iterables of play and save events for the user

+
(p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index d645695..0000000 --- a/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. 
Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/MaxItemPerUser.scala.html b/MaxItemPerUser.scala.html new file mode 100644 index 0000000..253db7e --- /dev/null +++ b/MaxItemPerUser.scala.html @@ -0,0 +1,2041 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object MaxItemPerUser { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Reduce items per key by picking the one with the higher score from each pair of inputs

+
.reduce((x, y) => if (x.score > y.score) x else y) + .values + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + input + .groupBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score

+
.aggregate(maxBy(_.score)) + .values + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute the top item per key as an Iterable[Rating]

+
.topByKey(1)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + input + .keyBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(maxBy { x: Rating => x.score }) + .values + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input + .keyBy(_.user
+
+ +
+

Reduce items per key by picking the one with the higher score from each pair of inputs

+
.reduceByKey((x, y) => if (x.score > y.score) x else y) + .values + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(maxBy { x: Rating => x.score }) + .values + } + 
+
+ +
+
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue

+
.topByKey(1)(Ordering.by(_.score)) + .flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/MinItemPerUser.scala.html b/MinItemPerUser.scala.html new file mode 100644 index 0000000..48d6a59 --- /dev/null +++ b/MinItemPerUser.scala.html @@ -0,0 +1,2041 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object MinItemPerUser { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Reduce items per key by picking the one with the lower score from each pair of inputs

+
.reduce((x, y) => if (x.score < y.score) x else y) + .values + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { + import com.twitter.algebird.Aggregator.minBy + input + .groupBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score

+
.aggregate(minBy(_.score)) + .values + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute the top item per key as an Iterable[Rating] with a reverse comparator

+
.topByKey(1)(Ordering.by(-_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { + import com.twitter.algebird.Aggregator.minBy + input + .keyBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(minBy { x: Rating => x.score }) + .values + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input + .keyBy(_.user
+
+ +
+

Reduce items per key by picking the one with the lower score from each pair of inputs

+
.reduceByKey((x, y) => if (x.score < y.score) x else y) + .values + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.minBy + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(minBy { x: Rating => x.score }) + .values + } + 
+
+ +
+
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue and a reverse comparator

+
.topByKey(1)(Ordering.by(-_.score)) + .flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/NOTICE b/NOTICE deleted file mode 100644 index a86b457..0000000 --- a/NOTICE +++ /dev/null @@ -1,2 +0,0 @@ -Big Data Rosetta Code -Copyright 2016 Spotify AB diff --git a/README.md b/README.md deleted file mode 100644 index 856a380..0000000 --- a/README.md +++ /dev/null @@ -1,29 +0,0 @@ -big-data-rosetta-code -===================== - -[![Build Status](https://img.shields.io/github/actions/workflow/status/spotify/big-data-rosetta-code/.github/workflows/ci.yml)](https://github.com/spotify/big-data-rosetta-code/actions?query=workflow%3ACI) -[![GitHub license](https://img.shields.io/github/license/spotify/big-data-rosetta-code.svg)](./LICENSE) - -Code snippets for solving common big data problems on various platforms. Inspired by [Rosetta Code](http://rosettacode.org/). - -For examples rended side by side with comments see: - -http://spotify.github.io/big-data-rosetta-code/ - -Currently the following are covered: - -- [Scalding](https://github.com/twitter/scalding) -- [Scio](https://github.com/spotify/scio) -- [Spark](https://github.com/apache/spark) - -# Topics - -- [src/main/scala/com/spotify/bdrc/scala](./src/main/scala/com/spotify/bdrc/scala) Scala tricks for data processing -- [src/main/scala/com/spotify/bdrc/pipeline](./src/main/scala/com/spotify/bdrc/pipeline) Data pipeline snippets -- [src/test/scala/com/spotify/bdrc/testing](./src/test/scala/com/spotify/bdrc/testing) Examples for pipeline testing - -# License - -Copyright 2016 Spotify AB. - -Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 diff --git a/Sessions.scala.html b/Sessions.scala.html new file mode 100644 index 0000000..58fde72 --- /dev/null +++ b/Sessions.scala.html @@ -0,0 +1,2048 @@ + + + + + + + +
+

Input is a collection of log events

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.LogEvent +import com.spotify.scio.extra.Iterators._ +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD +import org.joda.time.Instant + +import scala.collection.mutable + +object Sessions { + + val gapDuration = 3600000 + + case class Session(user: String, duration: Long, numItems: Int) + 
+
+ +
+

Wrapper for Iterator[LogEvent] that groups items into sessions

+
class SessionIterator(self: Iterator[LogEvent]) extends Iterator[Seq[LogEvent]] { 
+
+ +
+

BufferedIterator allows peeking ahead

+
private val bi = self.buffered + override def hasNext: Boolean = bi.hasNext + override def next(): Seq[LogEvent] = { + val buf = mutable.Buffer(bi.next()) + var last = buf.head.timestamp
+
+ +
+

Consume subsequent events until a gap is detected

+
while (bi.hasNext && bi.head.timestamp - last < gapDuration) { + val n = bi.next() + buf.append(n) + last = n.timestamp + } + buf + } + } + 
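A local sketch of the iterator, assuming the record shape LogEvent(user, track, timestamp) and events already sorted by timestamp; the third event arrives more than gapDuration after the second, so it starts a new session:

val events = Seq(
  LogEvent("alice", "track1", 0L),
  LogEvent("alice", "track2", 1000L),
  LogEvent("alice", "track3", gapDuration + 2000L)
)
new SessionIterator(events.iterator).toList
// List(Seq(<track1 event>, <track2 event>), Seq(<track3 event>))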
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[LogEvent]): TypedPipe[Session] = { + input + .groupBy(_.user
+
+ +
+

sortBy uses Hadoop secondary sort to sort values within each key during the shuffle

+
.sortBy(_.timestamp
+
+ +
+

Iterate over values lazily and group items into sessions

+
.mapValueStream(new SessionIterator(_)) + .toTypedPipe 
+
+ +
+

Map over each (user, session items)

+
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[LogEvent]): SCollection[Session] = { + input 
+
+ +
+

Values in groupBy are sorted by timestamp

+
.timestampBy(e => new Instant(e.timestamp)) 
+
+ +
+

No secondary sort in Scio, shuffle all items

+
.groupBy(_.user) + .flatMapValues { + _.iterator 
+
+ +
+

Generic version of SessionIterator from scio-extra

+
.timeSeries(_.timestamp) + .session(gapDuration) + } 
+
+ +
+

Map over each (user, session items)

+
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[LogEvent]): RDD[Session] = { + input 
+
+ +
+

No secondary sort in Spark, shuffle all items

+
.groupBy(_.user) + .flatMapValues { + _ 
+
+ +
+

Order of values after shuffle is not guaranteed

+
.toList + .sortBy(_.timestamp) + .iterator 
+
+ +
+

Generic version of SessionIterator from scio-extra

+
.timeSeries(_.timestamp) + .session(gapDuration) + } 
+
+ +
+

Map over each (user, session items)

+
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/Statistics.scala.html b/Statistics.scala.html new file mode 100644 index 0000000..5ce9cd9 --- /dev/null +++ b/Statistics.scala.html @@ -0,0 +1,2000 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.coders.Coder +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object Statistics { + + case class Stats(max: Double, min: Double, sum: Double, count: Long, mean: Double, stddev: Double) + + import com.twitter.algebird._ + implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments] + 
+
+ +
+
+

Algebird Aggregator

+
def aggregator = { 
+
+ +
+

Create 4 Aggregators with different logic

+
 
+
+ +
+

The first 3 are of type Aggregator[Rating, _, Double], which means they take Rating as +input and generate Double as output. The last one is of type +Aggregator[Rating, _, Moments], where Moments includes count, mean, standard deviation, +etc. The input Rating is prepared with a Rating => Double function, _.score.

+
val maxOp = Aggregator.max[Double].composePrepare[Rating](_.score) + val minOp = Aggregator.min[Double].composePrepare[Rating](_.score) + val sumOp = Aggregator.prepareMonoid[Rating, Double](_.score) + val momentsOp = Moments.aggregator.composePrepare[Rating](_.score) + 
+
+ +
+

Apply the 4 Aggregators on the same input, present the result Tuple4 of +(Double, Double, Double, Moments) as Stats

+
MultiAggregator(maxOp, minOp, sumOp, momentsOp) + .andThenPresent { case (max, min, sum, moments) => + Stats(max, min, sum, moments.count, moments.mean, moments.stddev) + } + } + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Stats] = + input.aggregate(aggregator) + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Stats] = { + input + .map(_.score) + .stats + .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioAlgebird(input: SCollection[Rating]): SCollection[Stats] = + input.aggregate(aggregator) + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): Stats = { + val s = input.map(_.score).stats() + Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkAlgebird(input: RDD[Rating]): Stats = { + import com.twitter.algebird.spark._ + input.algebird.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/SumPerItem.scala.html b/SumPerItem.scala.html new file mode 100644 index 0000000..07ba283 --- /dev/null +++ b/SumPerItem.scala.html @@ -0,0 +1,2005 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object SumPerItem { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .groupBy(_.item) + .mapValues(_.score
+
+ +
+

Sum per key with an implicit Semigroup[Double]

+
.sum + .toTypedPipe + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.Aggregator.prepareMonoid + input + .groupBy(_.item
+
+ +
+

Aggregate per key with an aggregator that converts Rating to Double via +_.score before the reduce

+
.aggregate(prepareMonoid(_.score)) + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .sumByKey + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): RDD[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .reduceByKey(_ + _) + } + 
+
+ +
+
+

Spark with Algebird Semigroup

+
def sparkWithAlgebird1(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + input + .map(x => (x.item, x.score)) + .algebird 
+
+ +
+

Sum per key with an implicit Semigroup[Double]

+
.sumByKey + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird2(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.Aggregator.prepareMonoid + import com.twitter.algebird.spark._ + input + .keyBy(_.item) + .algebird 
+
+ +
+

Aggregate per key with an aggregator that converts Rating to Double via +_.score before the reduce. Explicit type due to a type inference limitation.

+
.aggregateByKey(prepareMonoid { x: Rating => x.score }) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/TopItems.scala.html b/TopItems.scala.html new file mode 100644 index 0000000..131e142 --- /dev/null +++ b/TopItems.scala.html @@ -0,0 +1,2075 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object TopItems { + + val topK = 100
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .group 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sum 
+
+ +
+

Group all elements with a single key Unit

+
.groupAll 
+
+ +
+

Take top K with a priority queue

+
.sortedReverseTake(topK)(Ordering.by(_._2)) 
+
+ +
+

Drop Unit key

+
.values 
+
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) + .group 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sum + .toTypedPipe 
+
+ +
+

Aggregate globally into a single Seq[(String, Double)]

+
.aggregate(aggregator
+
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
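The same aggregator works on local collections, which makes the top-K logic easy to unit test. A sketch with made-up pairs:

import com.twitter.algebird.Aggregator.sortedReverseTake

val top2 = sortedReverseTake[(String, Double)](2)(Ordering.by(_._2))
top2(Seq(("a", 1.0), ("b", 3.0), ("c", 2.0)))
// Seq(("b", 3.0), ("c", 2.0)), the two highest-scoring pairs in descending order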
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sumByKey 
+
+ +
+

Compute top K as an Iterable[(String, Double)]

+
.top(topK)(Ordering.by(_._2)) 
+
+ +
+

Flatten result Iterable[(String, Double)]

+
.flatten + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sumByKey 
+
+ +
+

Aggregate globally into a single Seq[(String, Double)]

+
.aggregate(aggregator
+
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): Seq[(String, Double)] = { + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with addition

+
.reduceByKey(_ + _) 
+
+ +
+

top is an action and collects data back to the driver node

+
.top(topK)(Ordering.by(_._2)) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Seq[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + import com.twitter.algebird.spark._ + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with addition

+
.reduceByKey(_ + _) + .algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/TopItemsPerUser.scala.html b/TopItemsPerUser.scala.html new file mode 100644 index 0000000..85e2c5b --- /dev/null +++ b/TopItemsPerUser.scala.html @@ -0,0 +1,2033 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object TopItemsPerUser { + + val topK = 100
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Take top K per group with a priority queue

+
.sortedReverseTake(topK)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Seq[Rating]

+
.flatten + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute top K per key

+
.topByKey(topK)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+
+

Spark Naive Approach

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input 
+
+ +
+

groupBy shuffles all data, inefficient

+
.groupBy(_.user
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Convert grouped values to a List[Rating] and sort on a single node, inefficient

+
.flatMap(_.toList.sortBy(-_.score).take(topK)) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + import com.twitter.algebird.spark._ + val aggregator = sortedReverseTake[Rating](topK)(Ordering.by(_.score)) + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a Seq[Rating]

+
.aggregateByKey(aggregator
+
+ +
+

Flatten result Seq[Rating]

+
.flatMap(_._2) + } + 
+
+ +
+
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue

+
.topByKey(topK)(Ordering.by(_.score)) 
+
+ +
+

Flatten result Seq[Rating]

+
.flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/WordCount.scala.html b/WordCount.scala.html new file mode 100644 index 0000000..c1ab905 --- /dev/null +++ b/WordCount.scala.html @@ -0,0 +1,1980 @@ + + + + + + + +
+

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object WordCount { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[String]): TypedPipe[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

groupBy is lazy

+
.groupBy(identity
+
+ +
+

Operations like size after groupBy can be lifted into the map phase

+
.size + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[String]): SCollection[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) + .countByValue + } + 
+
+ +
+
+

Spark Transformation

+
def sparkTransformation(input: RDD[String]): RDD[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

There is no countByValue transformation in Spark, although it is equivalent to mapping +each element to an initial count of 1 and reducing with addition

+
.map((_, 1L)) 
+
+ +
+

reduceByKey can lift the function into the map phase

+
.reduceByKey(_ + _) + } + 
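The equivalence is easy to see on plain Scala collections: map each word to a count of 1, then sum the counts per key. A local sketch:

Seq("a", "b", "a")
  .map((_, 1L))
  .groupBy(_._1)
  .map { case (w, counts) => (w, counts.map(_._2).sum) }
// Map(a -> 2, b -> 1)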
+
+ +
+
+

Spark Action

+
def sparkAction(input: RDD[String]): Seq[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

countByValue is an action and collects data back to the driver node

+
.countByValue() + .toSeq + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/build.sbt b/build.sbt deleted file mode 100644 index 16095cc..0000000 --- a/build.sbt +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -import com.github.sbt.git.SbtGit.GitKeys.gitRemoteRepo -import _root_.io.regadas.sbt.SbtSoccoKeys._ - -organization := "com.spotify" -name := "big-data-rosetta-code" -version := "0.1.0-SNAPSHOT" - -val scioVersion = "0.13.5" -val scaldingVersion = "0.17.4" -val sparkVersion = "3.5.0" -val algebirdVersion = "0.13.10" -val scalacheckVersion = "1.17.0" -val scalameterVersion = "0.19" -val scalatestVersion = "3.2.17" -val scalatestPlusVersion = "3.2.17.0" - -scalaVersion := "2.12.18" -scalacOptions ++= Seq( - "-target:jvm-1.8", - "-deprecation", - "-feature", - "-unchecked", - "-language:higherKinds" -) -javacOptions ++= Seq("-source", "1.8", "-target", "1.8") - -resolvers += "Cascading libraries" at "https://conjars.wensel.net/repo/" -libraryDependencies ++= Seq( - "com.spotify" %% "scio-core" % scioVersion, - "com.spotify" %% "scio-extra" % scioVersion, - "com.spotify" %% "scio-test" % scioVersion % "test", - "com.twitter" %% "scalding-core" % scaldingVersion, - "com.twitter" %% "algebird-spark" % algebirdVersion, - "org.apache.spark" %% "spark-core" % sparkVersion, - "org.apache.spark" %% "spark-mllib" % sparkVersion, - "org.scalatest" %% "scalatest" % scalatestVersion % "test", - "org.scalatestplus" %% "scalacheck-1-17" % scalatestPlusVersion % "test", - "org.scalacheck" %% "scalacheck" % scalacheckVersion % "test", - "com.storm-enroute" %% "scalameter" % scalameterVersion % "test" -) - -val scalaMeterFramework = new TestFramework( - "org.scalameter.ScalaMeterFramework" -) -testFrameworks += scalaMeterFramework -testOptions += Tests.Argument(scalaMeterFramework, "-silent") -Test / parallelExecution := false -logBuffered := false - -soccoOnCompile := true -soccoPackage := List( - "com.spotify.scio:http://spotify.github.io/scio/api", - "com.twitter.algebird:http://twitter.github.io/algebird/api", - "com.twitter.scalding:http://twitter.github.io/scalding/api", - "org.apache.spark:http://spark.apache.org/docs/latest/api/scala" -) -addCompilerPlugin(("io.regadas" %% "socco-ng" % "0.1.4").cross(CrossVersion.full)) -makeSite := makeSite.dependsOn(Compile / compile).value -gitRemoteRepo := "git@github.com:spotify/big-data-rosetta-code.git" - -enablePlugins(SbtSoccoPlugin) -enablePlugins(GhpagesPlugin) diff --git a/catalog-info.yaml b/catalog-info.yaml deleted file mode 100644 index 37286dd..0000000 --- a/catalog-info.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: backstage.io/v1alpha1 -kind: Resource -metadata: - name: big-data-rosetta-code -spec: - type: resource - owner: flatmap diff --git a/index.html b/index.html new file mode 100644 index 0000000..94e6dde --- /dev/null +++ b/index.html @@ -0,0 +1,38 @@ + + + + +Codestin Search App + + + +
+ + + + diff --git a/make-site.sh b/make-site.sh deleted file mode 100755 index 771514b..0000000 --- a/make-site.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -sbt makeSite ghpagesPushSite diff --git a/project/build.properties b/project/build.properties deleted file mode 100644 index e8a1e24..0000000 --- a/project/build.properties +++ /dev/null @@ -1 +0,0 @@ -sbt.version=1.9.7 diff --git a/project/plugins.sbt b/project/plugins.sbt deleted file mode 100644 index 572c523..0000000 --- a/project/plugins.sbt +++ /dev/null @@ -1,3 +0,0 @@ -addSbtPlugin("com.github.sbt" % "sbt-ghpages" % "0.8.0") -addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2") -addSbtPlugin("io.regadas" % "sbt-socco" % "0.1.5") diff --git a/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala b/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala deleted file mode 100644 index b0d8336..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Compute Average Score per Item -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.algebird.Semigroup -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object AverageScorePerItem { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - input - .groupBy(_.user) - // Map into (sum, count) - .mapValues(x => (x.score, 1L)) - // Sum both per key with an implicit `Semigroup[(Double, Long)]` - .sum - // Map (sum, count) into average - .mapValues(p => p._1 / p._2) - .toTypedPipe - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - import com.twitter.algebird.AveragedValue - input - .groupBy(_.user) - // Map values into `Double` - .mapValues(_.score) - // Aggregate average per key - .aggregate(AveragedValue.aggregator) - .toTypedPipe - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { - input - .keyBy(_.user) - // Map into (sum, count) - .mapValues(x => (x.score, 1L)) - // Sum both per key with an implicit `Semigroup[(Double, Long)]` - .sumByKey - // Map (sum, count) into average - .mapValues(p => p._1 / p._2) - } - - // ## Spark - // Summon an Algebird `Semigroup[(Double, Long)]` with implicit argument - def spark(input: RDD[Rating])(implicit sg: Semigroup[(Double, Long)]): RDD[(String, Double)] = { - input - .keyBy(_.user) - // Map into (sum, count) - .mapValues(x => (x.score, 1L)) - // Reduce both per key with `plus = (T, T) => T` where `T` is `(Double, Long)` - .reduceByKey(sg.plus) // plus: (T, T) => T where T is (Double, Long) - // Map (sum, count) into average - .mapValues(p => p._1 / p._2) - } - - // ## Spark with Algebird `Aggregator` - def 
sparkWithAlgebird(input: RDD[Rating]): RDD[(String, Double)] = { - import com.twitter.algebird.AveragedValue - import com.twitter.algebird.spark._ - input - .keyBy(_.user) - // Map values into `Double` - .mapValues(_.score) - .algebird - // Aggregate average per key - .aggregateByKey(AveragedValue.aggregator) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/BloomFilterSetDifference.scala b/src/main/scala/com/spotify/bdrc/pipeline/BloomFilterSetDifference.scala deleted file mode 100644 index 1c269f3..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/BloomFilterSetDifference.scala +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2017 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.algebird._ -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -/** - * Filter LHS by removing items that exist in the RHS using a Bloom Filter. - * - * Inputs are collections of strings. - */ -object BloomFilterSetDifference { - - def scalding(lhs: TypedPipe[String], rhs: TypedPipe[String]): TypedPipe[String] = { - val width = BloomFilter.optimalWidth(1000, 0.01).get - val numHashes = BloomFilter.optimalNumHashes(1000, width) - lhs - .cross(rhs.aggregate(BloomFilterAggregator(numHashes, width))) - // Keep items that are not in the RHS Bloom filter - .filter { case (s, bf) => !bf.contains(s).isTrue } - .keys - } - - def scio(lhs: SCollection[String], rhs: SCollection[String]): SCollection[String] = { - val width = BloomFilter.optimalWidth(1000, 0.01).get - val numHashes = BloomFilter.optimalNumHashes(1000, width) - lhs - .cross(rhs.aggregate(BloomFilterAggregator[String](numHashes, width))) - // Keep items that are not in the RHS Bloom filter - .filter { case (s, bf) => !bf.contains(s).isTrue } - .keys - } - - def spark(lhs: RDD[String], rhs: RDD[String]): RDD[String] = { - import com.twitter.algebird.spark._ - val width = BloomFilter.optimalWidth(1000, 0.01).get - val numHashes = BloomFilter.optimalNumHashes(1000, width) - val bf = rhs.algebird.aggregate(BloomFilterAggregator(numHashes, width)) - // Keep items that are not in the RHS Bloom filter - lhs.filter(s => !bf.contains(s).isTrue) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/Count.scala b/src/main/scala/com/spotify/bdrc/pipeline/Count.scala deleted file mode 100644 index 1203a3c..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/Count.scala +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Count Number of Items -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object Count { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { - input - .map(_ => 1L) - // Sum with an implicit `Semigroup[Long]` - .sum - .toTypedPipe - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { - import com.twitter.algebird.Aggregator.size - input - .aggregate(size) - .toTypedPipe - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Long] = - input.count - - // ## Scio with Algebird `Aggregator` - def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { - import com.twitter.algebird.Aggregator.size - input - .aggregate(size) - } - - // ## Spark - def spark(input: RDD[Rating]): Long = { - input - // `count` is an action and collects data back to the driver node - .count - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird(input: RDD[Rating]): Long = { - import com.twitter.algebird.Aggregator.size - import com.twitter.algebird.spark._ - input.algebird - // `aggregate` is an action and collects data back to the driver node - .aggregate(size) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala b/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala deleted file mode 100644 index c4e9e6e..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Count Number of Distinct Items -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.google.common.base.Charsets -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object CountDistinctItems { - - // ## Scalding Exact Approach - def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { - input - .map(_.item) - // Remove duplicates, requires a shuffle - .distinct - .map(_ => 1L) - // Sum with an implicit `Semigroup[Long]` - .sum - .toTypedPipe - } - - // ## Scalding Approximate Approach - def scaldingApproxWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Double] = { - import com.twitter.algebird.HyperLogLogAggregator - val aggregator = HyperLogLogAggregator.sizeAggregator(bits = 12) - input - // `HyperLogLog` expects bytes input - .map(_.item.getBytes(Charsets.UTF_8)) - // Aggregate globally into a `Double` - .aggregate(aggregator) - .toTypedPipe - } - - // ## Scio Exact Approach - def scio(input: SCollection[Rating]): SCollection[Long] = { - input - .map(_.item) - .distinct - .count - } - - // ## Scio Approximate Approach - def scioApprox(input: SCollection[Rating]): SCollection[Long] = { - input - .map(_.item) - .countApproxDistinct() - } - - // ## Spark Exact Approach - def spark(input: RDD[Rating]): Long = { - input - .map(_.item) - .distinct() - .count() - } - - // ## Spark Approximate Approach - def sparkApprox(input: RDD[Rating]): Long = { - input - .map(_.item) - .countApproxDistinct() - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala b/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala deleted file mode 100644 index e7a1458..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Count the Number of Items of a Given User -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object CountUsers { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { - input - .filter(_.user == "Smith") - .map(_ => 1L) - // Sum with an implicit `Semigroup[Long]` - .sum - .toTypedPipe - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { - import com.twitter.algebird.Aggregator.count - input - // Aggregate globally into a single `Long` - .aggregate(count(_.user == "Smith")) - .toTypedPipe - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Long] = { - input - .filter(_.user == "Smith") - .count - } - - // ## Scio with Algebird `Aggregator` - def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { - import com.twitter.algebird.Aggregator.count - input - // Aggregate globally into a single `Long` - .aggregate(count((_: Rating).user == "Smith")) - } - - // ## Spark - def spark(input: RDD[Rating]): Long = { - input - .filter(_.user == "Smith") - // `count` is an action and collects data back to the driver node - .count() - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird(input: RDD[Rating]): Long = { - import com.twitter.algebird.Aggregator.count - import com.twitter.algebird.spark._ - input.algebird - // `aggregate` is an action and collects data back to the driver node - .aggregate(count(_.user == "Smith")) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala b/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala deleted file mode 100644 index 295fe4b..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute Collection of Distinct Items -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object DistinctItems { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[String] = { - input - .map(_.item) - .distinct - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[String] = { - input - .map(_.item) - .distinct - } - - // ## Spark - def spark(input: RDD[Rating]): RDD[String] = { - input - .map(_.item) - .distinct() - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala b/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala deleted file mode 100644 index 5a3a08e..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Compute Basic Descriptive Statistics for Each Field -// Input is a collection of case classes -package com.spotify.bdrc.pipeline - -import com.spotify.scio.coders.Coder -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object FieldStatistics { - - case class User(age: Int, income: Double, score: Double) - case class Stats(max: Double, min: Double, mean: Double, stddev: Double) - case class UserStats(age: Stats, income: Stats, score: Stats) - - import com.twitter.algebird._ - implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments] - - // ## Algebird `Aggregator` - def aggregator = { - // Create 3 `Aggregator`s on `age` field with different logic - - // The first 2 are of type `Aggregator[User, _, Int]` which means it takes `User` as input and - // generates `Int` as output. The last one is of type `Aggregator[User, _, Moments]`, - // where `Moments` include count, mean, standard deviation, etc. The input `User` is prepared - // with a `User => Int` function `_.age`. - val maxAgeOp = Aggregator.max[Int].composePrepare[User](_.age) - val minAgeOp = Aggregator.min[Int].composePrepare[User](_.age) - val momentsAgeOp = Moments.aggregator.composePrepare[User](_.age) - - // Create 3 `Aggregator`s on `income` field with different logic - val maxIncomeOp = Aggregator.max[Double].composePrepare[User](_.income) - val minIncomeOp = Aggregator.min[Double].composePrepare[User](_.income) - val momentsIncomeOp = Moments.aggregator.composePrepare[User](_.income) - - // Create 3 `Aggregator`s on `score` field with different logic - val maxScoreOp = Aggregator.max[Double].composePrepare[User](_.score) - val minScoreOp = Aggregator.min[Double].composePrepare[User](_.score) - val momentsScoreOp = Moments.aggregator.composePrepare[User](_.score) - - // Apply 9 `Aggregator`s on the same input, present result tuple 9 as `UserStats`. 
- MultiAggregator( - maxAgeOp, - minAgeOp, - momentsAgeOp, - maxIncomeOp, - minIncomeOp, - momentsIncomeOp, - maxScoreOp, - minScoreOp, - momentsScoreOp - ).andThenPresent { t => - val (maxAge, minAge, mAge, maxIncome, minIncome, mIncome, maxScore, minScore, mScore) = t - UserStats( - age = Stats(maxAge, minAge, mAge.mean, mAge.stddev), - income = Stats(maxIncome, minIncome, mIncome.mean, mIncome.stddev), - score = Stats(maxScore, minScore, mScore.mean, mScore.stddev) - ) - } - } - - // ## Scalding - def scalding(input: TypedPipe[User]): TypedPipe[UserStats] = - input.aggregate(aggregator) - - // ## Scio - def scio(input: SCollection[User]): SCollection[UserStats] = - input.aggregate(aggregator) - - // ## Spark - def spark(input: RDD[User]): UserStats = { - // Compute each field separately, potentially inefficient if input is not cached - val s1 = input.map(_.age).stats() - val s2 = input.map(_.income).stats() - val s3 = input.map(_.score).stats() - UserStats( - age = Stats(s1.max, s1.min, s1.mean, s1.stdev), - income = Stats(s2.max, s2.min, s2.mean, s2.stdev), - score = Stats(s3.max, s3.min, s3.mean, s3.stdev) - ) - } - - // ## Spark with Algebird `Aggregator` - def sparkAlgebird(input: RDD[User]): UserStats = { - import com.twitter.algebird.spark._ - input.algebird.aggregate(aggregator) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/FindMedian.scala b/src/main/scala/com/spotify/bdrc/pipeline/FindMedian.scala deleted file mode 100644 index e8f2b74..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/FindMedian.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -/** - * Compute the median of a collection of numbers. - */ -object FindMedian { - - // Computing the exact median is very expensive as it requires sorting and counting elements. - // QTree is a compact data structure for approximate quantile and range queries. 
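FindMedian leaves a "TODO: exact version" at the end of the file. A hedged sketch of what that exact approach could look like in Spark, shown here ahead of the QTree-based approximate implementations that follow; the helper name `exactMedian` is illustrative and it assumes a non-empty input:

  import org.apache.spark.rdd.RDD

  // Exact median via a total sort plus rank lookups. The full shuffle and the
  // extra lookup jobs are exactly the cost the comment above warns about.
  def exactMedian(input: RDD[Long]): Double = {
    val n = input.count() // assumes n > 0
    val byRank = input.sortBy(identity).zipWithIndex().map(_.swap) // (rank, value)
    if (n % 2 == 1) byRank.lookup(n / 2).head.toDouble
    else (byRank.lookup(n / 2 - 1).head + byRank.lookup(n / 2).head) / 2.0
  }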
- - def scalding(input: TypedPipe[Long]): TypedPipe[(Double, Double)] = { - import com.twitter.algebird._ - input - .aggregate(QTreeAggregator[Long](0.5)) - .map(i => (i.lower.lower, i.upper.upper)) - } - - def scio(input: SCollection[Long]): SCollection[(Double, Double)] = { - import com.twitter.algebird._ - input - .aggregate(QTreeAggregator[Long](0.5)) - .map(i => (i.lower.lower, i.upper.upper)) - } - - def spark(input: RDD[Long]): (Double, Double) = { - import com.twitter.algebird._ - import com.twitter.algebird.spark._ - val i = input.algebird.aggregate(QTreeAggregator[Long](0.5)) - (i.lower.lower, i.upper.upper) - } - - // TODO: exact version - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala b/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala deleted file mode 100644 index 55f9edd..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Build Inverted Index -// Build inverted index from a corpus of text documents - -// Input is a collection of (id, text) -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object InvertedIndex { - - case class Document(id: Int, text: String) - case class Posting(word: String, ids: Seq[Int]) - - // ## Scalding - def scalding(input: TypedPipe[Document]): TypedPipe[Posting] = { - input - // Split text and output (word, document ID) - .flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) - // Group and convert document IDs per key to `List[Int]` - .group - .toList - .map(Posting.tupled) - } - - // ## Scio - def scio(input: SCollection[Document]): SCollection[Posting] = { - input - // Split text and output (word, document ID) - .flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) - // Group document IDs per key into `Iterable[Int]` - .groupByKey - .map(kv => Posting(kv._1, kv._2.toSeq)) - } - - // ## Spark - def spark(input: RDD[Document]): RDD[Posting] = { - input - // Split text and output (word, document ID) - .flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) - // Group document IDs per key into `Iterable[Int]` - .groupByKey() - .map(kv => Posting(kv._1, kv._2.toSeq)) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala b/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala deleted file mode 100644 index 0ac595b..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Join Log and Metadata Datasets -// Compute average age of users who listened to a track by joining log event and user metadata. -// -// - LHS input is a large collection of (user, track, timestamp). -// - RHS input is a small collection of (user, age). -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.{LogEvent, UserMeta} -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object JoinLogAndMetadata { - - // ## Scalding Naive Approach - def scaldingNaive( - left: TypedPipe[LogEvent], - right: TypedPipe[UserMeta] - ): TypedPipe[(String, Double)] = { - import com.twitter.algebird.AveragedValue - left - .groupBy(_.user) - // Join as (user, (LogEvent, UserMeta)) - .join(right.groupBy(_.user)) - // Drop user key - .values - // Map into (track, age) - .map { case (logEvent, userMeta) => - (logEvent.track, userMeta.age.toDouble) - } - .group - // Aggregate average age per track - .aggregate(AveragedValue.aggregator) - .toTypedPipe - } - - // ## Scalding with Hash Join - // `hashJoin` replicates the smaller RHS to all mappers on the LHS - def scaldingHashJoin( - left: TypedPipe[LogEvent], - right: TypedPipe[UserMeta] - ): TypedPipe[(String, Double)] = { - import com.twitter.algebird.AveragedValue - - // Map out fields to avoid shuffling large objects - val lhs = left.map(e => (e.user, e.track)) - // Force to disk to avoid repeating the same computation on each mapper on the LHS - val rhs = right.map(u => (u.user, u.age.toDouble)).forceToDisk - - lhs - .hashJoin(rhs) - .values - .group - .aggregate(AveragedValue.aggregator) - .toTypedPipe - } - - // ## Scio Naive Approach - def scioNaive( - left: SCollection[LogEvent], - right: SCollection[UserMeta] - ): SCollection[(String, Double)] = { - import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) - // Join as (user, (track, age)) - lhs - .join(rhs) - // Drop user key to make track the new key in (track, age) - .values - // Aggregate average age per track - .aggregateByKey(AveragedValue.aggregator) - } - - // ## Scio with Side Input - // Side input makes RHS available on all workers - def scioSideInput( - left: SCollection[LogEvent], - right: SCollection[UserMeta] - ): SCollection[(String, Double)] = { - import com.twitter.algebird.AveragedValue - - // Convert RHS to a side input of `Map[String, Double]` - val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput - - // Replicate RHS to each worker - left - .withSideInputs(rhs) - // Access side input via the context - .map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } - // Convert back to regular SCollection - .toSCollection - .aggregateByKey(AveragedValue.aggregator) - } - - // ## Scio with Hash Join - // `hashJoin` is a shortcut to the side input approach - def scioHashJoin( - left: SCollection[LogEvent], - right: SCollection[UserMeta] - ): SCollection[(String, Double)] = { - import com.twitter.algebird.AveragedValue - val lhs = left.map(e => 
(e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) - lhs - .hashJoin(rhs) - .values - .aggregateByKey(AveragedValue.aggregator) - } - - // ## Spark Naive Approach - def sparkNaive(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { - import com.twitter.algebird.spark._ - import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) - // Join as (user, (track, age)) - lhs - .join(rhs) - // Drop user key to make track the new key in (track, age) - .values - .algebird - // Aggregate average age per track - .aggregateByKey(AveragedValue.aggregator) - } - - // ## Spark with Broadcast Variable - def sparkBroadcast(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { - import com.twitter.algebird.spark._ - import com.twitter.algebird.AveragedValue - - // Retrieve `SparkContext` for creating broadcast variable - val sc = left.context - - // Collect RHS to driver memory and broadcast back to workers - val map = right.map(u => (u.user, u.age.toDouble)).collectAsMap() - val b = sc.broadcast(map) - - left - // In-memory lookup on each worker - .map(e => (e.track, b.value.getOrElse(e.user, 0.0))) - .algebird - .aggregateByKey(AveragedValue.aggregator) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala b/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala deleted file mode 100644 index 9273a4d..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Join Two Log Datasets and Compute Action Sequence -// Given two log datasets of play track and save track events, compute tracks that a user saved -// after playing in a session. - -// Inputs are collections of (user, item, timestamp). 
-package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.LogEvent -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object JoinLogs { - - val gapDuration = 3600000 - - // Detect if a pair of (event type, LogEvent) tuples matches a play-then-save sequence - def detectPlaySaveSequence(pair: Seq[(String, LogEvent)]): Option[String] = pair match { - // `sliding(2)` emits a single-element window when a user has only one event, so match on arity - case Seq(first, second) - if first._1 == "play" && second._1 == "save" && first._2.track == second._2.track && - second._2.timestamp - first._2.timestamp <= gapDuration => - Some(first._2.track) - case _ => None - } - - // ## Scalding - def scalding( - playEvents: TypedPipe[LogEvent], - saveEvents: TypedPipe[LogEvent] - ): TypedPipe[(String, String)] = { - // Map inputs to key-values and add event type information - val plays = playEvents.map(e => (e.user, ("play", e))).group - val saves = saveEvents.map(e => (e.user, ("save", e))).group - - plays - .cogroup(saves) { (user, p, s) => - // `Iterable`s of play and save events for the user - (p ++ s).toList - .sortBy(_._2.timestamp) - // Neighboring pairs - .sliding(2) - .flatMap(detectPlaySaveSequence) - } - .toTypedPipe - } - - // ## Scio - def scio( - playEvents: SCollection[LogEvent], - saveEvents: SCollection[LogEvent] - ): SCollection[(String, String)] = { - // Map inputs to key-values and add event type information - val plays = playEvents.map(e => (e.user, ("play", e))) - val saves = saveEvents.map(e => (e.user, ("save", e))) - - plays - .cogroup(saves) - // `Iterable`s of play and save events for the user - .flatMapValues { case (p, s) => - (p ++ s).toList - .sortBy(_._2.timestamp) - // Neighboring pairs - .sliding(2) - .flatMap(detectPlaySaveSequence) - } - } - - // ## Spark - def spark(playEvents: RDD[LogEvent], saveEvents: RDD[LogEvent]): RDD[(String, String)] = { - // Map inputs to key-values and add event type information - val plays = playEvents.map(e => (e.user, ("play", e))) - val saves = saveEvents.map(e => (e.user, ("save", e))) - - plays - .cogroup(saves) - .flatMapValues { case (p, s) => - // `Iterable`s of play and save events for the user - (p ++ s).toList - .sortBy(_._2.timestamp) - // Neighboring pairs - .sliding(2) - .flatMap(detectPlaySaveSequence) - } - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala b/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala deleted file mode 100644 index a3baaab..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute One Item with Max Score per User -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object MaxItemPerUser { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { - input - .groupBy(_.user) - // Reduce items per key by picking the side with higher score for each pair of input - .reduce((x, y) => if (x.score > y.score) x else y) - .values - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { - import com.twitter.algebird.Aggregator.maxBy - input - .groupBy(_.user) - // Aggregate per key into a single `Rating` based on `Double` value via `_.score` - .aggregate(maxBy(_.score)) - .values - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Rating] = { - input - .keyBy(_.user) - // Compute top one item per key as an `Iterable[Rating]` - .topByKey(1)(Ordering.by(_.score)) - // Drop user key - .values - // Flatten result `Iterable[Rating]` - .flatten - } - - // ## Scio with Algebird `Aggregator` - def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { - import com.twitter.algebird.Aggregator.maxBy - input - .keyBy(_.user) - // Aggregate per key into a single `Rating` based on `Double` value via `_.score`. Explicit - // type due to type inference limitation. - .aggregateByKey(maxBy { x: Rating => x.score }) - .values - } - - // ## Spark - def spark(input: RDD[Rating]): RDD[Rating] = { - input - .keyBy(_.user) - // Reduce items per key by picking the side with higher score for each pair of input - .reduceByKey((x, y) => if (x.score > y.score) x else y) - .values - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { - import com.twitter.algebird.Aggregator.maxBy - import com.twitter.algebird.spark._ - input - .keyBy(_.user) - .algebird - // Aggregate per key into a single `Rating` based on `Double` value via `_.score`. Explicit - // type due to type inference limitation. - .aggregateByKey(maxBy { x: Rating => x.score }) - .values - } - - // ## Spark with MLLib - def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { - import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ - input - .keyBy(_.user) - // From `spark-mllib`, compute top K per key with a priority queue - .topByKey(1)(Ordering.by(_.score)) - .flatMap(_._2) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala b/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala deleted file mode 100644 index c2f323a..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute One Item with Min Score per User -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object MinItemPerUser { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { - input - .groupBy(_.user) - // Reduce items per key by picking the side with lower score for each pair of input - .reduce((x, y) => if (x.score < y.score) x else y) - .values - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { - import com.twitter.algebird.Aggregator.minBy - input - .groupBy(_.user) - // Aggregate per key into a single `Rating` based on `Double` value via `_.score` - .aggregate(minBy(_.score)) - .values - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Rating] = { - input - .keyBy(_.user) - // Compute top one item per key as an `Iterable[Rating]` with a reverse comparator - .topByKey(1)(Ordering.by(-_.score)) - // Drop user key - .values - // Flatten result `Iterable[Rating]` - .flatten - } - - // ## Scio with Algebird `Aggregator` - def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { - import com.twitter.algebird.Aggregator.minBy - input - .keyBy(_.user) - // Aggregate per key into a single `Rating` based on `Double` value via `_.score`. Explicit - // type due to type inference limitation. - .aggregateByKey(minBy { x: Rating => x.score }) - .values - } - - // ## Spark - def spark(input: RDD[Rating]): RDD[Rating] = { - input - .keyBy(_.user) - // Reduce items per key by picking the side with lower score for each pair of input - .reduceByKey((x, y) => if (x.score < y.score) x else y) - .values - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { - import com.twitter.algebird.Aggregator.minBy - import com.twitter.algebird.spark._ - input - .keyBy(_.user) - .algebird - // Aggregate per key into a single `Rating` based on `Double` value via `_.score`. Explicit - // type due to type inference limitation. - .aggregateByKey(minBy { x: Rating => x.score }) - .values - } - - // ## Spark with MLLib - def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { - import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ - input - .keyBy(_.user) - // From `spark-mllib`, compute top K per key with a priority queue and a reverse comparator - .topByKey(1)(Ordering.by(-_.score)) - .flatMap(_._2) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/PageRank.scala b/src/main/scala/com/spotify/bdrc/pipeline/PageRank.scala deleted file mode 100644 index d675460..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/PageRank.scala +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -/** - * Classic PageRank. - * - * Input is a collection of (source URL, destination URL). - */ -object PageRank { - - val iterations = 10 - val dampingFactor = 0.85 - - def scalding(input: TypedPipe[(String, String)]): TypedPipe[(String, Double)] = { - val links = input.group.toList // (src URL, list of dst URL) - var ranks = input.keys.distinct.map((_, 1.0)) // (src URL, 1.0) - - for (i <- 1 to iterations) { - val contribs = links - .join(ranks) - .toTypedPipe - .values - // re-distribute rank of src URL among collection of dst URLs - .flatMap { case (urls, rank) => - val size = urls.size - urls.map((_, rank / size)) - } - ranks = contribs.group.sum - .mapValues((1 - dampingFactor) + dampingFactor * _) - .toTypedPipe - } - - ranks - } - - def scio(input: SCollection[(String, String)]): SCollection[(String, Double)] = { - val links = input.groupByKey - var ranks = links.mapValues(_ => 1.0) - - for (i <- 1 to iterations) { - val contribs = links - .join(ranks) - .values - .flatMap { case (urls, rank) => - val size = urls.size - urls.map((_, rank / size)) - } - ranks = contribs.sumByKey - .mapValues((1 - dampingFactor) + dampingFactor * _) - } - - ranks - } - - def spark(input: RDD[(String, String)]): RDD[(String, Double)] = { - val links = input - .groupByKey() // (src URL, iterable of dst URL) - .cache() // links is reused in every iteration - var ranks = links.mapValues(_ => 1.0) // (src URL, 1.0) - - for (i <- 1 to iterations) { - val contribs = links - .join(ranks) - .values - // re-distribute rank of src URL among collection of dst URLs - .flatMap { case (urls, rank) => - val size = urls.size - urls.map((_, rank / size)) - } - ranks = contribs - .reduceByKey(_ + _) - .mapValues((1 - dampingFactor) + dampingFactor * _) - } - - ranks - } -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala b/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala deleted file mode 100644 index 4d89beb..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute Session Duration and Number of Items from Log Data -// Input is a collection of log events -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.LogEvent -import com.spotify.scio.extra.Iterators._ -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD -import org.joda.time.Instant - -import scala.collection.mutable - -object Sessions { - - val gapDuration = 3600000 - - case class Session(user: String, duration: Long, numItems: Int) - - // Wrapper for `Iterator[LogEvent]` that groups items into sessions - class SessionIterator(self: Iterator[LogEvent]) extends Iterator[Seq[LogEvent]] { - // `BufferedIterator` allows peek ahead - private val bi = self.buffered - override def hasNext: Boolean = bi.hasNext - override def next(): Seq[LogEvent] = { - val buf = mutable.Buffer(bi.next()) - var last = buf.head.timestamp - - // Consume subsequent events until a gap is detected - while (bi.hasNext && bi.head.timestamp - last < gapDuration) { - val n = bi.next() - buf.append(n) - last = n.timestamp - } - buf - } - } - - // ## Scalding - def scalding(input: TypedPipe[LogEvent]): TypedPipe[Session] = { - input - .groupBy(_.user) - // `sortBy` uses Hadoop secondary sort to sort keys during shuffle - .sortBy(_.timestamp) - // Iterate over values lazily and group items into sessions - .mapValueStream(new SessionIterator(_)) - .toTypedPipe - // Map over each (user, session items) - .map { case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) - } - } - - // ## Scio - def scio(input: SCollection[LogEvent]): SCollection[Session] = { - input - // Values in `groupBy` are sorted by timestamp - .timestampBy(e => new Instant(e.timestamp)) - // No secondary sort in Scio, shuffle all items - .groupBy(_.user) - .flatMapValues { - _.iterator - // Generic version of `SessionIterator` from `scio-extra` - .timeSeries(_.timestamp) - .session(gapDuration) - } - // Map over each (user, session items) - .map { case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) - } - } - - // ## Spark - def spark(input: RDD[LogEvent]): RDD[Session] = { - input - // No secondary sort in Spark, shuffle all items - .groupBy(_.user) - .flatMapValues { - _ - // Order of values after shuffle is not guaranteed - .toList - .sortBy(_.timestamp) - .iterator - // Generic version of `SessionIterator` from `scio-extra` - .timeSeries(_.timestamp) - .session(gapDuration) - } - // Map over each (user, session items) - .map { case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) - } - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala b/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala deleted file mode 100644 index 8fa15ff..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Compute Basic Descriptive Statistics -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.coders.Coder -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object Statistics { - - case class Stats(max: Double, min: Double, sum: Double, count: Long, mean: Double, stddev: Double) - - import com.twitter.algebird._ - implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments] - - // ## Algebird `Aggregator` - def aggregator = { - // Create 4 `Aggregator`s with different logic - - // The first 3 are of type `Aggregator[Rating, _, Double]` which means it takes `Rating` as - // input and generates `Double` as output. The last one is of type - // `Aggregator[Rating, _, Moments]`, where `Moments` include count, mean, standard deviation, - // etc. The input `Rating` is prepared with a `Rating => Double` function `_.score`. - val maxOp = Aggregator.max[Double].composePrepare[Rating](_.score) - val minOp = Aggregator.min[Double].composePrepare[Rating](_.score) - val sumOp = Aggregator.prepareMonoid[Rating, Double](_.score) - val momentsOp = Moments.aggregator.composePrepare[Rating](_.score) - - // Apply 4 `Aggregator`s on the same input, present result tuple 4 of - // `(Double, Double, Double, Moments)` as `Stats` - MultiAggregator(maxOp, minOp, sumOp, momentsOp) - .andThenPresent { case (max, min, sum, moments) => - Stats(max, min, sum, moments.count, moments.mean, moments.stddev) - } - } - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Stats] = - input.aggregate(aggregator) - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Stats] = { - input - .map(_.score) - .stats - .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) - } - - // ## Scio with Algebird `Aggregator` - def scioAlgebird(input: SCollection[Rating]): SCollection[Stats] = - input.aggregate(aggregator) - - // ## Spark - def spark(input: RDD[Rating]): Stats = { - val s = input.map(_.score).stats() - Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev) - } - - // ## Spark with Algebird `Aggregator` - def sparkAlgebird(input: RDD[Rating]): Stats = { - import com.twitter.algebird.spark._ - input.algebird.aggregate(aggregator) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala b/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala deleted file mode 100644 index 14bf627..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute the Sum of Scores per Item -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object SumPerItem { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - input - .groupBy(_.item) - .mapValues(_.score) - // Sum per key with an implicit `Semigroup[Double]` - .sum - .toTypedPipe - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - import com.twitter.algebird.Aggregator.prepareMonoid - input - .groupBy(_.item) - // Aggregate per key with an aggregator that converts `Rating` to `Double` via - // `_.score` before reduce - .aggregate(prepareMonoid(_.score)) - .toTypedPipe - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { - input - .map(x => (x.item, x.score)) - .sumByKey - } - - // ## Spark - def spark(input: RDD[Rating]): RDD[(String, Double)] = { - input - .map(x => (x.item, x.score)) - .reduceByKey(_ + _) - } - - // ## Spark with Algebird `Semigroup` - def sparkWithAlgebird1(input: RDD[Rating]): RDD[(String, Double)] = { - import com.twitter.algebird.spark._ - input - .map(x => (x.item, x.score)) - .algebird - // Sum per key with an implicit `Semigroup[Double]` - .sumByKey - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird2(input: RDD[Rating]): RDD[(String, Double)] = { - import com.twitter.algebird.Aggregator.prepareMonoid - import com.twitter.algebird.spark._ - input - .keyBy(_.item) - .algebird - // Aggregate per key with an aggregator that converts `Rating` to `Double` via - // `_.score` before reduce. Explicit type due to type inference limitation. - .aggregateByKey(prepareMonoid { x: Rating => x.score }) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/TfIdf.scala b/src/main/scala/com/spotify/bdrc/pipeline/TfIdf.scala deleted file mode 100644 index 86e2c06..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/TfIdf.scala +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -/** - * Compute TF-IDF for a set of documents. - * - * Input is a Seq of (doc, text). 
- */ -object TfIdf { - - case class Score(term: String, doc: String, score: Double) - - def scalding(input: Seq[(String, TypedPipe[String])]): TypedPipe[Score] = { - val numDocs = input.size - - val docToTerms = input - .map { case (doc, pipe) => - pipe - .flatMap(_.split("\\W+").filter(_.nonEmpty)) - .map(t => (doc, t.toLowerCase)) - } - // union input collections - .reduce(_ ++ _) // (d, t) - - val docToTermAndFreq = docToTerms - .groupBy(identity) - .size - .toTypedPipe - .map { case ((d, t), tf) => (d, (t, tf)) } - - val termToDfN = docToTerms.distinct.values - .groupBy(identity) - .size // (t, df) - .mapValues(_.toDouble / numDocs) // (t, df/N) - - docToTerms.keys - .groupBy(identity) - .size // (d, |d|) - .join(docToTermAndFreq) - .toTypedPipe - .map { case (d, (dLen, (t, tf))) => (t, (d, tf.toDouble / dLen)) } // (t, (d, tf/|d|)) - .join(termToDfN) - .toTypedPipe - .map { case (t, ((d, tfd), dfN)) => Score(t, d, tfd * math.log(1 / dfN)) } - } - - def scio(input: Seq[(String, SCollection[String])]): SCollection[Score] = { - val numDocs = input.size - - val docToTerms = input - .map { case (doc, pipe) => - pipe - .flatMap(_.split("\\W+").filter(_.nonEmpty)) - .map(t => (doc, t.toLowerCase)) - } - // union input collections - .reduce(_ ++ _) // (d, t) - - val docToTermAndCFreq = docToTerms - // equivalent to .countByValue, written out as map + reduceByKey - .map((_, 1L)) - .reduceByKey(_ + _) - .map { case ((d, t), tf) => (d, (t, tf)) } - - val termToDfN = docToTerms.distinct.values - // equivalent to .countByValue, written out as map + reduceByKey - .map((_, 1L)) - .reduceByKey(_ + _) // (t, df) - .mapValues(_.toDouble / numDocs) // (t, df/N) - - docToTerms.keys - // equivalent to .countByValue, written out as map + reduceByKey - .map((_, 1L)) - .reduceByKey(_ + _) // (d, |d|) - .join(docToTermAndCFreq) - .map { case (d, (dLen, (t, tf))) => (t, (d, tf.toDouble / dLen)) } // (t, (d, tf/|d|)) - .join(termToDfN) - .map { case (t, ((d, tfd), dfN)) => Score(t, d, tfd * math.log(1 / dfN)) } - } - - /** Spark implementation using transformations to keep computation distributed. */ - def sparkTransformations(input: Seq[(String, RDD[String])]): RDD[Score] = { - val numDocs = input.size - - val docToTerms = input - .map { case (doc, pipe) => - pipe - .flatMap(_.split("\\W+").filter(_.nonEmpty)) - .map(t => (doc, t.toLowerCase)) - } - // union input collections - .reduce(_ ++ _) // (d, t) - .cache() // docToTerms is reused 3 times - - val docToTermAndCFreq = docToTerms - // equivalent to .countByValue but returns RDD instead of Map - .map((_, 1L)) - .reduceByKey(_ + _) - .map { case ((d, t), tf) => (d, (t, tf)) } - - val termToDfN = docToTerms - .distinct() - .values - // equivalent to .countByValue but returns RDD instead of Map - .map((_, 1L)) - .reduceByKey(_ + _) // (t, df) - .mapValues(_.toDouble / numDocs) // (t, df/N) - - docToTerms.keys - // equivalent to .countByValue but returns RDD instead of Map - .map((_, 1L)) - .reduceByKey(_ + _) // (d, |d|) - .join(docToTermAndCFreq) - .map { case (d, (dLen, (t, tf))) => (t, (d, tf.toDouble / dLen)) } // (t, (d, tf/|d|)) - .join(termToDfN) - .map { case (t, ((d, tfd), dfN)) => Score(t, d, tfd * math.log(1 / dfN)) } - } - - /** Spark implementation using actions to compute some steps on the driver node. 
*/ - def sparkActions(input: Seq[(String, RDD[String])]): Seq[Score] = { - val numDocs = input.size - - val docToTerms = input - .map { case (doc, pipe) => - pipe - .flatMap(_.split("\\W+").filter(_.nonEmpty)) - .map(t => (doc, t.toLowerCase)) - } - .reduce(_ ++ _) // (d, t) - .cache() // docToTerms is reused 3 times - - val docToTermAndCFreq = docToTerms - .countByValue() - // performed on driver node, grouped so that each document keeps all of its (t, tf) pairs - .toSeq - .map { case ((d, t), tf) => (d, (t, tf)) } - .groupBy(_._1) - .mapValues(_.map(_._2)) // (d, Seq[(t, tf)]) - - val termToDfN = docToTerms - .distinct() - .values - .countByValue() // (t, df) - // performed on driver node - .mapValues(_.toDouble / numDocs) // (t, df/N) - - docToTerms.keys - .countByValue() // (d, |d|) - // performed on driver node - .toSeq - .flatMap { case (d, dLen) => - docToTermAndCFreq(d).map { case (t, tf) => - val tfd = tf.toDouble / dLen // tf/|d| - val dfN = termToDfN(t) - Score(t, d, tfd * math.log(1 / dfN)) - } - } - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala b/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala deleted file mode 100644 index ad52209..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */
-
-// Example: Compute Top K Items Globally
-// Input is a collection of (user, item, score)
-package com.spotify.bdrc.pipeline
-
-import com.spotify.bdrc.util.Records.Rating
-import com.spotify.scio.values.SCollection
-import com.twitter.scalding.TypedPipe
-import org.apache.spark.rdd.RDD
-
-object TopItems {
-
-  val topK = 100
-
-  // ## Scalding
-  def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = {
-    input
-      .map(x => (x.item, x.score))
-      .group
-      // Sum values with an implicit `Semigroup[Double]`
-      .sum
-      // Group all elements with a single key `Unit`
-      .groupAll
-      // Take top K with a priority queue
-      .sortedReverseTake(topK)(Ordering.by(_._2))
-      // Drop `Unit` key
-      .values
-      // Flatten result `Seq[(String, Double)]`
-      .flatten
-  }
-
-  // ## Scalding with Algebird `Aggregator`
-  def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = {
-    import com.twitter.algebird.Aggregator.sortedReverseTake
-    val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2))
-    input
-      .map(x => (x.item, x.score))
-      .group
-      // Sum values with an implicit `Semigroup[Double]`
-      .sum
-      .toTypedPipe
-      // Aggregate globally into a single `Seq[(String, Double)]`
-      .aggregate(aggregator)
-      // Flatten result `Seq[(String, Double)]`
-      .flatten
-  }
-
-  // ## Scio
-  def scio(input: SCollection[Rating]): SCollection[(String, Double)] = {
-    input
-      .map(x => (x.item, x.score))
-      // Sum values with an implicit `Semigroup[Double]`
-      .sumByKey
-      // Compute top K as an `Iterable[(String, Double)]`
-      .top(topK)(Ordering.by(_._2))
-      // Flatten result `Iterable[(String, Double)]`
-      .flatten
-  }
-
-  // ## Scio with Algebird `Aggregator`
-  def scioWithAlgebird(input: SCollection[Rating]): SCollection[(String, Double)] = {
-    import com.twitter.algebird.Aggregator.sortedReverseTake
-    val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2))
-    input
-      .map(x => (x.item, x.score))
-      // Sum values with an implicit `Semigroup[Double]`
-      .sumByKey
-      // Aggregate globally into a single `Seq[(String, Double)]`
-      .aggregate(aggregator)
-      // Flatten result `Seq[(String, Double)]`
-      .flatten
-  }
-
-  // ## Spark
-  def spark(input: RDD[Rating]): Seq[(String, Double)] = {
-    input
-      .map(x => (x.item, x.score))
-      // Sum values with addition
-      .reduceByKey(_ + _)
-      // `top` is an action and collects data back to the driver node
-      .top(topK)(Ordering.by(_._2))
-  }
-
-  // ## Spark with Algebird `Aggregator`
-  def sparkWithAlgebird(input: RDD[Rating]): Seq[(String, Double)] = {
-    import com.twitter.algebird.Aggregator.sortedReverseTake
-    import com.twitter.algebird.spark._
-    val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2))
-    input
-      .map(x => (x.item, x.score))
-      // Sum values with addition
-      .reduceByKey(_ + _)
-      .algebird
-      // `aggregate` is an action and collects data back to the driver node
-      .aggregate(aggregator)
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala b/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala
deleted file mode 100644
index b294873..0000000
--- a/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-// Example: Compute Top K Items Per User
-// Input is a collection of (user, item, score)
-package com.spotify.bdrc.pipeline
-
-import com.spotify.bdrc.util.Records.Rating
-import com.spotify.scio.values.SCollection
-import com.twitter.scalding.TypedPipe
-import org.apache.spark.rdd.RDD
-
-object TopItemsPerUser {
-
-  val topK = 100
-
-  // ## Scalding
-  def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = {
-    input
-      .groupBy(_.user)
-      // Take top K per group with a priority queue
-      .sortedReverseTake(topK)(Ordering.by(_.score))
-      // Drop user key
-      .values
-      // Flatten result `Seq[Rating]`
-      .flatten
-  }
-
-  // ## Scio
-  def scio(input: SCollection[Rating]): SCollection[Rating] = {
-    input
-      .keyBy(_.user)
-      // Compute top K per key
-      .topByKey(topK)(Ordering.by(_.score))
-      // Drop user key
-      .values
-      // Flatten result `Iterable[Rating]`
-      .flatten
-  }
-
-  // ## Spark Naive Approach
-  def spark(input: RDD[Rating]): RDD[Rating] = {
-    input
-      // `groupBy` shuffles all data, inefficient
-      .groupBy(_.user)
-      // Drop user key
-      .values
-      // Convert grouped values to a `List[Rating]` and sort on a single node, inefficient
-      .flatMap(_.toList.sortBy(-_.score).take(topK))
-  }
-
-  // ## Spark with Algebird `Aggregator`
-  def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = {
-    import com.twitter.algebird.Aggregator.sortedReverseTake
-    import com.twitter.algebird.spark._
-    val aggregator = sortedReverseTake[Rating](topK)(Ordering.by(_.score))
-    input
-      .keyBy(_.user)
-      .algebird
-      // Aggregate per key into a `Seq[Rating]`
-      .aggregateByKey(aggregator)
-      // Flatten result `Seq[Rating]`
-      .flatMap(_._2)
-  }
-
-  // ## Spark with MLlib
-  def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = {
-    import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
-    input
-      .keyBy(_.user)
-      // From `spark-mllib`, compute top K per key with a priority queue
-      .topByKey(topK)(Ordering.by(_.score))
-      // Flatten result `Seq[Rating]`
-      .flatMap(_._2)
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/pipeline/TotalAndDistinctCount.scala b/src/main/scala/com/spotify/bdrc/pipeline/TotalAndDistinctCount.scala
deleted file mode 100644
index 561b899..0000000
--- a/src/main/scala/com/spotify/bdrc/pipeline/TotalAndDistinctCount.scala
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.pipeline
-
-import com.spotify.scio.values.SCollection
-import com.twitter.scalding.TypedPipe
-import org.apache.spark.rdd.RDD
-
-/**
- * Compute number of total and distinct items.
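- * For example, the input Seq("a", "b", "a") yields (3L, 2L): three occurrences in total,
- * two distinct items.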
- *
- * Input is a collection of items, e.g. the item field of (user, item, score) events.
- */
-object TotalAndDistinctCount {
-
-  def aggregator = {
-    import com.twitter.algebird._
-    // Exact total count, approximate unique count
-    val totalCount = Aggregator.size
-    val uniqueCount = Aggregator.approximateUniqueCount[String]
-    MultiAggregator(totalCount, uniqueCount)
-  }
-
-  def scaldingExact(input: TypedPipe[String]): TypedPipe[(Long, Long)] = {
-    input
-      .map((_, 1L))
-      .group
-      .sum // (item, count per item)
-      .toTypedPipe
-      // map onto a single key `Unit` so that the second sum is global
-      .map(kv => ((), (kv._2, 1L)))
-      .group
-      .sum // ((), (total count, distinct count))
-      .values
-  }
-
-  def scaldingApproximate(input: TypedPipe[String]): TypedPipe[(Long, Long)] =
-    input.aggregate(aggregator)
-
-  def scioExact(input: SCollection[String]): SCollection[(Long, Long)] = {
-    input
-      .map((_, 1L))
-      .sumByKey // (item, count per item)
-      // map onto a single key `Unit` so that the second sum is global
-      .map(kv => ((), (kv._2, 1L)))
-      .sumByKey // ((), (total count, distinct count))
-      .values
-  }
-
-  def scioApproximate(input: SCollection[String]): SCollection[(Long, Long)] =
-    input.aggregate(aggregator)
-
-  def sparkAlgebird(input: RDD[String]): RDD[(Long, Long)] = {
-    import com.twitter.algebird.spark._
-    input
-      .map((_, 1L))
-      .algebird
-      .sumByKey[String, Long] // (item, count per item)
-      // map onto a single key `Unit` so that the second sum is global
-      .map(kv => ((), (kv._2, 1L)))
-      .algebird
-      .sumByKey[Unit, (Long, Long)] // ((), (total count, distinct count))
-      .values
-  }
-
-  def sparkInMemory(input: RDD[String]): (Long, Long) = {
-    input.cache()
-    (input.count(), input.distinct().count())
-  }
-
-  def sparkApproximate(input: RDD[String]): (Long, Long) = {
-    input.cache()
-    (input.count(), input.countApproxDistinct())
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala b/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala
deleted file mode 100644
index 4eeb3d5..0000000
--- a/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-// Example: Classic Word Count
-package com.spotify.bdrc.pipeline
-
-import com.spotify.scio.values.SCollection
-import com.twitter.scalding.TypedPipe
-import org.apache.spark.rdd.RDD
-
-object WordCount {
-
-  // ## Scalding
-  def scalding(input: TypedPipe[String]): TypedPipe[(String, Long)] = {
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      // `groupBy` is lazy
-      .groupBy(identity)
-      // Operations like `size` after `groupBy` can be lifted into the map phase
-      .size
-      .toTypedPipe
-  }
-
-  // ## Scio
-  def scio(input: SCollection[String]): SCollection[(String, Long)] = {
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      .countByValue
-  }
-
-  // ## Spark Transformation
-  def sparkTransformation(input: RDD[String]): RDD[(String, Long)] = {
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      // There is no `countByValue` transformation in Spark, but it is equivalent to mapping
-      // each element to an initial count of `1` and reducing with addition
-      .map((_, 1L))
-      // `reduceByKey` can lift the function into the map phase
-      .reduceByKey(_ + _)
-  }
-
-  // ## Spark Action
-  def sparkAction(input: RDD[String]): Seq[(String, Long)] = {
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      // `countByValue` is an action and collects data back to the driver node
-      .countByValue()
-      .toSeq
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/scala/Collections.scala b/src/main/scala/com/spotify/bdrc/scala/Collections.scala
deleted file mode 100644
index 68b27d0..0000000
--- a/src/main/scala/com/spotify/bdrc/scala/Collections.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright 2017 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.scala
-
-/**
- * Examples for working with Scala collections.
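- *
- * Note: these examples assume a Scala 2.12 collection library, where mapValues returns a lazy
- * view and scala.collection.breakOut is available; both were deprecated or removed in 2.13.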
- */
-object Collections {
-
-  def mapValues: Unit = {
-    val m = Map("a" -> 1, "b" -> 2, "c" -> 3)
-
-    // Native approach, inefficient since .toList, .map, .toMap each creates a copy
-    m.toList.map(t => (t._1, t._2 + 1)).toMap
-
-    // Better, one copy
-    m.map(kv => (kv._1, kv._2 + 1))
-
-    // Lazy version, no copy
-    m.mapValues(_ + 1)
-  }
-
-  def mergeMaps: Unit = {
-    val m1 = Map("a" -> 1.0, "b" -> 2.0, "c" -> 3.0)
-    val m2 = Map("a" -> 1.5, "b" -> 2.5, "d" -> 3.5)
-
-    // Native approach, inefficient since it creates many copies
-    val i = m1.keySet intersect m2.keySet
-    val m = i.map(k => k -> (m1(k) + m2(k))) // sum values of common keys
-    (m1 -- i) ++ (m2 -- i) ++ m // inefficient, creates 2 more temporary maps
-    m1 ++ m2 ++ m // slightly better, values from RHS overwrite those from LHS
-
-    // Slightly better but still creates a temporary set
-    (m1.keySet ++ m2.keySet).map(k => k -> (m1.getOrElse(k, 0.0) + m2.getOrElse(k, 0.0)))
-
-    // Better but slightly cryptic
-    m1 ++ m2.map { case (k, v) => k -> (v + m1.getOrElse(k, 0.0)) }
-  }
-
-  def listToMap: Unit = {
-    val l = List(1, 2, 3, 4, 5)
-
-    // Native approach, creates a temporary copy
-    l.map(x => "key" + x -> x).toMap
-
-    // Slightly better, using a mutable builder
-    val b = Map.newBuilder[String, Int]
-    l.foreach(x => b += "key" + x -> x)
-    b.result()
-
-    // Use implicits to automatically build for the target collection type Map[String, Int]
-    val m: Map[String, Int] = l.map(x => "key" + x -> x)(scala.collection.breakOut)
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/scala/FilterMessyData.scala b/src/main/scala/com/spotify/bdrc/scala/FilterMessyData.scala
deleted file mode 100644
index 4769e71..0000000
--- a/src/main/scala/com/spotify/bdrc/scala/FilterMessyData.scala
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.scala
-
-import scala.util.Try
-
-/**
- * Filter out messy data that may cause computation to fail.
- *
- * Input is a collection of case classes with messy values.
- */
-object FilterMessyData {
-
-  case class MessyData(user: String, gender: String, scores: Array[Double], favorites: Set[String])
-
-  /** Dummy method that may fail for invalid records. */
-  def compute(x: MessyData): String = "dummy_result"
-
-  /** Naive approach that checks every field accessed. */
-  def naive(input: Seq[MessyData]): Seq[String] = {
-    input
-      .filter { x =>
-        x.user != null && x.gender != null &&
-        x.scores != null && x.scores.nonEmpty &&
-        x.favorites != null && x.favorites.nonEmpty
-      }
-      .map(compute) // may still fail for unexpected cases
-  }
-
-  /**
-   * Smart approach that throws any failed records away.
-   *
-   * Try.toOption returns Some if the computation succeeds or None if it fails.
-   * Option[U] is implicitly converted to the TraversableOnce[U] that flatMap expects.
-   *
-   * WARNING: THIS APPROACH IGNORES ANY EXCEPTION AND IS POTENTIALLY UNSAFE.
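-   *
-   * Try catches every non-fatal exception, so e.g. a NumberFormatException caused by a genuine
-   * bug in compute would be silently dropped along with the truly bad records.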
-   */
-  def withUnsafeFlatMap(input: Seq[MessyData]): Seq[String] =
-    input
-      .flatMap(x => Try(compute(x)).toOption)
-
-  /**
-   * Smart approach that throws any failed records away.
-   *
-   * Try/catch block returns a Seq of one item if compute succeeds and Nil if it fails.
-   * This approach is safer since you have control over what exceptions to expect.
-   */
-  def withSafeFlatMap(input: Seq[MessyData]): Seq[String] = {
-    input
-      .flatMap { x =>
-        try {
-          Seq(compute(x))
-        } catch {
-          case _: NullPointerException => Nil
-        }
-      }
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/scala/HandlingOptions.scala b/src/main/scala/com/spotify/bdrc/scala/HandlingOptions.scala
deleted file mode 100644
index 5387469..0000000
--- a/src/main/scala/com/spotify/bdrc/scala/HandlingOptions.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.scala
-
-/**
- * Handling data with multiple Option[T]s more gracefully.
- *
- * Input is a collection of case classes with nested Option[T].
- */
-object HandlingOptions {
-
-  case class Metadata(track: Option[Track], audio: Option[Audio])
-  case class Track(id: String, name: String, artist: Option[Artist])
-  case class Artist(id: String, name: String)
-  case class Audio(tempo: Int, key: String)
-
-  /** Naive approach that checks every field accessed is defined. */
-  def naive(input: Seq[Metadata]): Seq[(String, Int)] = {
-    input
-      .filter(m => m.track.isDefined && m.track.get.artist.isDefined && m.audio.isDefined)
-      .map { m =>
-        // Option[T].get is safe since we already checked with Option[T].isDefined
-        (m.track.get.artist.get.id, m.audio.get.tempo)
-      }
-  }
-
-  /**
-   * Smart approach that uses for comprehension.
-   *
-   * For-comprehension extracts values from Options and yields Some if all Options are defined.
-   * It yields None if any of the Options is None.
-   */
-  def withFlatMap(input: Seq[Metadata]): Seq[(String, Int)] = {
-    input.flatMap { md =>
-      for {
-        tr <- md.track // extract Track from Option[Track]
-        ar <- tr.artist // extract Artist from Option[Artist]
-        au <- md.audio // extract Audio from Option[Audio]
-      } yield (ar.id, au.tempo)
-    }
-  }
-
-  /** The for-comprehension above translates to nested flatMaps. */
-  def withNestedFlatMap(input: Seq[Metadata]): Seq[(String, Int)] = {
-    input.flatMap { md =>
-      md.track.flatMap { tr =>
-        tr.artist.flatMap(ar => md.audio.map(au => (ar.id, au.tempo)))
-      }
-    }
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/scala/JavaPrimitives.scala b/src/main/scala/com/spotify/bdrc/scala/JavaPrimitives.scala
deleted file mode 100644
index ff1be50..0000000
--- a/src/main/scala/com/spotify/bdrc/scala/JavaPrimitives.scala
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.scala
-
-/**
- * Examples for working with Java primitives.
- */
-object JavaPrimitives {
-
-  // java.lang.Double is a boxed type (object) while double in Java is a primitive type.
-  // scala.Double can be either boxed or primitive depending on the context, e.g. it's boxed when
-  // used as a type parameter in a generic class but primitive when used in an array or on the
-  // stack.
-  // Due to type system limitations, M[java.lang.Double] and M[scala.Double] are incompatible types
-  // but they can be cast safely back and forth since both are implemented as Java boxed types.
-  import java.lang.{Double => JDouble}
-  import java.util.{List => JList}
-
-  import scala.collection.JavaConverters._
-
-  /**
-   * `xs.asScala` returns `mutable.Buffer[JDouble]` where `Buffer` is a sub-type of `Seq` but
-   * `JDouble` is not the same type as `Double` (`scala.Double`). Casting is safe because `JDouble`
-   * and `Double` are equivalent when used as type parameters (boxed objects). It's also cheaper
-   * than `.map(_.toDouble)` which creates a copy of the `Buffer`.
-   */
-  def jDoubleListToSeq(xs: JList[JDouble]): Seq[Double] = xs.asScala.asInstanceOf[Seq[Double]]
-
-  /**
-   * Array[Double] is more efficient since it's implemented as a Java primitive array. Arrays are
-   * also mutable so it's cheaper to pre-allocate and mutate elements. A Java iterator and a while
-   * loop are faster than `xs.asScala.asInstanceOf[Seq[Double]].toArray`.
-   */
-  def jDoubleListToArray(xs: JList[JDouble]): Array[Double] = {
-    val a = new Array[Double](xs.size())
-    var i = 0
-    val iterator = xs.iterator()
-    while (iterator.hasNext) {
-      a(i) = iterator.next()
-      i += 1
-    }
-    a
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/util/Records.scala b/src/main/scala/com/spotify/bdrc/util/Records.scala
deleted file mode 100644
index 070cf0e..0000000
--- a/src/main/scala/com/spotify/bdrc/util/Records.scala
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.util
-
-object Records {
-
-  case class LogEvent(user: String, track: String, timestamp: Long)
-  case class Rating(user: String, item: String, score: Double)
-  case class UserMeta(user: String, age: Int)
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/bench/ForYieldBenchmark.scala b/src/test/scala/com/spotify/bdrc/bench/ForYieldBenchmark.scala
deleted file mode 100644
index 4fce4e5..0000000
--- a/src/test/scala/com/spotify/bdrc/bench/ForYieldBenchmark.scala
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.bench
-
-import java.lang.{Iterable => JIterable}
-
-import com.google.common.collect.Lists
-import org.scalameter.api._
-import org.scalameter.picklers.noPickler._
-
-import scala.collection.JavaConverters._
-
-/** Micro-benchmark for for/yield pattern. */
-object ForYieldBenchmark extends Bench.LocalTime {
-
-  val lSizes = Gen.enumeration("lSize")(1, 10, 100, 1000)
-  val rSizes = Gen.enumeration("rSize")(1, 10, 100, 1000)
-
-  def jIterable(i: Int): JIterable[String] =
-    Lists.newArrayList((0 until i).map("v%05d".format(_)): _*).asInstanceOf[JIterable[String]]
-
-  val inputs = for {
-    l <- lSizes
-    r <- rSizes
-  } yield (jIterable(l), jIterable(r))
-
-  performance of "Join" in {
-    measure method "forIterable" in {
-      using(inputs) in { p =>
-        for {
-          a <- p._1.asScala
-          b <- p._2.asScala
-        } yield ("key", (a, b))
-      }
-    }
-
-    // Iterator version is lazy and more efficient
-    measure method "forIterator" in {
-      using(inputs) in { p =>
-        val r = for {
-          a <- p._1.asScala.iterator
-          b <- p._2.asScala.iterator
-        } yield ("key", (a, b))
-        r.toIterable
-      }
-    }
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T01EndToEndTest.scala b/src/test/scala/com/spotify/bdrc/testing/T01EndToEndTest.scala
deleted file mode 100644
index 4f9b83f..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T01EndToEndTest.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.spotify.scio._
-import com.spotify.scio.io.TextIO
-import com.spotify.scio.testing.PipelineSpec
-
-object WordCount1 {
-  def main(cmdlineArgs: Array[String]): Unit = {
-    val (sc, args) = ContextAndArgs(cmdlineArgs)
-    sc.textFile(args("input"))
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      .countByValue
-      .map(kv => kv._1 + ": " + kv._2)
-      .saveAsTextFile(args("output"))
-    sc.run()
-  }
-}
-
-/**
- * Test an entire pipeline end-to-end
- *
- * Pros:
- * - Complete test of the entire pipeline
- * - Covers argument parsing and I/O handling
- * - May also reveal serialization issues
- *
- * Cons:
- * - Hard to handcraft input and expected data
- * - Hard to cover edge cases for complex pipelines
- * - Can be slow in some frameworks
- *
- * Supported in: Scalding, Scio
- *
- * Recommendation:
- * This is a good approach to test small and simple pipelines since it offers the best code
- * coverage.
- * It can also be used for pipelines with complex argument parsing and I/O handling,
- * e.g. ones with dynamic I/O based on arguments.
- *
- * Very complex pipelines with lots of steps may be broken down into smaller logical blocks and
- * tested separately using the transform test approach.
- */
-class T01EndToEndTest extends PipelineSpec {
-
-  val input = Seq("a b c d e", "a b a b")
-  val expected = Seq("a: 3", "b: 3", "c: 1", "d: 1", "e: 1")
-
-  "WordCount1" should "work" in {
-    JobTest[com.spotify.bdrc.testing.WordCount1.type]
-      .args("--input=in.txt", "--output=out.txt")
-      .input(TextIO("in.txt"), input)
-      .output(TextIO("out.txt"))(output => output should containInAnyOrder(expected))
-      .run()
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T02TransformTest.scala b/src/test/scala/com/spotify/bdrc/testing/T02TransformTest.scala
deleted file mode 100644
index 9a95a9a..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T02TransformTest.scala
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.spotify.scio._
-import com.spotify.scio.testing._
-import com.spotify.scio.values.SCollection
-
-object WordCount2 {
-  def main(cmdlineArgs: Array[String]): Unit = {
-    val (sc, args) = ContextAndArgs(cmdlineArgs)
-    val input = sc.textFile(args("input"))
-    val wc = countWords(input)
-    val output = formatOutput(wc)
-    output.saveAsTextFile(args("output"))
-  }
-
-  def countWords(input: SCollection[String]): SCollection[(String, Long)] =
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      .countByValue
-
-  def formatOutput(input: SCollection[(String, Long)]): SCollection[String] =
-    input
-      .map(kv => kv._1 + ": " + kv._2)
-}
-
-/**
- * Test pipeline transforms
- *
- * Pros:
- * - Break down complex pipelines into smaller reusable pieces
- * - Easier to handcraft input and expected data than end-to-end test
- *
- * Cons:
- * - Does not cover argument parsing and I/O handling
- * - May disrupt pipeline logic flow if overused
- *
- * Supported in: Scalding, Scio, Spark
- *
- * Recommendation:
- * Complex pipelines can be broken into logical blocks and tested using this approach. Individual
- * transforms should have clear roles in the pipeline, e.g. parsing input, formatting output,
- * aggregating data, training model, predicting labels, etc. It should also be easy to craft input
- * and expected data for these transforms and cover all code paths and edge cases.
- *
- * The level of granularity of each transform is also important. A transform should be small enough
- * for readability but big enough to avoid disruption to the main pipeline flow. Things to
- * consider are: number of inputs and outputs, group or join operations, etc.
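- *
- * WordCount2 above is a concrete case: countWords (aggregation) and formatOutput (output
- * formatting) are exactly such blocks, and each is fed small handcrafted datasets in the
- * tests below.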
- */
-class TransformTest extends PipelineSpec {
-
-  val input = Seq("a b c d e", "a b a b")
-  val expected = Seq("a: 3", "b: 3", "c: 1", "d: 1", "e: 1")
-  val intermediate = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L))
-
-  "countWords" should "work" in {
-    runWithContext { sc =>
-      val in = sc.parallelize(input)
-      WordCount2.countWords(in) should containInAnyOrder(intermediate)
-    }
-  }
-
-  "formatOutput" should "work" in {
-    runWithContext { sc =>
-      val in = sc.parallelize(intermediate)
-      WordCount2.formatOutput(in) should containInAnyOrder(expected)
-    }
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T03FunctionTest.scala b/src/test/scala/com/spotify/bdrc/testing/T03FunctionTest.scala
deleted file mode 100644
index bbf88eb..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T03FunctionTest.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.spotify.scio._
-import org.scalatest.flatspec.AnyFlatSpec
-import org.scalatest.matchers.should.Matchers
-
-object WordCount3 {
-  def main(cmdlineArgs: Array[String]): Unit = {
-    val (sc, args) = ContextAndArgs(cmdlineArgs)
-    sc.textFile(args("input"))
-      .flatMap(split)
-      .countByValue
-      .map(format)
-      .saveAsTextFile(args("output"))
-  }
-
-  def split(input: String): Seq[String] = input.split("[^a-zA-Z']+").filter(_.nonEmpty)
-  def format(kv: (String, Long)): String = kv._1 + ": " + kv._2
-}
-
-/**
- * Test individual functions used in a pipeline
- *
- * Pros:
- * - Fastest to test
- * - Easy to cover edge cases
- *
- * Cons:
- * - Limited scope of coverage
- * - May disrupt pipeline logic flow if overused
- *
- * Supported in: any framework
- *
- * Recommendation:
- * This is recommended for commonly reused functions or those with complex business logic, e.g.
- * numerical computation, log cleanup and filtering, value group operations after groupByKey.
- *
- * The level of granularity of each function is also important. Typical candidates are multi-line
- * functions that are used more than once. Functions with complex logic that are hard to test at a
- * higher level (transform or end-to-end), e.g. user session analysis after grouping by user key,
- * can also be tested with this approach.
- */
-class FunctionTest extends AnyFlatSpec with Matchers {
-
-  "split" should "work" in {
-    WordCount3.split("a b,c d\te\n\nf") should equal(Seq("a", "b", "c", "d", "e", "f"))
-  }
-
-  "format" should "work" in {
-    WordCount3.format(("a", 10L)) should equal("a: 10")
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T04PropertyBasedTest.scala b/src/test/scala/com/spotify/bdrc/testing/T04PropertyBasedTest.scala
deleted file mode 100644
index d765d1f..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T04PropertyBasedTest.scala
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.google.common.collect.MinMaxPriorityQueue
-import org.scalacheck.Prop._
-import org.scalacheck.{Gen, Properties}
-import org.scalatest.propspec.AnyPropSpec
-import org.scalatest.matchers.should.Matchers
-import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks
-
-import scala.collection.JavaConverters._
-
-object Utils {
-
-  def top[T: Ordering](xs: Seq[T], num: Int): Seq[T] = {
-    if (xs.isEmpty) {
-      Seq.empty[T]
-    } else {
-      val size = math.min(num, xs.size)
-      val ord = implicitly[Ordering[T]]
-      MinMaxPriorityQueue
-        .orderedBy(ord.reverse)
-        .expectedSize(size)
-        .maximumSize(size)
-        .create[T](xs.asJava)
-        .asScala
-        .toSeq
-        .sorted(ord.reverse)
-    }
-  }
-
-  def split(input: String): Seq[String] =
-    input
-      .split("[^a-zA-Z']+")
-      .filter(_.nonEmpty)
-      .map(_.toLowerCase)
-
-  def cosineSim(v1: Seq[Double], v2: Seq[Double]): Double = {
-    require(v1.length == v2.length)
-    var s1 = 0.0
-    var s2 = 0.0
-    var dp = 0.0
-    var i = 0
-    while (i < v1.length) {
-      s1 += v1(i) * v1(i)
-      s2 += v2(i) * v2(i)
-      dp += v1(i) * v2(i)
-      i += 1
-    }
-    dp / math.sqrt(s1 * s2)
-  }
-
-}
-
-/**
- * Property-based testing using ScalaCheck
- *
- * http://scalacheck.org/
- *
- * Pros:
- * - No need to handcraft input data
- * - May reveal rare edge cases, e.g. null input, extreme values, empty lists
- *
- * Cons:
- * - Hard to test business logic
- * - Some properties may be hard to verify
- * - Can be slow for expensive computations
- *
- * Supported in: any framework
- *
- * Recommendation:
- * This is useful for functions with simple input and output types, especially those doing heavy
- * mathematical computation, e.g. linear algebra, hash functions, set operations.
- *
- * However, since input data are randomly generated based on type signature, it might produce edge
- * cases irrelevant to the business logic, e.g. Double.MinValue, strings with Unicode characters.
- * You might also have to construct your own generator if a certain distribution of input data is
- * expected, e.g. positive integers, strings from a finite set.
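- *
- * For instance, custom generators for those two cases might look like the following sketch
- * (the vocabulary values are hypothetical):
- *
- * {{{
- *   val genPositiveInt = Gen.choose(1, Int.MaxValue)
- *   val genWord = Gen.oneOf("rock", "pop", "jazz")
- *   forAll(genWord) { w => Utils.split(w) shouldBe Seq(w) }
- * }}}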
- *
- * See AlgebirdSpec.scala for more examples of testing Algebird features using ScalaCheck
- * https://github.com/spotify/scio/blob/master/scio-examples/src/test/scala/com/spotify/scio/examples/extra/AlgebirdSpec.scala
- */
-class PropertyBasedTest extends AnyPropSpec with ScalaCheckDrivenPropertyChecks with Matchers {
-
-  property("top") {
-    forAll { xs: Seq[Long] => Utils.top(xs, 5) shouldBe xs.sorted.reverse.take(5) }
-  }
-
-  property("split") {
-    forAll { line: String => Utils.split(line).forall(_.matches("[a-z']+")) }
-  }
-
-  // Generator for List[Double] of 100 doubles between -100.0 and 100.0
-  val genVector = Gen.listOfN(100, Gen.choose(-100.0, 100.0))
-
-  property("cosineSim") {
-    forAll(genVector, genVector) { (v1, v2) =>
-      val s1 = Utils.cosineSim(v1, v2)
-      val s2 = Utils.cosineSim(v2, v1)
-
-      s1 should (be >= -1.0 and be <= 1.0)
-      s1 shouldBe s2
-      Utils.cosineSim(v1, v1) shouldBe 1.0
-      Utils.cosineSim(v1, v1.map(-_)) shouldBe -1.0
-    }
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T05MixedTest.scala b/src/test/scala/com/spotify/bdrc/testing/T05MixedTest.scala
deleted file mode 100644
index ba4abb2..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T05MixedTest.scala
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.spotify.scio._
-import com.spotify.scio.io.TextIO
-import com.spotify.scio.testing.PipelineSpec
-import com.spotify.scio.values.SCollection
-
-object WordCount4 {
-  def main(cmdlineArgs: Array[String]): Unit = {
-    val (sc, args) = ContextAndArgs(cmdlineArgs)
-    val input = sc.textFile(args("input"))
-    val wc = countWords(input)
-    val output = formatOutput(wc)
-    output.saveAsTextFile(args("output"))
-    sc.run()
-  }
-
-  // transforms
-  def countWords(input: SCollection[String]): SCollection[(String, Long)] =
-    input.flatMap(split).countByValue
-  def formatOutput(input: SCollection[(String, Long)]): SCollection[String] =
-    input.map(format)
-
-  // functions
-  def split(input: String): Seq[String] = input.split("[^a-zA-Z']+").filter(_.nonEmpty)
-  def format(kv: (String, Long)): String = kv._1 + ": " + kv._2
-}
-
-/**
- * Mixed function, transform and end-to-end tests
- *
- * Property-based tests require an object that extends Properties and therefore are not included.
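- *
- * The transform and end-to-end tests below reuse the same input, intermediate and expected
- * fixtures, so the different levels of the suite stay consistent with one another.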
- */
-class MixedTest extends PipelineSpec {
-
-  val input = Seq("a b c d e", "a b a b")
-  val expected = Seq("a: 3", "b: 3", "c: 1", "d: 1", "e: 1")
-  val intermediate = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L))
-
-  // Function tests
-
-  "split" should "work" in {
-    WordCount4.split("a b,c d\te\n\nf") should equal(Seq("a", "b", "c", "d", "e", "f"))
-  }
-
-  "format" should "work" in {
-    WordCount4.format(("a", 10L)) should equal("a: 10")
-  }
-
-  // Transform tests
-
-  "countWords" should "work" in {
-    runWithContext { sc =>
-      val in = sc.parallelize(input)
-      WordCount4.countWords(in) should containInAnyOrder(intermediate)
-    }
-  }
-
-  "formatOutput" should "work" in {
-    runWithContext { sc =>
-      val in = sc.parallelize(intermediate)
-      WordCount4.formatOutput(in) should containInAnyOrder(expected)
-    }
-  }
-
-  // End-to-end test
-
-  "WordCount4" should "work" in {
-    JobTest[com.spotify.bdrc.testing.WordCount4.type]
-      .args("--input=in.txt", "--output=out.txt")
-      .input(TextIO("in.txt"), input)
-      .output(TextIO("out.txt"))(output => output should containInAnyOrder(expected))
-      .run()
-  }
-
-}