From f0f0ef4647756d21e137eb04258adaa655ef685c Mon Sep 17 00:00:00 2001 From: Neville Li Date: Mon, 4 Dec 2017 15:15:07 -0500 Subject: [PATCH 01/11] Initialize gh-pages branch From 564b42c6343123bebd512f9269df600d79005c5e Mon Sep 17 00:00:00 2001 From: Neville Li Date: Mon, 4 Dec 2017 15:19:18 -0500 Subject: [PATCH 02/11] updated site --- .nojekyll | 0 Count.scala.html | 1986 +++++++++++++++++++++++++++++++ CountDistinctItems.scala.html | 1997 +++++++++++++++++++++++++++++++ CountUsers.scala.html | 1993 +++++++++++++++++++++++++++++++ SumPerItem.scala.html | 1999 +++++++++++++++++++++++++++++++ TopItems.scala.html | 2069 +++++++++++++++++++++++++++++++++ TopItemsPerUser.scala.html | 2028 ++++++++++++++++++++++++++++++++ WordCount.scala.html | 1976 +++++++++++++++++++++++++++++++ index.html | 19 + 9 files changed, 14067 insertions(+) create mode 100644 .nojekyll create mode 100644 Count.scala.html create mode 100644 CountDistinctItems.scala.html create mode 100644 CountUsers.scala.html create mode 100644 SumPerItem.scala.html create mode 100644 TopItems.scala.html create mode 100644 TopItemsPerUser.scala.html create mode 100644 WordCount.scala.html create mode 100644 index.html diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/Count.scala.html b/Count.scala.html new file mode 100644 index 0000000..5f57879 --- /dev/null +++ b/Count.scala.html @@ -0,0 +1,1986 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object Count { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { + import com.twitter.algebird.Aggregator.size + input + .aggregate(size) + .toTypedPipe + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Long] = { + input + .count + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { + import com.twitter.algebird.Aggregator.size + input + .aggregate(size) + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): Long = { + input 
+
+ +
+

count is an action and collects data back to the driver node

+
.count + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Long = { + import com.twitter.algebird.Aggregator.size + import com.twitter.algebird.spark._ + input + .algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(size) + } + +} 
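A note on the two Algebird variants above: an Aggregator is also a plain function over in-memory collections, so the same size aggregator can be sanity-checked locally. A minimal sketch, assuming only algebird-core on the classpath:

import com.twitter.algebird.Aggregator.size

// Aggregator[A, B, C] can be applied to a TraversableOnce[A] directly,
// so `size` counts a local Seq the same way it counts a distributed dataset.
val n: Long = size(Seq("a", "b", "c")) // 3L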
+
+ + + + + + \ No newline at end of file diff --git a/CountDistinctItems.scala.html b/CountDistinctItems.scala.html new file mode 100644 index 0000000..853c17d --- /dev/null +++ b/CountDistinctItems.scala.html @@ -0,0 +1,1997 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.google.common.base.Charsets +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object CountDistinctItems { + 
+
+ +
+

Scalding Exact Approach

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .map(_.item
+
+ +
+

Remove duplicates, requires a shuffle

+
.distinct + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+

Scalding Approximate Approach

+
def scaldingApproxWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Double] = { + import com.twitter.algebird.HyperLogLogAggregator + val aggregator = HyperLogLogAggregator.sizeAggregator(bits = 12) + input 
+
+ +
+

HyperLogLog expects bytes input

+
.map(_.item.getBytes(Charsets.UTF_8)) 
+
+ +
+

Aggregate globally into a Double

.aggregate(aggregator) + .toTypedPipe + } + 
+ +
+

Scio Exact Approach

+
def scio(input: SCollection[Rating]): SCollection[Long] = { + input + .map(_.item) + .distinct + .count + } + 
+
+ +
+

Scio Approximate Approach

+
def scioApprox(input: SCollection[Rating]): SCollection[Long] = { + input + .map(_.item) + .countApproxDistinct() + } + 
+
+ +
+

Spark Exact Approach

+
def spark(input: RDD[Rating]): Long = { + input + .map(_.item) + .distinct() + .count() + } + 
+
+ +
+

Spark Approximate Approach

+
def sparkApprox(input: RDD[Rating]): Long = { + input + .map(_.item) + .countApproxDistinct() + } + +} 
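On choosing bits for the HyperLogLog aggregator above: standard HyperLogLog theory puts the relative error at roughly 1.04 / sqrt(2^bits), so bits = 12 means 4096 registers and about 1.6% error. A back-of-the-envelope sketch of the tradeoff (derived from the published bound, not measured on these pipelines):

// Approximate standard error of a HyperLogLog sketch with 2^bits registers.
def hllError(bits: Int): Double = 1.04 / math.sqrt(math.pow(2.0, bits))

hllError(12) // ~0.016, i.e. ~1.6% relative error
hllError(20) // ~0.001, at the cost of 2^20 registers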
+
+ + + + + + \ No newline at end of file diff --git a/CountUsers.scala.html b/CountUsers.scala.html new file mode 100644 index 0000000..5c0b570 --- /dev/null +++ b/CountUsers.scala.html @@ -0,0 +1,1993 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object CountUsers { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .filter(_.user == "Smith") + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { + import com.twitter.algebird.Aggregator.count + input 
+
+ +
+

Aggregate globally into a single Long

+
.aggregate(count(_.user == "Smith")) + .toTypedPipe + } + + def scio(input: SCollection[Rating]): SCollection[Long] = { + input + .filter(_.user == "Smith") + .count + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { + import com.twitter.algebird.Aggregator.count + input 
+
+ +
+

Aggregate globally into a single Long

+
.aggregate(count(_.user == "Smith")) + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): Long = { + input + .filter(_.user == "Smith"
+
+ +
+

count is an action and collects data back to the driver node

+
.count() + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Long = { + import com.twitter.algebird.Aggregator.count + import com.twitter.algebird.spark._ + input + .algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(count(_.user == "Smith")) + } + +} 
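The count aggregator used in both Algebird variants wraps a predicate into a MonoidAggregator, so its behavior can be checked on a local collection. A sketch, assuming Rating(user, item, score) as defined in Records:

import com.twitter.algebird.Aggregator.count

// Elements matching the predicate contribute 1, all others contribute 0.
val smiths = count { r: Rating => r.user == "Smith" }
smiths(Seq(Rating("Smith", "a", 1.0), Rating("Doe", "b", 2.0))) // 1L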
+
+ + + + + + \ No newline at end of file diff --git a/SumPerItem.scala.html b/SumPerItem.scala.html new file mode 100644 index 0000000..1814cf3 --- /dev/null +++ b/SumPerItem.scala.html @@ -0,0 +1,1999 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object SumPerItem { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .groupBy(_.item) + .mapValues(_.score
+
+ +
+

Sum per key with an implicit Semigroup[Double]

+
.sum + .toTypedPipe + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.Aggregator.prepareMonoid + input + .groupBy(_.item
+
+ +
+

Aggregate per key with an aggregator that converts Rating to Double via _.score before the reduce

+
.aggregate(prepareMonoid(_.score)) + .toTypedPipe + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .sumByKey + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): RDD[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .reduceByKey(_ + _) + } + 
+
+ +
+

Spark with Algebird Semigroup

+
def sparkWithAlgebird1(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + input + .map(x => (x.item, x.score)) + .algebird 
+
+ +
+

Sum per key with an implicit Semigroup[Double]

+
.sumByKey + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird2(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.Aggregator.prepareMonoid + import com.twitter.algebird.spark._ + input + .keyBy(_.item) + .algebird 
+
+ +
+

Aggregate per key with an aggregator that converts Rating to Double via _.score before the reduce. Explicit type due to a type inference limitation.

+
.aggregateByKey(prepareMonoid { x: Rating => x.score }) + } + +} 
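prepareMonoid composes a prepare step (_.score) with the implicit Monoid[Double], which is what both Algebird variants reduce with per key. A local sketch of the same reduction, assuming Rating(user, item, score):

import com.twitter.algebird.Aggregator.prepareMonoid

// Prepare Rating => Double, then reduce with Monoid[Double], i.e. addition.
val sumScores = prepareMonoid { x: Rating => x.score }
sumScores(Seq(Rating("u1", "i1", 1.5), Rating("u2", "i1", 2.5))) // 4.0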
+
+ + + + + + \ No newline at end of file diff --git a/TopItems.scala.html b/TopItems.scala.html new file mode 100644 index 0000000..69055c1 --- /dev/null +++ b/TopItems.scala.html @@ -0,0 +1,2069 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object TopItems { + + val topK = 100
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .group 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sum 
+
+ +
+

Group all elements with a single key Unit

.groupAll 
+ +
+

Take top K with a priority queue

+
.sortedReverseTake(topK)(Ordering.by(_._2)) 
+
+ +
+

Drop Unit key

+
.values 
+
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) + .group 
+
+ +
+

Sum values with an implicit Semigroup[Double]

.sum 
+ +
+

Aggregate globally into a single Seq[(String, Double)]

.aggregate(aggregator) + .toTypedPipe 
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with an implicit Semigroup[Double]

.sumByKey 
+ +
+

Compute top K as an Iterable[(String, Double)]

+
.top(topK)(Ordering.by(_._2)) 
+
+ +
+

Flatten result Iterable[(String, Double)]

+
.flatten + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with an implicit Semigroup[Double]

.sumByKey 
+ +
+

Aggregate globally into a single Seq[(String, Double)]

.aggregate(aggregator) 
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): Seq[(String, Double)] = { + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with addition

+
.reduceByKey(_ + _) 
+
+ +
+

top is an action and collects data back to the driver node

+
.top(topK)(Ordering.by(_._2)) + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Seq[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + import com.twitter.algebird.spark._ + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with addition

+
.reduceByKey(_ + _) + .algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(aggregator) + } + +} 
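The sortedReverseTake aggregator keeps at most K elements in a bounded priority queue while reducing, which is why it scales to large inputs; its semantics are easy to confirm locally. A sketch:

import com.twitter.algebird.Aggregator.sortedReverseTake

val top2 = sortedReverseTake[(String, Double)](2)(Ordering.by(_._2))
top2(Seq(("a", 1.0), ("b", 3.0), ("c", 2.0)))
// Seq((b,3.0), (c,2.0)): the 2 largest by score, in descending order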
+
+ + + + + + \ No newline at end of file diff --git a/TopItemsPerUser.scala.html b/TopItemsPerUser.scala.html new file mode 100644 index 0000000..cc0c387 --- /dev/null +++ b/TopItemsPerUser.scala.html @@ -0,0 +1,2028 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object TopItemsPerUser { + + val topK = 100
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Take top K per group with a priority queue

+
.sortedReverseTake(topK)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Seq[Rating]

+
.flatten + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute top K per key

+
.topByKey(topK)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+

Spark Naive Approach

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input 
+
+ +
+

groupBy shuffles all data, inefficient

+
.groupBy(_.user
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Convert grouped values to a List[Rating] and sort on a single node, inefficient

+
.flatMap(_.toList.sortBy(-_.score).take(topK)) + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + import com.twitter.algebird.spark._ + val aggregator = sortedReverseTake[Rating](topK)(Ordering.by(_.score)) + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a Seq[Rating]

+ +
+ +
+

Flatten result Seq[Rating]

+
.flatMap(_._2) + } + 
+
+ +
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue

+
.topByKey(topK)(Ordering.by(_.score)) 
+
+ +
+

Flatten result Seq[Rating]

+
.flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/WordCount.scala.html b/WordCount.scala.html new file mode 100644 index 0000000..f36e393 --- /dev/null +++ b/WordCount.scala.html @@ -0,0 +1,1976 @@ + + + + + + + +
+

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object WordCount { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[String]): TypedPipe[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

groupBy is lazy

+
.groupBy(identity
+
+ +
+

Operations like size after groupBy can be lifted into the map phase

+
.size + .toTypedPipe + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[String]): SCollection[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) + .countByValue + } + 
+
+ +
+

Spark Transformation

+
def sparkTransformation(input: RDD[String]): RDD[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

There is no countByValue transformation in Spark, but it is equivalent to mapping each element into an initial count of 1 and reducing with addition

+
.map((_, 1L)) 
+
+ +
+

reduceByKey can lift the function into the map phase

+
.reduceByKey(_ + _) + } + 
+
+ +
+

Spark Action

+
def sparkAction(input: RDD[String]): Seq[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

countByValue is an action and collects data back to the driver node

+
.countByValue() + .toSeq + } + +} 
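For reference, the tokenizer shared by all variants splits on runs of characters other than letters and apostrophes, so punctuation and hyphens become word boundaries. A quick local sketch:

"Hello, world! It's big-data time."
  .split("[^a-zA-Z']+")
  .filter(_.nonEmpty)
// Array(Hello, world, It's, big, data, time)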
+
+ + + + + + \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000..15172a5 --- /dev/null +++ b/index.html @@ -0,0 +1,19 @@ + + + Codestin Search App + + + +### /pipeline/ + +- [Count.scala](Count.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Count.scala)) - Count Number of Items +- [CountDistinctItems.scala](CountDistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala)) - Count Number of Distinct Items +- [CountUsers.scala](CountUsers.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala)) - Count the Number of Items of a Given User +- [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item +- [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally +- [TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items Globally +- [WordCount.scala](WordCount.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala)) - Classic Word Count + + + + From 84d3b3c27f64476cdf1489d197322468625d67a3 Mon Sep 17 00:00:00 2001 From: Neville Li Date: Mon, 4 Dec 2017 15:55:48 -0500 Subject: [PATCH 03/11] updated site --- AverageScorePerItem.scala.html | 2030 +++++++++++++++++++++++++++++++ DistinctItems.scala.html | 1942 ++++++++++++++++++++++++++++++ MaxItemPerUser.scala.html | 2034 ++++++++++++++++++++++++++++++++ MinItemPerUser.scala.html | 2034 ++++++++++++++++++++++++++++++++ index.html | 4 + 5 files changed, 8044 insertions(+) create mode 100644 AverageScorePerItem.scala.html create mode 100644 DistinctItems.scala.html create mode 100644 MaxItemPerUser.scala.html create mode 100644 MinItemPerUser.scala.html diff --git a/AverageScorePerItem.scala.html b/AverageScorePerItem.scala.html new file mode 100644 index 0000000..41ca090 --- /dev/null +++ b/AverageScorePerItem.scala.html @@ -0,0 +1,2030 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.algebird.Semigroup +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object AverageScorePerItem { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .groupBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Sum both per key with an implicit Semigroup[(Double, Long)]

+
.sum 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + .toTypedPipe + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue + input + .groupBy(_.user
+
+ +
+

Map values into Double

+
.mapValues(_.score
+
+ +
+

Aggregate average per key

.aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .keyBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Sum both per key with an implicit Semigroup[(Double, Long)]

+
.sumByKey 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + } + 
+
+ +
+

Spark

+

Summon an Algebird Semigroup[(Double, Long)] with an implicit argument

+
def spark(input: RDD[Rating])(implicit sg: Semigroup[(Double, Long)]): RDD[(String, Double)] = { + input + .keyBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Reduce both per key with plus = (T, T) => T where T is (Double, Long)

+
.reduceByKey(sg.plus) // plus: (T, T) => T where T is (Double, Long) 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + } + 
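The implicit Semigroup[(Double, Long)] summoned above comes from Algebird's tuple instances, which combine tuples component-wise. A minimal sketch, assuming algebird-core:

import com.twitter.algebird.Semigroup

val sg = implicitly[Semigroup[(Double, Long)]]
sg.plus((4.0, 2L), (6.0, 3L)) // (10.0, 5L): sums and counts combine independently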
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.AveragedValue + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .mapValues(_.score
+
+ +
+

Map values into Double

.algebird 
+ +
+

Aggregate average per key

.aggregateByKey(AveragedValue.aggregator) + } + +} 
+ + + + + + \ No newline at end of file diff --git a/DistinctItems.scala.html b/DistinctItems.scala.html new file mode 100644 index 0000000..90239d2 --- /dev/null +++ b/DistinctItems.scala.html @@ -0,0 +1,1942 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object DistinctItems { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[String] = { + input + .map(_.item) + .distinct + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[String] = { + input + .map(_.item) + .distinct + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): RDD[String] = { + input + .map(_.item) + .distinct() + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/MaxItemPerUser.scala.html b/MaxItemPerUser.scala.html new file mode 100644 index 0000000..ffe9c23 --- /dev/null +++ b/MaxItemPerUser.scala.html @@ -0,0 +1,2034 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object MaxItemPerUser { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Reduce items per key by picking the side with the higher score for each pair of inputs

+
.reduce((x, y) => if (x.score > y.score) x else y) + .values + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + input + .groupBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score

+
.aggregate(maxBy(_.score)) + .values + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute the top item per key as an Iterable[Rating]

+
.topByKey(1)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + input + .keyBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(maxBy { x: Rating => x.score}) + .values + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input + .keyBy(_.user
+
+ +
+

Reduce items per key by picking the side with the higher score for each pair of inputs

+
.reduceByKey((x, y) => if (x.score > y.score) x else y) + .values + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(maxBy { x: Rating => x.score }) + .values + } + 
+
+ +
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue

+
.topByKey(1)(Ordering.by(_.score)) + .flatMap(_._2) + } + +} 
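maxBy reduces over the ordering of the projected value while keeping the whole record, which is why the Algebird variants return a full Rating rather than just a score. A local sketch, assuming Rating(user, item, score):

import com.twitter.algebird.Aggregator.maxBy

val byScore = maxBy { x: Rating => x.score }
byScore(Seq(Rating("u", "a", 1.0), Rating("u", "b", 2.0))) // Rating(u, b, 2.0)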
+
+ + + + + + \ No newline at end of file diff --git a/MinItemPerUser.scala.html b/MinItemPerUser.scala.html new file mode 100644 index 0000000..a208a65 --- /dev/null +++ b/MinItemPerUser.scala.html @@ -0,0 +1,2034 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object MinItemPerUser { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Reduce items per key by picking the side with the lower score for each pair of inputs

+
.reduce((x, y) => if (x.score < y.score) x else y) + .values + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { + import com.twitter.algebird.Aggregator.minBy + input + .groupBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score

+
.aggregate(minBy(_.score)) + .values + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute the top item per key as an Iterable[Rating] with a reverse comparator

+
.topByKey(1)(Ordering.by(-_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { + import com.twitter.algebird.Aggregator.minBy + input + .keyBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(minBy { x: Rating => x.score}) + .values + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input + .keyBy(_.user
+
+ +
+

Reduce items per key by picking the side with the lower score for each pair of inputs

+
.reduceByKey((x, y) => if (x.score < y.score) x else y) + .values + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.minBy + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(minBy { x: Rating => x.score }) + .values + } + 
+
+ +
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue and a reverse comparator

+
.topByKey(1)(Ordering.by(-_.score)) + .flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/index.html b/index.html index 15172a5..addbb14 100644 --- a/index.html +++ b/index.html @@ -6,9 +6,13 @@ ### /pipeline/ +- [AverageScorePerItem.scala](AverageScorePerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala)) - Compute Average Score per Item - [Count.scala](Count.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Count.scala)) - Count Number of Items - [CountDistinctItems.scala](CountDistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala)) - Count Number of Distinct Items - [CountUsers.scala](CountUsers.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala)) - Count the Number of Items of a Given User +- [DistinctItems.scala](DistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala)) - Compute Collection of Distinct Items +- [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User +- [MinItemPerUser.scala](MinItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User - [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item - [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally - [TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items Globally From 027b5ceef1cf8cdedaa0ac6ea6b4dc2a4ba004d4 Mon Sep 17 00:00:00 2001 From: Neville Li Date: Mon, 4 Dec 2017 16:19:48 -0500 Subject: [PATCH 04/11] updated site --- FieldStatistics.scala.html | 2015 ++++++++++++++++++++++++++++++++++++ Statistics.scala.html | 1996 +++++++++++++++++++++++++++++++++++ index.html | 2 + 3 files changed, 4013 insertions(+) create mode 100644 FieldStatistics.scala.html create mode 100644 Statistics.scala.html diff --git a/FieldStatistics.scala.html b/FieldStatistics.scala.html new file mode 100644 index 0000000..37c9303 --- /dev/null +++ b/FieldStatistics.scala.html @@ -0,0 +1,2015 @@ + + + + + + + +
+

Input is a collection of case classes

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object FieldStatistics { + + case class User(age: Int, income: Double, score: Double) + case class Stats(max: Double, min: Double, mean: Double, stddev: Double) + case class UserStats(age: Stats, income: Stats, score: Stats) + 
+
+ +
+

Algebird Aggregator

+
def aggregator = { + import com.twitter.algebird._ + 
+
+ +
+

Create 3 Aggregators on age field with different logic

+
 
+
+ +
+

The first 2 are of type Aggregator[User, _, Int], which means they take User as input and generate Int as output. The last one is of type Aggregator[User, _, Moments], where Moments includes count, mean, standard deviation, etc. The input User is prepared with a User => Int function _.age.

+
val maxAgeOp = Aggregator.max[Int].composePrepare[User](_.age) + val minAgeOp = Aggregator.min[Int].composePrepare[User](_.age) + val momentsAgeOp = Moments.aggregator.composePrepare[User](_.age) + 
+
+ +
+

Create 3 Aggregators on income field with different logic

+
val maxIncomeOp = Aggregator.max[Double].composePrepare[User](_.income) + val minIncomeOp = Aggregator.min[Double].composePrepare[User](_.income) + val momentsIncomeOp = Moments.aggregator.composePrepare[User](_.income) + 
+
+ +
+

Create 3 Aggregators on score field with different logic

+
val maxScoreOp = Aggregator.max[Double].composePrepare[User](_.score) + val minScoreOp = Aggregator.min[Double].composePrepare[User](_.score) + val momentsScoreOp = Moments.aggregator.composePrepare[User](_.score) + 
+
+ +
+

Apply the 12 Aggregators on the same input and present the resulting Tuple12 as UserStats.

+
MultiAggregator( + maxAgeOp, minAgeOp, momentsAgeOp, + maxIncomeOp, minIncomeOp, momentsIncomeOp, + maxScoreOp, minScoreOp, momentsScoreOp) + .andThenPresent { t => + val (maxAge, minAge, mAge, maxIncome, minIncome, mIncome, maxScore, minScore, mScore) = t + UserStats( + age = Stats(maxAge, minAge, mAge.mean, mAge.stddev), + income = Stats(maxIncome, minIncome, mIncome.mean, mIncome.stddev), + score = Stats(maxScore, minScore, mScore.mean, mScore.stddev)) + } + } + + 
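Since the composed MultiAggregator is itself an Aggregator[User, _, UserStats], it can be sanity-checked on an in-memory collection before being used in any pipeline. A sketch with made-up users:

val users = Seq(User(25, 50000.0, 0.8), User(35, 70000.0, 0.6))
aggregator(users)
// UserStats(age = Stats(35.0, 25.0, 30.0, 5.0), ...). Note that Moments.stddev
// is the population standard deviation, so ages 25 and 35 give stddev 5.0.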
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[User]): TypedPipe[UserStats] = { + input.aggregate(aggregator) + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[User]): SCollection[UserStats] = { + input.aggregate(aggregator) + } + 
+
+ +
+

Spark

+
def spark(input: RDD[User]): UserStats = { 
+
+ +
+

Compute each field separately, which is potentially inefficient if the input is not cached

+
val s1 = input.map(_.age).stats() + val s2 = input.map(_.income).stats() + val s3 = input.map(_.score).stats() + UserStats( + age = Stats(s1.max, s1.min, s1.mean, s1.stdev), + income = Stats(s2.max, s2.min, s2.mean, s2.stdev), + score = Stats(s3.max, s3.min, s3.mean, s3.stdev)) + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkAlgebird(input: RDD[User]): UserStats = { + import com.twitter.algebird.spark._ + input.algebird.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/Statistics.scala.html b/Statistics.scala.html new file mode 100644 index 0000000..857318b --- /dev/null +++ b/Statistics.scala.html @@ -0,0 +1,1996 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object Statistics { + + case class Stats(max: Double, min: Double, + sum: Double, count: Long, + mean: Double, stddev: Double) + 
+
+ +
+

Algebird Aggregator

+
def aggregator = { + import com.twitter.algebird._ + 
+
+ +
+

Create 4 Aggregators with different logic

+
 
+
+ +
+

The first 3 are of type Aggregator[Rating, _, Double], which means they take Rating as input and generate Double as output. The last one is of type Aggregator[Rating, _, Moments], where Moments includes count, mean, standard deviation, etc. The input Rating is prepared with a Rating => Double function _.score.

+
val maxOp = Aggregator.max[Double].composePrepare[Rating](_.score) + val minOp = Aggregator.min[Double].composePrepare[Rating](_.score) + val sumOp = Aggregator.prepareMonoid[Rating, Double](_.score) + val momentsOp = Moments.aggregator.composePrepare[Rating](_.score) + 
+
+ +
+

Apply the 4 Aggregators on the same input and present the resulting Tuple4 of (Double, Double, Double, Moments) as Stats

+
MultiAggregator(maxOp, minOp, sumOp, momentsOp) + .andThenPresent { case (max, min, sum, moments) => + Stats(max, min, sum, moments.count, moments.mean, moments.stddev) + } + } + 
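The Moments aggregator accumulates count, mean, and higher moments in a single pass, and can be probed locally. A sketch:

import com.twitter.algebird.Moments

val m = Moments.aggregator(Seq(1.0, 2.0, 3.0))
(m.count, m.mean, m.stddev) // (3, 2.0, ~0.816): population standard deviation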
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Stats] = { + input.aggregate(aggregator) + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Stats] = { + input + .map(_.score) + .stats + .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioAlgebird(input: SCollection[Rating]): SCollection[Stats] = { + input.aggregate(aggregator) + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): Stats = { + val s = input.map(_.score).stats() + Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev) + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkAlgebird(input: RDD[Rating]): Stats = { + import com.twitter.algebird.spark._ + input.algebird.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/index.html b/index.html index addbb14..45086c5 100644 --- a/index.html +++ b/index.html @@ -11,8 +11,10 @@ - [CountDistinctItems.scala](CountDistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala)) - Count Number of Distinct Items - [CountUsers.scala](CountUsers.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala)) - Count the Number of Items of a Given User - [DistinctItems.scala](DistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala)) - Compute Collection of Distinct Items +- [FieldStatistics.scala](FieldStatistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala)) - Compute Basic Descriptive Statistics for Each Field - [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User - [MinItemPerUser.scala](MinItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User +- [Statistics.scala](Statistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala)) - Compute Basic Descriptive Statistics - [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item - [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally - [TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items Globally From b097a8b4d10709cdeb80ef2ed9daa8434382b780 Mon Sep 17 00:00:00 2001 From: Neville Li Date: Mon, 4 Dec 2017 16:47:06 -0500 Subject: [PATCH 05/11] updated site --- FieldStatistics.scala.html | 1 - InvertedIndex.scala.html | 1977 ++++++++++++++++++++++++++++++ JoinLogAndMetadata.scala.html | 2111 +++++++++++++++++++++++++++++++++ JoinLogs.scala.html | 2024 +++++++++++++++++++++++++++++++ index.html | 3 + 5 files changed, 6115 insertions(+), 1 deletion(-) create mode 100644 InvertedIndex.scala.html create mode 100644 JoinLogAndMetadata.scala.html create mode 100644 JoinLogs.scala.html diff --git a/FieldStatistics.scala.html b/FieldStatistics.scala.html index 37c9303..d1e15c6 100644 --- a/FieldStatistics.scala.html +++ b/FieldStatistics.scala.html @@ -339,7 +339,6 @@ score = Stats(maxScore, minScore, mScore.mean, mScore.stddev)) } } -   diff --git a/InvertedIndex.scala.html b/InvertedIndex.scala.html new file mode 100644 index 0000000..057dcb1 --- /dev/null +++ b/InvertedIndex.scala.html @@ -0,0 +1,1977 @@ + + + + + + + +
+

Build an inverted index from a corpus of text documents

+
 
+
+ +
+

Input is a collection of (id, text)

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object InvertedIndex { + + case class Document(id: Int, text: String) + case class Posting(word: String, ids: Seq[Int]) + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Document]): TypedPipe[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group and convert document IDs per key to List[Int]

+
.group + .toList + .map(Posting.tupled) + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Document]): SCollection[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group document IDs per key into Iterable[Int]

+
.groupByKey + .map(kv => Posting(kv._1, kv._2.toSeq)) + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Document]): RDD[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group document IDs per key into Iterable[Int]

+
.groupByKey() + .map(kv => Posting(kv._1, kv._2.toSeq)) + } + +} 
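Note that none of the three variants deduplicates (word, document ID) pairs, so a word repeated within a document appears repeatedly in its posting list. A sketch of the expected output for a tiny corpus, using the types above:

val docs = Seq(Document(1, "to be or not to be"), Document(2, "to be"))
// flatMap emits (to,1) (be,1) (or,1) (not,1) (to,1) (be,1) (to,2) (be,2),
// so grouping yields e.g. Posting(to, Seq(1, 1, 2)) and Posting(or, Seq(1)).
// Add .distinct on the pairs before grouping if duplicates are unwanted.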
+
+ + + + + + \ No newline at end of file diff --git a/JoinLogAndMetadata.scala.html b/JoinLogAndMetadata.scala.html new file mode 100644 index 0000000..9659d86 --- /dev/null +++ b/JoinLogAndMetadata.scala.html @@ -0,0 +1,2111 @@ + + + + + + + +
+

Compute the average age of users who listened to each track by joining log events with user metadata.

+
    +
• LHS input is a large collection of (user, track, timestamp).
  • +
  • RHS input is a small collection of (user, age).
  • +

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.{UserMeta, LogEvent} +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object JoinLogAndMetadata { + 
+
+ +
+

Scalding Naive Approach

+
def scaldingNaive(left: TypedPipe[LogEvent], + right: TypedPipe[UserMeta]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue + left + .groupBy(_.user
+
+ +
+

Join as (user, (LogEvent, UserMeta))

+
.join(right.groupBy(_.user)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Map into (track, age)

+
.map { case (logEvent, userMeta) => + (logEvent.track, userMeta.age.toDouble) + } + .group 
+
+ +
+

Aggregate average age per track

.aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+ +
+

Scalding with Hash Join

+

hashJoin replicates the smaller RHS to all mappers on the LHS

+
def scaldingHashJoin(left: TypedPipe[LogEvent], + right: TypedPipe[UserMeta]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue
+
+ +
+

Map out fields to avoid shuffling large objects

+
val lhs = left.map(e => (e.user, e.track)) 
+
+ +
+

Force to disk to avoid repeating the same computation on each mapper on the LHS

+
val rhs = right.map(u => (u.user, u.age.toDouble)).forceToDisk + + lhs + .hashJoin(rhs) + .values + .group + .aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+
+ +
+

Scio Naive Approach

+
def scioNaive(left: SCollection[LogEvent], + right: SCollection[UserMeta]): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 
+
+ +
+

Join as (user, (track, age))

+
lhs.join(rhs
+
+ +
+

Drop user key, making track the new key in (track, age)

+
.values 
+
+ +
+

Aggregate average age per track

.aggregateByKey(AveragedValue.aggregator) + } + 
+ +
+

Scio with Side Input

+

Side input makes RHS available on all workers

+
def scioSideInput(left: SCollection[LogEvent], + right: SCollection[UserMeta]): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue
+
+ +
+

Convert RHS to a side input of Map[String, Double]

+
val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput
+
+ +
+

Replicate RHS to each worker

left + .withSideInputs(rhs) 
+ +
+

Access side input via the context

+
.map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } 
+
+ +
+

Convert back to regular SCollection

+
.toSCollection + .aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+

Scio with Hash Join

+

hashJoin is a shortcut for the side input approach

+
def scioHashJoin(left: SCollection[LogEvent], + right: SCollection[UserMeta]): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) + lhs.hashJoin(rhs) + .values + .aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+

Spark Naive Approach

+
def sparkNaive(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 
+
+ +
+

Join as (user, (track, age))

+
lhs.join(rhs
+
+ +
+

Drop user key, making track the new key in (track, age)

.values 
+ +
+

Aggregate average age per track

.algebird + .aggregateByKey(AveragedValue.aggregator) + } + 
+ +
+

Spark with Broadcast Variable

+
def sparkBroadcast(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + import com.twitter.algebird.AveragedValue
+
+ +
+

Retrieve SparkContext for creating broadcast variable

+
val sc = left.context
+
+ +
+

Collect RHS to driver memory and broadcast back to workers

+
val map = right.map(u => (u.user, u.age.toDouble)).collectAsMap() + val b = sc.broadcast(map) + + left 
+
+ +
+

In-memory lookup on each worker

+
.map(e => (e.track, b.value.getOrElse(e.user, 0.0))) + .algebird + .aggregateByKey(AveragedValue.aggregator) + } + +} 
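All variants share AveragedValue.aggregator for the final per-track average; it combines partial (count, mean) pairs rather than raw sums, and is easy to check locally. A sketch:

import com.twitter.algebird.AveragedValue

AveragedValue.aggregator(Seq(20.0, 30.0, 40.0)) // 30.0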
+
+ + + + + + \ No newline at end of file diff --git a/JoinLogs.scala.html b/JoinLogs.scala.html new file mode 100644 index 0000000..d3a26ac --- /dev/null +++ b/JoinLogs.scala.html @@ -0,0 +1,2024 @@ + + + + + + + +
+

Given two log datasets of play-track and save-track events, compute tracks that a user saved after playing them within a session.

+
 
+
+ +
+

Inputs are collections of (user, track, timestamp).

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.LogEvent +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object JoinLogs { + + val gapDuration = 3600000
+
+ +
+

Detect whether a pair of (event type, LogEvent) tuples matches a play-then-save sequence

+
def detectPlaySaveSequence(pair: Seq[(String, LogEvent)]): Option[String] = { + val Seq(first, second) = pair + if (first._1 == "play" && second._1 == "save" && first._2.track == second._2.track && + second._2.timestamp - first._2.timestamp <= gapDuration) { + Some(first._2.track) + } else { + None + } + } + 
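All three variants below feed this function with sliding(2) windows over a user's time-sorted events, so each call sees one neighboring pair. A sketch with hypothetical events e1 and e2:

// Hypothetical: e1 plays track "t" at time 0, e2 saves "t" at time 1000.
// detectPlaySaveSequence(Seq(("play", e1), ("save", e2))) == Some("t")
// because the event types and tracks match and 1000 - 0 <= gapDuration.
// Any other ordering, a track mismatch, or a larger gap yields None.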
+
+ +
+

Scalding

+
def scalding(playEvents: TypedPipe[LogEvent], + saveEvents: TypedPipe[LogEvent]): TypedPipe[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))).group + val saves = saveEvents.map(e => (e.user, ("save", e))).group + + plays + .cogroup(saves) { (user, p, s) => 
+
+ +
+

Iterables of play and save events for the user

+
(p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + .toTypedPipe + } + 
+
+ +
+

Scio

+
def scio(playEvents: SCollection[LogEvent], + saveEvents: SCollection[LogEvent]): SCollection[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) + + plays.cogroup(saves
+
+ +
+

Iterables of play and save events for the user

+
.flatMapValues { case (p, s) => + (p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + } + 
+
+ +
+

Spark

+
def spark(playEvents: RDD[LogEvent], saveEvents: RDD[LogEvent]): RDD[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) + + plays.cogroup(saves) + .flatMapValues { case (p, s) => 
+
+ +
+

Iterables of play and save events for the user

+
(p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/index.html b/index.html index 45086c5..a4b0750 100644 --- a/index.html +++ b/index.html @@ -12,6 +12,9 @@ - [CountUsers.scala](CountUsers.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala)) - Count the Number of Items of a Given User - [DistinctItems.scala](DistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala)) - Compute Collection of Distinct Items - [FieldStatistics.scala](FieldStatistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala)) - Compute Basic Descriptive Statistics for Each Field +- [InvertedIndex.scala](InvertedIndex.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala)) - Build Inverted Index +- [JoinLogAndMetadata.scala](JoinLogAndMetadata.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala)) - Join Log and Metadata Datasets +- [JoinLogs.scala](JoinLogs.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala)) - Join Two Log Datasets and Compute Action Sequence - [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User - [MinItemPerUser.scala](MinItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User - [Statistics.scala](Statistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala)) - Compute Basic Descriptive Statistics From fb132cedbed4ab4070b4366561143aa254c2aab1 Mon Sep 17 00:00:00 2001 From: Filipe Regadas Date: Wed, 11 Dec 2019 20:54:14 -0800 Subject: [PATCH 06/11] updated site --- AverageScorePerItem.scala.html | 8 +- Count.scala.html | 2 +- CountDistinctItems.scala.html | 4 +- CountUsers.scala.html | 2 +- DistinctItems.scala.html | 2 +- FieldStatistics.scala.html | 2 +- InvertedIndex.scala.html | 8 +- JoinLogAndMetadata.scala.html | 38 +- JoinLogs.scala.html | 12 +- MaxItemPerUser.scala.html | 14 +- MinItemPerUser.scala.html | 14 +- Sessions.scala.html | 2043 ++++++++++++++++++++++++++++++++ Statistics.scala.html | 6 +- SumPerItem.scala.html | 6 +- TopItems.scala.html | 20 +- TopItemsPerUser.scala.html | 10 +- WordCount.scala.html | 4 +- index.html | 1 + 18 files changed, 2120 insertions(+), 76 deletions(-) create mode 100644 Sessions.scala.html diff --git a/AverageScorePerItem.scala.html b/AverageScorePerItem.scala.html index 41ca090..90750d6 100644 --- a/AverageScorePerItem.scala.html +++ b/AverageScorePerItem.scala.html @@ -334,12 +334,12 @@

Scio

def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { input - .keyBy(_.user
+ .keyBy(_.user

Map into (sum, count)

-
.mapValues(x => (x.score, 1L)) 
+
.mapValues(x => (x.score, 1L)) 
@@ -349,7 +349,7 @@

Map (sum, count) into average

-
.mapValues(p => p._1 / p._2) +
.mapValues(p => p._1 / p._2) }  
@@ -396,7 +396,7 @@

Aggregate average per key

-
.aggregateByKey(AveragedValue.aggregator) + diff --git a/Count.scala.html b/Count.scala.html index 5f57879..64bc8de 100644 --- a/Count.scala.html +++ b/Count.scala.html @@ -323,7 +323,7 @@
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { import com.twitter.algebird.Aggregator.size input - .aggregate(size) + .aggregate(size) }  
diff --git a/CountDistinctItems.scala.html b/CountDistinctItems.scala.html index 853c17d..b4852a0 100644 --- a/CountDistinctItems.scala.html +++ b/CountDistinctItems.scala.html @@ -330,7 +330,7 @@

Scio Exact Approach

def scio(input: SCollection[Rating]): SCollection[Long] = { input - .map(_.item) + .map(_.item) .distinct .count } @@ -341,7 +341,7 @@

Scio Approximate Approach

def scioApprox(input: SCollection[Rating]): SCollection[Long] = { input - .map(_.item) + .map(_.item) .countApproxDistinct() }  
diff --git a/CountUsers.scala.html b/CountUsers.scala.html index 5c0b570..a9ee312 100644 --- a/CountUsers.scala.html +++ b/CountUsers.scala.html @@ -329,7 +329,7 @@

Aggregate globally into a single Long

-
.aggregate(count(_.user == "Smith")) +
.aggregate(count(_.user == "Smith")) }  
diff --git a/DistinctItems.scala.html b/DistinctItems.scala.html index 90239d2..922a9a8 100644 --- a/DistinctItems.scala.html +++ b/DistinctItems.scala.html @@ -297,7 +297,7 @@

Scio

def scio(input: SCollection[Rating]): SCollection[String] = { input - .map(_.item) + .map(_.item) .distinct }  
diff --git a/FieldStatistics.scala.html b/FieldStatistics.scala.html index d1e15c6..9cd7508 100644 --- a/FieldStatistics.scala.html +++ b/FieldStatistics.scala.html @@ -353,7 +353,7 @@

Scio

def scio(input: SCollection[User]): SCollection[UserStats] = { - input.aggregate(aggregator) + input.aggregate(aggregator) }  
diff --git a/InvertedIndex.scala.html b/InvertedIndex.scala.html index 057dcb1..1cc9cdd 100644 --- a/InvertedIndex.scala.html +++ b/InvertedIndex.scala.html @@ -303,7 +303,7 @@

Group and convert document IDs per key to List[Int]

-
.group +
.group .toList .map(Posting.tupled) } @@ -318,13 +318,13 @@

Split text and output (word, document ID)

-
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 

Group document IDs per key into Iterable[Int]

-
.groupByKey - .map(kv => Posting(kv._1, kv._2.toSeq)) +
.groupByKey + .map(kv => Posting(kv._1, kv._2.toSeq)) }  
diff --git a/JoinLogAndMetadata.scala.html b/JoinLogAndMetadata.scala.html index 9659d86..64bec46 100644 --- a/JoinLogAndMetadata.scala.html +++ b/JoinLogAndMetadata.scala.html @@ -311,7 +311,7 @@
.map { case (logEvent, userMeta) => (logEvent.track, userMeta.age.toDouble) } - .group 
+ .group 
@@ -341,9 +341,9 @@
val rhs = right.map(u => (u.user, u.age.toDouble)).forceToDisk lhs - .hashJoin(rhs) - .values - .group + .hashJoin(rhs) + .values + .group .aggregate(AveragedValue.aggregator) .toTypedPipe } @@ -355,13 +355,13 @@
def scioNaive(left: SCollection[LogEvent], right: SCollection[UserMeta]): SCollection[(String, Double)] = { import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) 
+ val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 

Join as (user, (track, age))

-
lhs.join(rhs
+
@@ -371,7 +371,7 @@

Aggregate average age per track

-
@@ -387,24 +387,24 @@

Convert RHS to a side input of Map[String, Double]

-
val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput +
val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput  

Replicate RHS to each worker

- +

Access side input via the context

-
.map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } 
+
.map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } 

Convert back to regular SCollection

-
@@ -415,11 +415,11 @@
def scioHashJoin(left: SCollection[LogEvent], right: SCollection[UserMeta]): SCollection[(String, Double)] = { import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) - lhs.hashJoin(rhs) + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) + lhs.hashJoin(rhs) .values - .aggregateByKey(AveragedValue.aggregator) + .aggregateByKey(AveragedValue.aggregator) }  
@@ -446,7 +446,7 @@

Aggregate average age per track

-
@@ -477,7 +477,7 @@

In-memory lookup on each worker

.map(e => (e.track, b.value.getOrElse(e.user, 0.0))) .algebird - .aggregateByKey(AveragedValue.aggregator) + .aggregateByKey(AveragedValue.aggregator) } } 
diff --git a/JoinLogs.scala.html b/JoinLogs.scala.html index d3a26ac..50606cb 100644 --- a/JoinLogs.scala.html +++ b/JoinLogs.scala.html @@ -313,8 +313,8 @@

Map inputs to key-values and add event type information

-
val plays = playEvents.map(e => (e.user, ("play", e))).group - val saves = saveEvents.map(e => (e.user, ("save", e))).group +
val plays = playEvents.map(e => (e.user, ("play", e))).group + val saves = saveEvents.map(e => (e.user, ("save", e))).group plays .cogroup(saves) { (user, p, s) => 
@@ -344,15 +344,15 @@

Map inputs to key-values and add event type information

-
val plays = playEvents.map(e => (e.user, ("play", e))) - val saves = saveEvents.map(e => (e.user, ("save", e))) +
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) - plays.cogroup(saves
+ plays.cogroup(saves

Iterables of play and save events for the user

-
.flatMapValues { case (p, s) => +
.flatMapValues { case (p, s) => (p ++ s).toList .sortBy(_._2.timestamp
diff --git a/MaxItemPerUser.scala.html b/MaxItemPerUser.scala.html index ffe9c23..57fc277 100644 --- a/MaxItemPerUser.scala.html +++ b/MaxItemPerUser.scala.html @@ -318,17 +318,17 @@

Scio

def scio(input: SCollection[Rating]): SCollection[Rating] = { input - .keyBy(_.user
+ .keyBy(_.user

Compute top one item per key as an Iterable[Rating]

-
.topByKey(1)(Ordering.by(_.score)) 
+
.topByKey(1, Ordering.by(_.score)) 

Drop user key

-
.values 
+
.values 
@@ -343,14 +343,14 @@
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { import com.twitter.algebird.Aggregator.maxBy input - .keyBy(_.user
+ .keyBy(_.user

Aggregate per key into a single Rating based on Double value via _.score. Explicit type due to type inference limitation.

-
.aggregateByKey(maxBy { x: Rating => x.score}) - .values +
.aggregateByKey(maxBy { x: Rating => x.score}) + .values }  
@@ -383,7 +383,7 @@

Aggregate per key into a single Rating based on Double value via _.score. Explicit type due to type inference limitation.

-
.aggregateByKey(maxBy { x: Rating => x.score }) +
.aggregateByKey(maxBy { x: Rating => x.score }) .values }  
diff --git a/MinItemPerUser.scala.html b/MinItemPerUser.scala.html index a208a65..261d0a1 100644 --- a/MinItemPerUser.scala.html +++ b/MinItemPerUser.scala.html @@ -318,17 +318,17 @@

Scio

def scio(input: SCollection[Rating]): SCollection[Rating] = { input - .keyBy(_.user
+ .keyBy(_.user

Compute top one item per key as an Iterable[Rating] with a reverse comparator

-
.topByKey(1)(Ordering.by(-_.score)) 
+
.topByKey(1, Ordering.by(-_.score)) 

Drop user key

-
.values 
+
.values 
@@ -343,14 +343,14 @@
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { import com.twitter.algebird.Aggregator.minBy input - .keyBy(_.user
+ .keyBy(_.user

Aggregate per key into a single Rating based on Double value via _.score. Explicit type due to type inference limitation.

-
.aggregateByKey(minBy { x: Rating => x.score}) - .values +
.aggregateByKey(minBy { x: Rating => x.score}) + .values }  
@@ -383,7 +383,7 @@

Aggregate per key into a single Rating based on Double value via _.score. Explicit type due to type inference limitation.

-
.aggregateByKey(minBy { x: Rating => x.score }) +
.aggregateByKey(minBy { x: Rating => x.score }) .values }  
diff --git a/Sessions.scala.html b/Sessions.scala.html new file mode 100644 index 0000000..7f9735a --- /dev/null +++ b/Sessions.scala.html @@ -0,0 +1,2043 @@ + + + + + + + +
+

Input is a collection of log events

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.LogEvent +import com.spotify.scio.extra.Iterators._ +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD +import org.joda.time.Instant + +import scala.collection.mutable + +object Sessions { + + val gapDuration = 3600000 + + case class Session(user: String, duration: Long, numItems: Int) + 
+
+ +
+

Wrapper for Iterator[LogEvent] that groups items into sessions

+
class SessionIterator(self: Iterator[LogEvent]) extends Iterator[Seq[LogEvent]] { 
+
+ +
+

BufferedIterator allows peeking ahead

+
private val bi = self.buffered + override def hasNext: Boolean = bi.hasNext + override def next(): Seq[LogEvent] = { + val buf = mutable.Buffer(bi.next()) + var last = buf.head.timestamp
+
+ +
+

Consume subsequent events until a gap is detected

+
while (bi.hasNext && bi.head.timestamp - last < gapDuration) { + val n = bi.next() + buf.append(n) + last = n.timestamp + } + buf + } + } + 
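A minimal plain-Scala sketch of the same gap-based grouping, useful for checking the logic outside a pipeline (the Event record and the sample timestamps below are illustrative, not part of the job above):

object SessionSketch {
  case class Event(user: String, timestamp: Long)

  // Split a timestamp-sorted sequence into sessions, starting a new session
  // whenever the gap between consecutive events reaches gapDuration
  def sessions(events: Seq[Event], gapDuration: Long): Seq[Seq[Event]] =
    events.foldLeft(Vector.empty[Vector[Event]]) { (acc, e) =>
      acc.lastOption match {
        case Some(s) if e.timestamp - s.last.timestamp < gapDuration =>
          acc.init :+ (s :+ e)
        case _ => acc :+ Vector(e)
      }
    }

  def main(args: Array[String]): Unit = {
    val events = Seq(0L, 10L, 20L, 5000L, 5010L).map(Event("u1", _))
    println(sessions(events, gapDuration = 1000L).map(_.size)) // Vector(3, 2)
  }
}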
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[LogEvent]): TypedPipe[Session] = { + input + .groupBy(_.user
+
+ +
+

sortBy uses Hadoop secondary sort to sort keys during shuffle

+
.sortBy(_.timestamp
+
+ +
+

Iterate over values lazily and group items into sessions

+
.mapValueStream(new SessionIterator(_)) + .toTypedPipe 
+
+ +
+

Map over each (user, session items)

+
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[LogEvent]): SCollection[Session] = { + input 
+
+ +
+

Values in groupBy are sorted by timestamp

+
.timestampBy(e => new Instant(e.timestamp)) 
+
+ +
+

No secondary sort in Scio, shuffle all items

+
.groupBy(_.user) + .flatMapValues { _ + .iterator 
+
+ +
+

Generic version of SessionIterator from scio-extra

+
.timeSeries(_.timestamp) + .session(gapDuration) + } 
+
+ +
+

Map over each (user, session items)

+
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + 
+
+ +
+

Spark

+
def spark(input: RDD[LogEvent]): RDD[Session] = { + input 
+
+ +
+

No secondary sort in Spark, shuffle all items

+
.groupBy(_.user) + .flatMapValues { _ 
+
+ +
+

Order of values after shuffle is not guaranteed

+
.toList.sortBy(_.timestamp) + .iterator 
+
+ +
+

Generic version of SessionIterator from scio-extra

+
.timeSeries(_.timestamp) + .session(gapDuration) + } 
+
+ +
+

Map over each (user, session items)

+
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + +} 
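Because the order of values after a shuffle is not guaranteed, the Scio and Spark variants must sort each user's events by timestamp before grouping. That step on plain Scala collections, reusing the hypothetical Event and sessions helper from the sketch above (allEvents is illustrative):

val byUser = allEvents.groupBy(_.user)  // allEvents: Seq[Event]
val perUserSessions = byUser.map { case (user, events) =>
  user -> sessions(events.sortBy(_.timestamp), gapDuration = 3600000L)
}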
+
+ + + + + + \ No newline at end of file diff --git a/Statistics.scala.html b/Statistics.scala.html index 857318b..fde5ae6 100644 --- a/Statistics.scala.html +++ b/Statistics.scala.html @@ -334,9 +334,9 @@

Scio

def scio(input: SCollection[Rating]): SCollection[Stats] = { input - .map(_.score) + .map(_.score) .stats - .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) + .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) }  
@@ -344,7 +344,7 @@

Scio with Algebird Aggregator

def scioAlgebird(input: SCollection[Rating]): SCollection[Stats] = { - input.aggregate(aggregator) + input.aggregate(aggregator) }  
diff --git a/SumPerItem.scala.html b/SumPerItem.scala.html index 1814cf3..cf01344 100644 --- a/SumPerItem.scala.html +++ b/SumPerItem.scala.html @@ -320,8 +320,8 @@

Scio

def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { input - .map(x => (x.item, x.score)) - .sumByKey + .map(x => (x.item, x.score)) + .sumByKey }  
@@ -365,7 +365,7 @@

Aggregate per key with an aggregator that converts UserItemData to Double via _.score before reduce. Explicit type due to type inference limitation.

-
.aggregateByKey(prepareMonoid { x: Rating => x.score }) +
.aggregateByKey(prepareMonoid { x: Rating => x.score }) } } 
diff --git a/TopItems.scala.html b/TopItems.scala.html index 69055c1..d788649 100644 --- a/TopItems.scala.html +++ b/TopItems.scala.html @@ -290,7 +290,7 @@
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { input .map(x => (x.item, x.score)) - .group 
+ .group 
@@ -315,7 +315,7 @@

Flatten result Seq[(String, Double)]

-
.flatten +
.flatten }  
@@ -327,7 +327,7 @@ val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) input .map(x => (x.item, x.score)) - .group  + .group 
@@ -343,7 +343,7 @@

Flatten result Seq[(String, Double)]

-
.flatten +
.flatten }  
@@ -352,17 +352,17 @@

Scio

def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { input - .map(x => (x.item, x.score)) 
+ .map(x => (x.item, x.score)) 

Sum values with an implicit Semigroup[Double]

- +

Compute top K as an Iterable[(String, Double)]

-
.top(topK)(Ordering.by(_._2)) 
+
.top(topK, Ordering.by(_._2)) 
@@ -378,17 +378,17 @@ import com.twitter.algebird.Aggregator.sortedReverseTake val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) input - .map(x => (x.item, x.score))  + .map(x => (x.item, x.score)) 

Sum values with an implicit Semigroup[Double]

- +

Aggregate globally into a single Seq[(String, Double)]

- +
diff --git a/TopItemsPerUser.scala.html b/TopItemsPerUser.scala.html index cc0c387..edc301c 100644 --- a/TopItemsPerUser.scala.html +++ b/TopItemsPerUser.scala.html @@ -304,7 +304,7 @@

Flatten result Seq[Rating]

-
.flatten +
.flatten }  
@@ -313,17 +313,17 @@

Scio

def scio(input: SCollection[Rating]): SCollection[Rating] = { input - .keyBy(_.user
+ .keyBy(_.user

Compute top K per key

-
.topByKey(topK)(Ordering.by(_.score)) 
+
.topByKey(topK, Ordering.by(_.score)) 

Drop user key

-
.values 
+
.values 
@@ -369,7 +369,7 @@

Aggregate per key into a Seq[Rating]

- +
diff --git a/WordCount.scala.html b/WordCount.scala.html index f36e393..f9c51c4 100644 --- a/WordCount.scala.html +++ b/WordCount.scala.html @@ -306,8 +306,8 @@

Scio

def scio(input: SCollection[String]): SCollection[(String, Long)] = { input - .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) - .countByValue + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) + .countByValue }  
diff --git a/index.html b/index.html index a4b0750..b378a5d 100644 --- a/index.html +++ b/index.html @@ -17,6 +17,7 @@ - [JoinLogs.scala](JoinLogs.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala)) - Join Two Log Datasets and Compute Action Sequence - [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User - [MinItemPerUser.scala](MinItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User +- [Sessions.scala](Sessions.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala)) - Compute Session Duration and Number of Items from Log Data - [Statistics.scala](Statistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala)) - Compute Basic Descriptive Statistics - [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item - [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally From 47e12f313053d71cf5f7908cc3a5412e65f97859 Mon Sep 17 00:00:00 2001 From: Filipe Regadas Date: Fri, 13 Dec 2019 15:24:56 -0500 Subject: [PATCH 07/11] updated site --- AverageScorePerItem.scala.html | 2030 ------------------------------ Count.scala.html | 1986 ------------------------------ CountDistinctItems.scala.html | 1997 ------------------------------ CountUsers.scala.html | 1993 ------------------------------ DistinctItems.scala.html | 1942 ----------------------------- FieldStatistics.scala.html | 2014 ------------------------------ InvertedIndex.scala.html | 1977 ------------------------------ JoinLogAndMetadata.scala.html | 2111 -------------------------------- JoinLogs.scala.html | 2024 ------------------------------ MaxItemPerUser.scala.html | 2034 ------------------------------ MinItemPerUser.scala.html | 2034 ------------------------------ Sessions.scala.html | 2043 ------------------------------- Statistics.scala.html | 1996 ------------------------------ SumPerItem.scala.html | 1999 ------------------------------ TopItems.scala.html | 2069 ------------------------------- TopItemsPerUser.scala.html | 2028 ------------------------------ WordCount.scala.html | 1976 ------------------------------ index.html | 29 - 18 files changed, 34282 deletions(-) delete mode 100644 AverageScorePerItem.scala.html delete mode 100644 Count.scala.html delete mode 100644 CountDistinctItems.scala.html delete mode 100644 CountUsers.scala.html delete mode 100644 DistinctItems.scala.html delete mode 100644 FieldStatistics.scala.html delete mode 100644 InvertedIndex.scala.html delete mode 100644 JoinLogAndMetadata.scala.html delete mode 100644 JoinLogs.scala.html delete mode 100644 MaxItemPerUser.scala.html delete mode 100644 MinItemPerUser.scala.html delete mode 100644 Sessions.scala.html delete mode 100644 Statistics.scala.html delete mode 100644 SumPerItem.scala.html 
delete mode 100644 TopItems.scala.html delete mode 100644 TopItemsPerUser.scala.html delete mode 100644 WordCount.scala.html delete mode 100644 index.html diff --git a/AverageScorePerItem.scala.html b/AverageScorePerItem.scala.html deleted file mode 100644 index 90750d6..0000000 --- a/AverageScorePerItem.scala.html +++ /dev/null @@ -1,2030 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.algebird.Semigroup -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object AverageScorePerItem { - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - input - .groupBy(_.user
-
- -
-

Map into (sum, count)

-
.mapValues(x => (x.score, 1L)) 
-
- -
-

Sum both per key with an implicit Semigroup[(Double, Long)]

-
.sum 
-
- -
-

Map (sum, count) into average

-
.mapValues(p => p._1 / p._2) - .toTypedPipe - } - 
-
- -
-

Scalding with Algebird Aggregator

-
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - import com.twitter.algebird.AveragedValue - input - .groupBy(_.user
-
- -
-

Map values into Double

-
.mapValues(_.score
-
- -
-

Aggregate average per key

- -
- -
-

Scio

-
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { - input - .keyBy(_.user
-
- -
-

Map into (sum, count)

-
.mapValues(x => (x.score, 1L)) 
-
- -
-

Sum both per key with an implicit Semigroup[(Double, Long)]

-
.sumByKey 
-
- -
-

Map (sum, count) into average

-
.mapValues(p => p._1 / p._2) - } - 
-
- -
-

Spark

-

Summon an Algebird Semigroup[(Double, Long)] with implicit argument

-
def spark(input: RDD[Rating])(implicit sg: Semigroup[(Double, Long)]): RDD[(String, Double)] = { - input - .keyBy(_.user
-
- -
-

Map into (sum, count)

-
.mapValues(x => (x.score, 1L)) 
-
- -
-

Reduce both per key with plus = (T, T) => T where T is (Double, Long)

-
.reduceByKey(sg.plus) // plus: (T, T) => T where T is (Double, Long) 
-
- -
-

Map (sum, count) into average

-
.mapValues(p => p._1 / p._2) - } - 
-
- -
-

Spark with Algebird Aggregator

-
def sparkWithAlgebird(input: RDD[Rating]): RDD[(String, Double)] = { - import com.twitter.algebird.AveragedValue - import com.twitter.algebird.spark._ - input - .keyBy(_.user) - .mapValues(_.score
-
- -
-

Map values into Double

- -
- -
-

Aggregate average per key

- -
- - - - - - \ No newline at end of file diff --git a/Count.scala.html b/Count.scala.html deleted file mode 100644 index 64bc8de..0000000 --- a/Count.scala.html +++ /dev/null @@ -1,1986 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object Count { - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { - input - .map(_ => 1L
-
- -
-

Sum with an implicit Semigroup[Long]

-
.sum - .toTypedPipe - } - 
-
- -
-

Scalding with Algebird Aggregator

-
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { - import com.twitter.algebird.Aggregator.size - input - .aggregate(size) - .toTypedPipe - } - 
-
- -
-

Scio

-
def scio(input: SCollection[Rating]): SCollection[Long] = { - input - .count - } - 
-
- -
-

Scio with Algebird Aggregator

-
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { - import com.twitter.algebird.Aggregator.size - input - .aggregate(size) - } - 
-
- -
-

Spark

-
def spark(input: RDD[Rating]): Long = { - input 
-
- -
-

count is an action and collects data back to the driver node

-
.count - } - 
-
- -
-

Spark with Algebird Aggregator

-
def sparkWithAlgebird(input: RDD[Rating]): Long = { - import com.twitter.algebird.Aggregator.size - import com.twitter.algebird.spark._ - input - .algebird 
-
- -
-

aggregate is an action and collects data back to the driver node

-
.aggregate(size) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/CountDistinctItems.scala.html b/CountDistinctItems.scala.html deleted file mode 100644 index b4852a0..0000000 --- a/CountDistinctItems.scala.html +++ /dev/null @@ -1,1997 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.google.common.base.Charsets -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object CountDistinctItems { - 
-
- -
-

Scalding Exact Approach

-
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { - input - .map(_.item
-
- -
-

Remove duplicates, requires a shuffle

-
.distinct - .map(_ => 1L
-
- -
-

Sum with an implicit Semigroup[Long]

-
.sum - .toTypedPipe - } - 
-
- -
-

Scalding Approximate Approach

-
def scaldingApproxWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Double] = { - import com.twitter.algebird.HyperLogLogAggregator - val aggregator = HyperLogLogAggregator.sizeAggregator(bits = 12) - input 
-
- -
-

HyperLogLog expects bytes input

-
.map(_.item.getBytes(Charsets.UTF_8)) 
-
- -
-

Aggregate globally into a Double

- -
- -
-

Scio Exact Approach

-
def scio(input: SCollection[Rating]): SCollection[Long] = { - input - .map(_.item) - .distinct - .count - } - 
-
- -
-

Scio Approximate Approach

-
def scioApprox(input: SCollection[Rating]): SCollection[Long] = { - input - .map(_.item) - .countApproxDistinct() - } - 
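The approximate variants trade exactness for constant memory. As a sanity check, the same Algebird aggregator can be applied to a local collection (the sample items are illustrative; with 12 bits the typical HyperLogLog error is about 1.04 / sqrt(2^12), roughly 1.6%):

import com.google.common.base.Charsets
import com.twitter.algebird.HyperLogLogAggregator

val aggregator = HyperLogLogAggregator.sizeAggregator(bits = 12)
val items = Seq("a", "b", "a", "c")
// HyperLogLog expects bytes input, as above
val approxDistinct: Double = aggregator(items.map(_.getBytes(Charsets.UTF_8)))
// close to 3.0 for a small input like this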
-
- -
-

Spark Exact Approach

-
def spark(input: RDD[Rating]): Long = { - input - .map(_.item) - .distinct() - .count() - } - 
-
- -
-

Spark Approximate Approach

-
def sparkApprox(input: RDD[Rating]): Long = { - input - .map(_.item) - .countApproxDistinct() - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/CountUsers.scala.html b/CountUsers.scala.html deleted file mode 100644 index a9ee312..0000000 --- a/CountUsers.scala.html +++ /dev/null @@ -1,1993 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object CountUsers { - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { - input - .filter(_.user == "Smith") - .map(_ => 1L
-
- -
-

Sum with an implicit Semigroup[Long]

-
.sum - .toTypedPipe - } - 
-
- -
-

Scalding with Algebird Aggregator

-
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { - import com.twitter.algebird.Aggregator.count - input 
-
- -
-

Aggregate globally into a single Long

-
.aggregate(count(_.user == "Smith")) - .toTypedPipe - } - - def scio(input: SCollection[Rating]): SCollection[Long] = { - input - .filter(_.user == "Smith") - .count - } - 
-
- -
-

Scio with Algebird Aggregator

-
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { - import com.twitter.algebird.Aggregator.count - input 
-
- -
-

Aggregate globally into a single Long

-
.aggregate(count(_.user == "Smith")) - } - 
-
- -
-

Spark

-
def spark(input: RDD[Rating]): Long = { - input - .filter(_.user == "Smith"
-
- -
-

count is an action and collects data back to the driver node

-
.count() - } - 
-
- -
-

Spark with Algebird Aggregator

-
def sparkWithAlgebird(input: RDD[Rating]): Long = { - import com.twitter.algebird.Aggregator.count - import com.twitter.algebird.spark._ - input - .algebird 
-
- -
-

aggregate is an action and collects data back to the driver node

-
.aggregate(count(_.user == "Smith")) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/DistinctItems.scala.html b/DistinctItems.scala.html deleted file mode 100644 index 922a9a8..0000000 --- a/DistinctItems.scala.html +++ /dev/null @@ -1,1942 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object DistinctItems { - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[String] = { - input - .map(_.item) - .distinct - } - 
-
- -
-

Scio

-
def scio(input: SCollection[Rating]): SCollection[String] = { - input - .map(_.item) - .distinct - } - 
-
- -
-

Spark

-
def spark(input: RDD[Rating]): RDD[String] = { - input - .map(_.item) - .distinct() - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/FieldStatistics.scala.html b/FieldStatistics.scala.html deleted file mode 100644 index 9cd7508..0000000 --- a/FieldStatistics.scala.html +++ /dev/null @@ -1,2014 +0,0 @@ - - - - - - - -
-

Input is a collection of case classes

-
package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object FieldStatistics { - - case class User(age: Int, income: Double, score: Double) - case class Stats(max: Double, min: Double, mean: Double, stddev: Double) - case class UserStats(age: Stats, income: Stats, score: Stats) - 
-
- -
-

Algebird Aggregator

-
def aggregator = { - import com.twitter.algebird._ - 
-
- -
-

Create 3 Aggregators on age field with different logic

-
 
-
- -
-

The first 2 are of type Aggregator[User, _, Int] which means it takes User as input and -generates Int as output. The last one is of type Aggregator[User, _, Moments], -where Moments include count, mean, standard deviation, etc. The input User is prepared -with a User => Int function _.age.

-
val maxAgeOp = Aggregator.max[Int].composePrepare[User](_.age) - val minAgeOp = Aggregator.min[Int].composePrepare[User](_.age) - val momentsAgeOp = Moments.aggregator.composePrepare[User](_.age) - 
-
- -
-

Create 3 Aggregators on income field with different logic

-
val maxIncomeOp = Aggregator.max[Double].composePrepare[User](_.income) - val minIncomeOp = Aggregator.min[Double].composePrepare[User](_.income) - val momentsIncomeOp = Moments.aggregator.composePrepare[User](_.income) - 
-
- -
-

Create 3 Aggregators on score field with different logic

-
val maxScoreOp = Aggregator.max[Double].composePrepare[User](_.score) - val minScoreOp = Aggregator.min[Double].composePrepare[User](_.score) - val momentsScoreOp = Moments.aggregator.composePrepare[User](_.score) - 
-
- -
-

Apply 12 Aggregators on the same input, present the resulting Tuple12 as UserStats.

-
MultiAggregator( - maxAgeOp, minAgeOp, momentsAgeOp, - maxIncomeOp, minIncomeOp, momentsIncomeOp, - maxScoreOp, minScoreOp, momentsScoreOp) - .andThenPresent { t => - val (maxAge, minAge, mAge, maxIncome, minIncome, mIncome, maxScore, minScore, mScore) = t - UserStats( - age = Stats(maxAge, minAge, mAge.mean, mAge.stddev), - income = Stats(maxIncome, minIncome, mIncome.mean, mIncome.stddev), - score = Stats(maxScore, minScore, mScore.mean, mScore.stddev)) - } - } - 
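The composePrepare + MultiAggregator pattern can be exercised on a local list; a small sketch with two of the twelve aggregators (the data is illustrative, and User is redeclared to keep the snippet self-contained):

import com.twitter.algebird.{Aggregator, MultiAggregator}

case class User(age: Int, income: Double, score: Double)
val maxAge = Aggregator.max[Int].composePrepare[User](_.age)
val minAge = Aggregator.min[Int].composePrepare[User](_.age)
val both   = MultiAggregator(maxAge, minAge)  // Aggregator[User, _, (Int, Int)]
println(both(Seq(User(20, 1.0, 0.5), User(35, 2.0, 0.7))))  // (35,20)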
-
- -
-

Scalding

-
def scalding(input: TypedPipe[User]): TypedPipe[UserStats] = { - input.aggregate(aggregator) - } - 
-
- -
-

Scio

-
def scio(input: SCollection[User]): SCollection[UserStats] = { - input.aggregate(aggregator) - } - 
-
- -
-

Spark

-
def spark(input: RDD[User]): UserStats = { 
-
- -
-

Compute each field separately, potentially inefficient if the input is not cached

-
val s1 = input.map(_.age).stats() - val s2 = input.map(_.income).stats() - val s3 = input.map(_.score).stats() - UserStats( - age = Stats(s1.max, s1.min, s1.mean, s1.stdev), - income = Stats(s2.max, s2.min, s2.mean, s2.stdev), - score = Stats(s3.max, s3.min, s3.mean, s3.stdev)) - } - 
-
- -
-

Spark with Algebird Aggregator

-
def sparkAlgebird(input: RDD[User]): UserStats = { - import com.twitter.algebird.spark._ - input.algebird.aggregate(aggregator) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/InvertedIndex.scala.html b/InvertedIndex.scala.html deleted file mode 100644 index 1cc9cdd..0000000 --- a/InvertedIndex.scala.html +++ /dev/null @@ -1,1977 +0,0 @@ - - - - - - - -
-

Build inverted index from a corpus of text documents

-
 
-
- -
-

Input is a collection of (id, text)

-
package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object InvertedIndex { - - case class Document(id: Int, text: String) - case class Posting(word: String, ids: Seq[Int]) - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Document]): TypedPipe[Posting] = { - input 
-
- -
-

Split text and output (word, document ID)

-
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
-
- -
-

Group and convert document IDs per key to List[Int]

-
.group - .toList - .map(Posting.tupled) - } - 
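For intuition, the same inverted-index construction on a local collection (the documents are illustrative; like the pipelines, this sketch keeps duplicate IDs, and map ordering is unspecified):

val docs = Seq(Document(1, "a rose is a rose"), Document(2, "a daisy"))
val index: Seq[Posting] = docs
  .flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id)))
  .groupBy(_._1)
  .map { case (w, pairs) => Posting(w, pairs.map(_._2)) }
  .toSeq
// Posting("a", Seq(1, 1, 2)), Posting("rose", Seq(1, 1)), ...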
-
- -
-

Scio

-
def scio(input: SCollection[Document]): SCollection[Posting] = { - input 
-
- -
-

Split text and output (word, document ID)

-
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
-
- -
-

Group document IDs per key into Iterable[Int]

-
.groupByKey - .map(kv => Posting(kv._1, kv._2.toSeq)) - } - 
-
- -
-

Spark

-
def spark(input: RDD[Document]): RDD[Posting] = { - input 
-
- -
-

Split text and output (word, document ID)

-
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
-
- -
-

Group document IDs per key into Iterable[Int]

-
.groupByKey() - .map(kv => Posting(kv._1, kv._2.toSeq)) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/JoinLogAndMetadata.scala.html b/JoinLogAndMetadata.scala.html deleted file mode 100644 index 64bec46..0000000 --- a/JoinLogAndMetadata.scala.html +++ /dev/null @@ -1,2111 +0,0 @@ - - - - - - - -
-

Compute average age of users who listened to a track by joining log event and user metadata.

-
    -
• LHS input is a large collection of (user, track, timestamp).
  • -
  • RHS input is a small collection of (user, age).
  • -

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.{UserMeta, LogEvent} -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object JoinLogAndMetadata { - 
-
- -
-

Scalding Naive Approach

-
def scaldingNaive(left: TypedPipe[LogEvent], - right: TypedPipe[UserMeta]): TypedPipe[(String, Double)] = { - import com.twitter.algebird.AveragedValue - left - .groupBy(_.user
-
- -
-

Join as (user, (LogEvent, UserMeta))

-
.join(right.groupBy(_.user)) 
-
- -
-

Drop user key

-
.values 
-
- -
-

Map into (track, age)

-
.map { case (logEvent, userMeta) => - (logEvent.track, userMeta.age.toDouble) - } - .group 
-
- -
-

Aggregate average age per track

- -
- -
-

Scalding with Hash Join

-

hashJoin replicates the smaller RHS to all mappers on the LHS

-
def scaldingHashJoin(left: TypedPipe[LogEvent], - right: TypedPipe[UserMeta]): TypedPipe[(String, Double)] = { - import com.twitter.algebird.AveragedValue
-
- -
-

Map out fields to avoid shuffling large objects

-
val lhs = left.map(e => (e.user, e.track)) 
-
- -
-

Force to disk to avoid repeating the same computation on each mapper on the LHS

-
val rhs = right.map(u => (u.user, u.age.toDouble)).forceToDisk - - lhs - .hashJoin(rhs) - .values - .group - .aggregate(AveragedValue.aggregator) - .toTypedPipe - } - 
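The join semantics that hashJoin provides are easy to state on in-memory collections; a sketch of what happens on each mapper once the RHS has been replicated (names and data are illustrative):

// Inner join a large left side against a small right side held in memory
def hashJoin[K, V, W](left: Seq[(K, V)], right: Map[K, W]): Seq[(K, (V, W))] =
  left.flatMap { case (k, v) => right.get(k).map(w => (k, (v, w))) }

val lhs = Seq(("u1", "t1"), ("u2", "t2"), ("u3", "t3"))
val rhs = Map("u1" -> 23.0, "u2" -> 31.0)
println(hashJoin(lhs, rhs))  // u3 is dropped, as in an inner join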
-
- -
-

Scio Naive Approach

-
def scioNaive(left: SCollection[LogEvent], - right: SCollection[UserMeta]): SCollection[(String, Double)] = { - import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) 
-
- -
-

Join as (user, (track, age))

- -
- -
-

Drop user key to make track the new key in (track, age)

-
.values 
-
- -
-

Aggregate average age per track

- -
- -
-

Scio with Side Input

-

Side input makes RHS available on all workers

-
def scioSideInput(left: SCollection[LogEvent], - right: SCollection[UserMeta]): SCollection[(String, Double)] = { - import com.twitter.algebird.AveragedValue
-
- -
-

Convert RHS to a side input of Map[String, Double]

-
val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput
-
- -
-

Replicate RHS to each worker

- -
- -
-

Access side input via the context

-
.map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } 
-
- -
-

Convert back to regular SCollection

- -
- -
-

Scio with Hash Join

-

hashJoin is a shortcut for the side input approach

-
def scioHashJoin(left: SCollection[LogEvent], - right: SCollection[UserMeta]): SCollection[(String, Double)] = { - import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) - lhs.hashJoin(rhs) - .values - .aggregateByKey(AveragedValue.aggregator) - } - 
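Spelled out, the side-input pattern that hashJoin wraps condenses to a few lines; a sketch assuming the same left and right inputs (withSideInputs and toSCollection are the Scio calls elided in the rendered page above):

val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput
val joined = left
  .withSideInputs(rhs)  // replicate RHS to each worker
  .map { case (e, ctx) => (e.track, ctx(rhs).getOrElse(e.user, 0.0)) }
  .toSCollection        // back to a regular SCollection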
-
- -
-

Spark Naive Approach

-
def sparkNaive(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { - import com.twitter.algebird.spark._ - import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) 
-
- -
-

Join as (user, (track, age))

-
lhs.join(rhs
-
- -
-

Drop user key to make track the new key in (track, age)

- -
- -
-

Aggregate average age per track

- -
- -
-

Spark with Broadcast Variable

-
def sparkBroadcast(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { - import com.twitter.algebird.spark._ - import com.twitter.algebird.AveragedValue
-
- -
-

Retrieve SparkContext for creating broadcast variable

-
val sc = left.context
-
- -
-

Collect RHS to driver memory and broadcast back to workers

-
val map = right.map(u => (u.user, u.age.toDouble)).collectAsMap() - val b = sc.broadcast(map) - - left 
-
- -
-

In-memory lookup on each worker

-
.map(e => (e.track, b.value.getOrElse(e.user, 0.0))) - .algebird - .aggregateByKey(AveragedValue.aggregator) - } - -} 
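The broadcast step in isolation, as a sketch (sc, the map contents and logEvents are illustrative, not part of the original file):

val ages: Map[String, Double] = Map("u1" -> 23.0, "u2" -> 31.0)
val b = sc.broadcast(ages)  // serialized once and cached on each executor
val trackAges = logEvents.map(e => (e.track, b.value.getOrElse(e.user, 0.0)))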
-
- - - - - - \ No newline at end of file diff --git a/JoinLogs.scala.html b/JoinLogs.scala.html deleted file mode 100644 index 50606cb..0000000 --- a/JoinLogs.scala.html +++ /dev/null @@ -1,2024 +0,0 @@ - - - - - - - -
-

Given two log datasets of play track and save track events, compute tracks that a user saved -after playing in a session.

-
 
-
- -
-

Inputs are collections of (user, item, timestamp).

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.LogEvent -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object JoinLogs { - - val gapDuration = 3600000
-
- -
-

Detect if a pair of (event type, LogEvent) tuples match a play and save sequence

-
def detectPlaySaveSequence(pair: Seq[(String, LogEvent)]): Option[String] = { - val Seq(first, second) = pair - if (first._1 == "play" && second._1 == "save" && first._2.track == second._2.track && - second._2.timestamp - first._2.timestamp <= gapDuration) { - Some(first._2.track) - } else { - None - } - } - 
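A quick plain-Scala check of the pair detection via sliding windows (the events are hypothetical, assuming a LogEvent(user, track, timestamp) field order; sliding(2) yields every neighboring pair):

val events = Seq(
  ("play", LogEvent("u1", "t1", 0L)),
  ("save", LogEvent("u1", "t1", 1000L)),
  ("play", LogEvent("u1", "t2", 2000L)))
val saved = events.sliding(2).flatMap(detectPlaySaveSequence).toList
// List("t1"): the second window is (save, play) and does not match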
-
- -
-

Scalding

-
def scalding(playEvents: TypedPipe[LogEvent], - saveEvents: TypedPipe[LogEvent]): TypedPipe[(String, String)] = { 
-
- -
-

Map inputs to key-values and add event type information

-
val plays = playEvents.map(e => (e.user, ("play", e))).group - val saves = saveEvents.map(e => (e.user, ("save", e))).group - - plays - .cogroup(saves) { (user, p, s) => 
-
- -
-

Iterables of play and save events for the user

-
(p ++ s).toList - .sortBy(_._2.timestamp
-
- -
-

Neighboring pairs

-
.sliding(2) - .flatMap(detectPlaySaveSequence) - } - .toTypedPipe - } - 
-
- -
-

Scio

-
def scio(playEvents: SCollection[LogEvent], - saveEvents: SCollection[LogEvent]): SCollection[(String, String)] = { 
-
- -
-

Map inputs to key-values and add event type information

-
val plays = playEvents.map(e => (e.user, ("play", e))) - val saves = saveEvents.map(e => (e.user, ("save", e))) - - plays.cogroup(saves
-
- -
-

Iterables of play and save events for the user

-
.flatMapValues { case (p, s) => - (p ++ s).toList - .sortBy(_._2.timestamp
-
- -
-

Neighboring pairs

-
.sliding(2) - .flatMap(detectPlaySaveSequence) - } - } - 
-
- -
-

Spark

-
def spark(playEvents: RDD[LogEvent], saveEvents: RDD[LogEvent]): RDD[(String, String)] = { 
-
- -
-

Map inputs to key-values and add event type information

-
val plays = playEvents.map(e => (e.user, ("play", e))) - val saves = saveEvents.map(e => (e.user, ("save", e))) - - plays.cogroup(saves) - .flatMapValues { case (p, s) => 
-
- -
-

Iterables of play and save events for the user

-
(p ++ s).toList - .sortBy(_._2.timestamp
-
- -
-

Neighboring pairs

-
.sliding(2) - .flatMap(detectPlaySaveSequence) - } - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/MaxItemPerUser.scala.html b/MaxItemPerUser.scala.html deleted file mode 100644 index 57fc277..0000000 --- a/MaxItemPerUser.scala.html +++ /dev/null @@ -1,2034 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object MaxItemPerUser { - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { - input - .groupBy(_.user
-
- -
-

Reduce items per key by picking the side with higher score for each pair of input

-
.reduce((x, y) => if (x.score > y.score) x else y) - .values - } - 
-
- -
-

Scalding with Algebird Aggregator

-
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { - import com.twitter.algebird.Aggregator.maxBy - input - .groupBy(_.user
-
- -
-

Aggregate per key into a single Rating based on Double value via _.score

-
.aggregate(maxBy(_.score)) - .values - } - 
-
- -
-

Scio

-
def scio(input: SCollection[Rating]): SCollection[Rating] = { - input - .keyBy(_.user
-
- -
-

Compute top one item per key as an Iterable[Rating]

-
.topByKey(1, Ordering.by(_.score)) 
-
- -
-

Drop user key

-
.values 
-
- -
-

Flatten result Iterable[Rating]

-
.flatten - } - 
-
- -
-

Scio with Algebird Aggregator

-
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { - import com.twitter.algebird.Aggregator.maxBy - input - .keyBy(_.user
-
- -
-

Aggregate per key into a single Rating based on Double value via _.score. Explicit -type due to type inference limitation.

-
.aggregateByKey(maxBy { x: Rating => x.score}) - .values - } - 
-
- -
-

Spark

-
def spark(input: RDD[Rating]): RDD[Rating] = { - input - .keyBy(_.user
-
- -
-

Reduce items per key by picking the side with higher score for each pair of input

-
.reduceByKey((x, y) => if (x.score > y.score) x else y) - .values - } - 
-
- -
-

Spark with Algebird Aggregator

-
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { - import com.twitter.algebird.Aggregator.maxBy - import com.twitter.algebird.spark._ - input - .keyBy(_.user) - .algebird 
-
- -
-

Aggregate per key into a single Rating based on Double value via _.score. Explicit -type due to type inference limitation.

-
.aggregateByKey(maxBy { x: Rating => x.score }) - .values - } - 
-
- -
-

Spark with MLLib

-
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { - import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ - input - .keyBy(_.user
-
- -
-

From spark-mllib, compute top K per key with a priority queue

-
.topByKey(1)(Ordering.by(_.score)) - .flatMap(_._2) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/MinItemPerUser.scala.html b/MinItemPerUser.scala.html deleted file mode 100644 index 261d0a1..0000000 --- a/MinItemPerUser.scala.html +++ /dev/null @@ -1,2034 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object MinItemPerUser { - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { - input - .groupBy(_.user
-
- -
-

Reduce items per key by picking the side with lower score for each pair of input

-
.reduce((x, y) => if (x.score < y.score) x else y) - .values - } - 
-
- -
-

Scalding with Algebird Aggregator

-
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { - import com.twitter.algebird.Aggregator.minBy - input - .groupBy(_.user
-
- -
-

Aggregate per key into a single Rating based on Double value via _.score

-
.aggregate(minBy(_.score)) - .values - } - 
-
- -
-

Scio

-
def scio(input: SCollection[Rating]): SCollection[Rating] = { - input - .keyBy(_.user
-
- -
-

Compute top one item per key as an Iterable[Rating] with a reverse comparator

-
.topByKey(1, Ordering.by(-_.score)) 
-
- -
-

Drop user key

-
.values 
-
- -
-

Flatten result Iterable[Rating]

-
.flatten - } - 
-
- -
-

Scio with Algebird Aggregator

-
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { - import com.twitter.algebird.Aggregator.minBy - input - .keyBy(_.user
-
- -
-

Aggregate per key into a single Rating based on Double value via _.score. Explicit -type due to type inference limitation.

-
.aggregateByKey(minBy { x: Rating => x.score}) - .values - } - 
-
- -
-

Spark

-
def spark(input: RDD[Rating]): RDD[Rating] = { - input - .keyBy(_.user
-
- -
-

Reduce items per key by picking the side with lower score for each pair of input

-
.reduceByKey((x, y) => if (x.score < y.score) x else y) - .values - } - 
-
- -
-

Spark with Algebird Aggregator

-
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { - import com.twitter.algebird.Aggregator.minBy - import com.twitter.algebird.spark._ - input - .keyBy(_.user) - .algebird 
-
- -
-

Aggregate per key into a single Rating based on Double value via _.score. Explicit -type due to type inference limitation.

-
.aggregateByKey(minBy { x: Rating => x.score }) - .values - } - 
-
- -
-

Spark with MLLib

-
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { - import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ - input - .keyBy(_.user
-
- -
-

From spark-mllib, compute top K per key with a priority queue and a reverse comparator

-
.topByKey(1)(Ordering.by(-_.score)) - .flatMap(_._2) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/Sessions.scala.html b/Sessions.scala.html deleted file mode 100644 index 7f9735a..0000000 --- a/Sessions.scala.html +++ /dev/null @@ -1,2043 +0,0 @@ - - - - - - - -
-

Input is a collection of log events

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.LogEvent -import com.spotify.scio.extra.Iterators._ -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD -import org.joda.time.Instant - -import scala.collection.mutable - -object Sessions { - - val gapDuration = 3600000 - - case class Session(user: String, duration: Long, numItems: Int) - 
-
- -
-

Wrapper for Iterator[LogEvent] that groups items into sessions

-
class SessionIterator(self: Iterator[LogEvent]) extends Iterator[Seq[LogEvent]] { 
-
- -
-

BufferedIterator allows peeking ahead

-
private val bi = self.buffered - override def hasNext: Boolean = bi.hasNext - override def next(): Seq[LogEvent] = { - val buf = mutable.Buffer(bi.next()) - var last = buf.head.timestamp
-
- -
-

Consume subsequent events until a gap is detected

-
while (bi.hasNext && bi.head.timestamp - last < gapDuration) { - val n = bi.next() - buf.append(n) - last = n.timestamp - } - buf - } - } - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[LogEvent]): TypedPipe[Session] = { - input - .groupBy(_.user
-
- -
-

sortBy uses Hadoop secondary sort to sort keys during shuffle

-
.sortBy(_.timestamp
-
- -
-

Iterate over values lazily and group items into sessions

-
.mapValueStream(new SessionIterator(_)) - .toTypedPipe 
-
- -
-

Map over each (user, session items)

-
.map { case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) - } - } - 
-
- -
-

Scio

-
def scio(input: SCollection[LogEvent]): SCollection[Session] = { - input 
-
- -
-

Values in groupBy are sorted by timestamp

-
.timestampBy(e => new Instant(e.timestamp)) 
-
- -
-

No secondary sort in Scio, shuffle all items

-
.groupBy(_.user) - .flatMapValues { _ - .iterator 
-
- -
-

Generic version of SessionIterator from scio-extra

-
.timeSeries(_.timestamp) - .session(gapDuration) - } 
-
- -
-

Map over each (user, session items)

-
.map { case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) - } - } - 
-
- -
-

Spark

-
def spark(input: RDD[LogEvent]): RDD[Session] = { - input 
-
- -
-

No secondary sort in Spark, shuffle all items

-
.groupBy(_.user) - .flatMapValues { _ 
-
- -
-

Order of values after shuffle is not guaranteed

-
.toList.sortBy(_.timestamp) - .iterator 
-
- -
-

Generic version of SessionIterator from scio-extra

-
.timeSeries(_.timestamp) - .session(gapDuration) - } 
-
- -
-

Map over each (user, session items)

-
.map { case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) - } - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/Statistics.scala.html b/Statistics.scala.html deleted file mode 100644 index fde5ae6..0000000 --- a/Statistics.scala.html +++ /dev/null @@ -1,1996 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object Statistics { - - case class Stats(max: Double, min: Double, - sum: Double, count: Long, - mean: Double, stddev: Double) - 
-
- -
-

Algebird Aggregator

-
def aggregator = { - import com.twitter.algebird._ - 
-
- -
-

Create 4 Aggregators with different logic

-
 
-
- -
-

The first 3 are of type Aggregator[Rating, _, Double] which means it takes Rating as -input and generates Double as output. The last one is of type -Aggregator[Rating, _, Moments], where Moments include count, mean, standard deviation, -etc. The input Rating is prepared with a Rating => Double function _.score.

-
val maxOp = Aggregator.max[Double].composePrepare[Rating](_.score) - val minOp = Aggregator.min[Double].composePrepare[Rating](_.score) - val sumOp = Aggregator.prepareMonoid[Rating, Double](_.score) - val momentsOp = Moments.aggregator.composePrepare[Rating](_.score) - 
-
- -
-

Apply 4 Aggregators on the same input, present the resulting Tuple4 of -(Double, Double, Double, Moments) as Stats

-
MultiAggregator(maxOp, minOp, sumOp, momentsOp) - .andThenPresent { case (max, min, sum, moments) => - Stats(max, min, sum, moments.count, moments.mean, moments.stddev) - } - } - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[Stats] = { - input.aggregate(aggregator) - } - 
-
- -
-

Scio

-
def scio(input: SCollection[Rating]): SCollection[Stats] = { - input - .map(_.score) - .stats - .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) - } - 
-
- -
-

Scio with Algebird Aggregator

-
def scioAlgebird(input: SCollection[Rating]): SCollection[Stats] = { - input.aggregate(aggregator) - } - 
-
- -
-

Spark

-
def spark(input: RDD[Rating]): Stats = { - val s = input.map(_.score).stats() - Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev) - } - 
-
- -
-

Spark with Algebird Aggregator

-
def sparkAlgebird(input: RDD[Rating]): Stats = { - import com.twitter.algebird.spark._ - input.algebird.aggregate(aggregator) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/SumPerItem.scala.html b/SumPerItem.scala.html deleted file mode 100644 index cf01344..0000000 --- a/SumPerItem.scala.html +++ /dev/null @@ -1,1999 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object SumPerItem { - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - input - .groupBy(_.item) - .mapValues(_.score
-
- -
-

Sum per key with an implicit Semigroup[Double]

-
.sum - .toTypedPipe - } - 
-
- -
-

Scalding with Algebird Aggregator

-
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - import com.twitter.algebird.Aggregator.prepareMonoid - input - .groupBy(_.item
-
- -
-

Aggregate per key with an aggregator that converts UserItemData to Double via -_.score before reduce

-
.aggregate(prepareMonoid(_.score)) - .toTypedPipe - } - 
-
- -
-

Scio

-
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { - input - .map(x => (x.item, x.score)) - .sumByKey - } - 
-
- -
-

Spark

-
def spark(input: RDD[Rating]): RDD[(String, Double)] = { - input - .map(x => (x.item, x.score)) - .reduceByKey(_ + _) - } - 
-
- -
-

Spark with Algebird Semigroup

-
def sparkWithAlgebird1(input: RDD[Rating]): RDD[(String, Double)] = { - import com.twitter.algebird.spark._ - input - .map(x => (x.item, x.score)) - .algebird 
-
- -
-

Sum per key with an implicit Semigroup[Double]

-
.sumByKey - } - 
-
- -
-

Spark with Algebird Aggregator

-
def sparkWithAlgebird2(input: RDD[Rating]): RDD[(String, Double)] = { - import com.twitter.algebird.Aggregator.prepareMonoid - import com.twitter.algebird.spark._ - input - .keyBy(_.item) - .algebird 
-
- -
-

Aggregate per key with an aggregator that converts UserItemData to Double via -_.score before reduce. Explicit type due to type inference limitation.

-
.aggregateByKey(prepareMonoid { x: Rating => x.score }) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/TopItems.scala.html b/TopItems.scala.html deleted file mode 100644 index d788649..0000000 --- a/TopItems.scala.html +++ /dev/null @@ -1,2069 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object TopItems { - - val topK = 100
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - input - .map(x => (x.item, x.score)) - .group 
-
- -
-

Sum values with an implicit Semigroup[Double]

-
.sum 
-
- -
-

Group all elements with a single key Unit

- -
- -
-

Take top K with a priority queue

-
.sortedReverseTake(topK)(Ordering.by(_._2)) 
-
- -
-

Drop Unit key

-
.values 
-
- -
-

Flatten result Seq[(String, Double)]

-
.flatten - } - 
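sortedReverseTake keeps only the K largest elements in a bounded priority queue rather than sorting everything; its result matches this plain-Scala expression (data illustrative):

val scores = Seq(("a", 1.0), ("b", 3.0), ("c", 2.0))
val top2 = scores.sortBy(-_._2).take(2)  // Seq(("b", 3.0), ("c", 2.0))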
-
- -
-

Scalding with Algebird Aggregator

-
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - import com.twitter.algebird.Aggregator.sortedReverseTake - val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) - input - .map(x => (x.item, x.score)) - .group 
-
- -
-

Sum values with an implicit Semigroup[Double]

- -
- -
-

Aggregate globally into a single Seq[(String, Double)]

- -
- -
-

Flatten result Seq[(String, Double)]

-
.flatten - } - 
-
- -
-

Scio

-
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { - input - .map(x => (x.item, x.score)) 
-
- -
-

Sum values with an implicit Semigroup[Double]

- -
- -
-

Compute top K as an Iterable[(String, Double)]

-
.top(topK, Ordering.by(_._2)) 
-
- -
-

Flatten result Iterable[(String, Double)]

-
.flatten - } - 
-
- -
-

Scio with Algebird Aggregator

-
def scioWithAlgebird(input: SCollection[Rating]): SCollection[(String, Double)] = { - import com.twitter.algebird.Aggregator.sortedReverseTake - val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) - input - .map(x => (x.item, x.score)) 
-
- -
-

Sum values with an implicit Semigroup[Double]

- -
- -
-

Aggregate globally into a single Seq[(String, Double)]

- -
- -
-

Flatten result Seq[(String, Double)]

-
.flatten - } - 
-
- -
-

Spark

-
def spark(input: RDD[Rating]): Seq[(String, Double)] = { - input - .map(x => (x.item, x.score)) 
-
- -
-

Sum values with addition

-
.reduceByKey(_ + _) 
-
- -
-

top is an action and collects data back to the driver node

-
.top(topK)(Ordering.by(_._2)) - } - 
-
- -
-

Spark with Algebird Aggregator

-
def sparkWithAlgebird(input: RDD[Rating]): Seq[(String, Double)] = { - import com.twitter.algebird.Aggregator.sortedReverseTake - import com.twitter.algebird.spark._ - val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) - input - .map(x => (x.item, x.score)) 
-
- -
-

Sum values with addition

-
.reduceByKey(_ + _) - .algebird 
-
- -
-

aggregate is an action and collects data back to the driver node

-
.aggregate(aggregator) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/TopItemsPerUser.scala.html b/TopItemsPerUser.scala.html deleted file mode 100644 index edc301c..0000000 --- a/TopItemsPerUser.scala.html +++ /dev/null @@ -1,2028 +0,0 @@ - - - - - - - -
-

Input is a collection of (user, item, score)

-
package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object TopItemsPerUser { - - val topK = 100
-
- -
-

Scalding

-
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { - input - .groupBy(_.user
-
- -
-

Take top K per group with a priority queue

-
.sortedReverseTake(topK)(Ordering.by(_.score)) 
-
- -
-

Drop user key

-
.values 
-
- -
-

Flatten result Seq[Rating]

-
.flatten - } - 
-
- -
-

Scio

-
def scio(input: SCollection[Rating]): SCollection[Rating] = { - input - .keyBy(_.user
-
- -
-

Compute top K per key

-
.topByKey(topK, Ordering.by(_.score)) 
-
- -
-

Drop user key

-
.values 
-
- -
-

Flatten result Iterable[Rating]

-
.flatten - } - 
-
- -
-

Spark Naive Approach

-
def spark(input: RDD[Rating]): RDD[Rating] = { - input 
-
- -
-

groupBy shuffles all data, inefficient

-
.groupBy(_.user
-
- -
-

Drop user key

-
.values 
-
- -
-

Convert grouped values to a List[Rating] and sort on a single node, inefficient

-
.flatMap(_.toList.sortBy(-_.score).take(topK)) - } - 
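A bounded alternative to the naive groupBy, sketched with Spark's aggregateByKey so that at most topK ratings per user survive each stage (a sketch under the same Rating/topK definitions, not part of the original file):

val zero = List.empty[Rating]
def insert(acc: List[Rating], r: Rating) = (r :: acc).sortBy(-_.score).take(topK)
def merge(a: List[Rating], b: List[Rating]) = (a ++ b).sortBy(-_.score).take(topK)
input.keyBy(_.user).aggregateByKey(zero)(insert, merge).flatMap(_._2)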
-
- -
-

Spark with Algebird Aggregator

-
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { - import com.twitter.algebird.Aggregator.sortedReverseTake - import com.twitter.algebird.spark._ - val aggregator = sortedReverseTake[Rating](topK)(Ordering.by(_.score)) - input - .keyBy(_.user) - .algebird 
-
- -
-

Aggregate per key into a Seq[Rating]

- -
- -
-

Flatten result Seq[Rating]

-
.flatMap(_._2) - } - 
-
- -
-

Spark with MLLib

-
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { - import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ - input - .keyBy(_.user
-
- -
-

From spark-mllib, compute top K per key with a priority queue

-
.topByKey(topK)(Ordering.by(_.score)) 
-
- -
-

Flatten result Seq[Rating]

-
.flatMap(_._2) - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/WordCount.scala.html b/WordCount.scala.html deleted file mode 100644 index f9c51c4..0000000 --- a/WordCount.scala.html +++ /dev/null @@ -1,1976 +0,0 @@ - - - - - - - -
-

-
package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object WordCount { - 
-
- -
-

Scalding

-
def scalding(input: TypedPipe[String]): TypedPipe[(String, Long)] = { - input - .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
-
- -
-

groupBy is lazy

-
.groupBy(identity
-
- -
-

Operations like size after groupBy can be lifted into the map phase

-
.size - .toTypedPipe - } - 
-
- -
-

Scio

-
def scio(input: SCollection[String]): SCollection[(String, Long)] = { - input - .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) - .countByValue - } - 
-
- -
-

Spark Transformation

-
def sparkTransformation(input: RDD[String]): RDD[(String, Long)] = { - input - .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
-
- -
-

There is no countByValue transformation in Spark although it is equivalent to mapping -into an initial count of 1 and reducing with addition

-
.map((_, 1L)) 
-
- -
-

reduceByKey can lift function into the map phase

-
.reduceByKey(_ + _) - } - 
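The same equivalence on plain Scala collections (illustrative input):

val words = Seq("a", "b", "a")
val counts = words.map((_, 1L)).groupBy(_._1).map { case (w, ones) => (w, ones.map(_._2).sum) }
// Map(a -> 2, b -> 1)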
-
- -
-

Spark Action

-
def sparkAction(input: RDD[String]): Seq[(String, Long)] = { - input - .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
-
- -
-

countByValue is an action and collects data back to the driver node

-
.countByValue() - .toSeq - } - -} 
-
- - - - - - \ No newline at end of file diff --git a/index.html b/index.html deleted file mode 100644 index b378a5d..0000000 --- a/index.html +++ /dev/null @@ -1,29 +0,0 @@ - - - Codestin Search App - - - -### /pipeline/ - -- [AverageScorePerItem.scala](AverageScorePerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala)) - Compute Average Score per Item -- [Count.scala](Count.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Count.scala)) - Count Number of Items -- [CountDistinctItems.scala](CountDistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala)) - Count Number of Distinct Items -- [CountUsers.scala](CountUsers.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala)) - Count the Number of Items of a Given User -- [DistinctItems.scala](DistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala)) - Compute Collection of Distinct Items -- [FieldStatistics.scala](FieldStatistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala)) - Compute Basic Descriptive Statistics for Each Field -- [InvertedIndex.scala](InvertedIndex.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala)) - Build Inverted Index -- [JoinLogAndMetadata.scala](JoinLogAndMetadata.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala)) - Join Log and Metadata Datasets -- [JoinLogs.scala](JoinLogs.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala)) - Join Two Log Datasets and Compute Action Sequence -- [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User -- [MinItemPerUser.scala](MinItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User -- [Sessions.scala](Sessions.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala)) - Compute Session Duration and Number of Items from Log Data -- [Statistics.scala](Statistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala)) - Compute Basic Descriptive Statistics -- [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item -- [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally -- 
[TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items Globally -- [WordCount.scala](WordCount.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala)) - Classic Word Count - - - - From d7aada3782e6866fdc41fd59020f3afe188ece7d Mon Sep 17 00:00:00 2001 From: Filipe Regadas Date: Fri, 13 Dec 2019 15:25:16 -0500 Subject: [PATCH 08/11] updated site --- AverageScorePerItem.scala.html | 2030 ++++++++++++++++++++++++++++++ Count.scala.html | 1983 +++++++++++++++++++++++++++++ CountDistinctItems.scala.html | 1997 ++++++++++++++++++++++++++++++ CountUsers.scala.html | 1992 ++++++++++++++++++++++++++++++ DistinctItems.scala.html | 1942 +++++++++++++++++++++++++++++ FieldStatistics.scala.html | 2020 ++++++++++++++++++++++++++++++ InvertedIndex.scala.html | 1977 +++++++++++++++++++++++++++++ JoinLogAndMetadata.scala.html | 2126 ++++++++++++++++++++++++++++++++ JoinLogs.scala.html | 2032 ++++++++++++++++++++++++++++++ MaxItemPerUser.scala.html | 2038 ++++++++++++++++++++++++++++++ MinItemPerUser.scala.html | 2038 ++++++++++++++++++++++++++++++ Sessions.scala.html | 2048 ++++++++++++++++++++++++++++++ Statistics.scala.html | 1993 ++++++++++++++++++++++++++++++ SumPerItem.scala.html | 2001 ++++++++++++++++++++++++++++++ TopItems.scala.html | 2069 +++++++++++++++++++++++++++++++ TopItemsPerUser.scala.html | 2028 ++++++++++++++++++++++++++++++ WordCount.scala.html | 1976 +++++++++++++++++++++++++++++ index.html | 26 + 18 files changed, 34316 insertions(+) create mode 100644 AverageScorePerItem.scala.html create mode 100644 Count.scala.html create mode 100644 CountDistinctItems.scala.html create mode 100644 CountUsers.scala.html create mode 100644 DistinctItems.scala.html create mode 100644 FieldStatistics.scala.html create mode 100644 InvertedIndex.scala.html create mode 100644 JoinLogAndMetadata.scala.html create mode 100644 JoinLogs.scala.html create mode 100644 MaxItemPerUser.scala.html create mode 100644 MinItemPerUser.scala.html create mode 100644 Sessions.scala.html create mode 100644 Statistics.scala.html create mode 100644 SumPerItem.scala.html create mode 100644 TopItems.scala.html create mode 100644 TopItemsPerUser.scala.html create mode 100644 WordCount.scala.html create mode 100644 index.html diff --git a/AverageScorePerItem.scala.html b/AverageScorePerItem.scala.html new file mode 100644 index 0000000..546c951 --- /dev/null +++ b/AverageScorePerItem.scala.html @@ -0,0 +1,2030 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.algebird.Semigroup +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object AverageScorePerItem { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .groupBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Sum both per key with an implicit Semigroup[(Double, Long)]

+
.sum 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + .toTypedPipe + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue + input + .groupBy(_.user
+
+ +
+

Map values into Double

+
.mapValues(_.score
+
+ +
+

Aggregate average per key

.aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+ +
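As a sanity check, AveragedValue can be probed on plain values (a minimal local sketch assuming Algebird's standard instances, not part of the original pipeline):

import com.twitter.algebird.{AveragedGroup, AveragedValue}
// AveragedValue carries (count, mean) and merges the two numerically stably
val a = AveragedValue(2L, 4.0) // two values with mean 4.0
val b = AveragedValue(1L, 1.0) // one value with mean 1.0
AveragedGroup.plus(a, b)       // AveragedValue(3, 3.0), the mean of all three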
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .keyBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Sum both per key with an implicit Semigroup[(Double, Long)]

+
.sumByKey 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + } + 
+
+ +
+

Spark

+

Summon an Algebird Semigroup[(Double, Long)] with implicit argument

+
def spark(input: RDD[Rating])(implicit sg: Semigroup[(Double, Long)]): RDD[(String, Double)] = { + input + .keyBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Reduce both per key with plus = (T, T) => T where T is (Double, Long)

+
.reduceByKey(sg.plus) // plus: (T, T) => T where T is (Double, Long) 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + } + 
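The reduceByKey above relies on Algebird deriving Semigroup[(Double, Long)] element-wise from the component semigroups. A quick local sketch of that behavior (assuming the standard implicit instances):

import com.twitter.algebird.Semigroup
val sg = implicitly[Semigroup[(Double, Long)]]
sg.plus((1.5, 1L), (2.5, 1L)) // (4.0, 2): scores summed, counts summed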
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.AveragedValue + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .mapValues(_.score
+
+ +
+

Map values into Double

.algebird 
+ +
+

Aggregate average per key

.aggregateByKey(AveragedValue.aggregator) + } + +} 
+ + + + + + \ No newline at end of file diff --git a/Count.scala.html b/Count.scala.html new file mode 100644 index 0000000..bc3c436 --- /dev/null +++ b/Count.scala.html @@ -0,0 +1,1983 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object Count { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { + import com.twitter.algebird.Aggregator.size + input + .aggregate(size) + .toTypedPipe + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Long] = + input.count
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { + import com.twitter.algebird.Aggregator.size + input + .aggregate(size) + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): Long = { + input 
+
+ +
+

count is an action and collects data back to the driver node

+
.count + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Long = { + import com.twitter.algebird.Aggregator.size + import com.twitter.algebird.spark._ + input.algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(size) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/CountDistinctItems.scala.html b/CountDistinctItems.scala.html new file mode 100644 index 0000000..b4852a0 --- /dev/null +++ b/CountDistinctItems.scala.html @@ -0,0 +1,1997 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.google.common.base.Charsets +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object CountDistinctItems { + 
+
+ +
+

Scalding Exact Approach

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .map(_.item
+
+ +
+

Remove duplicates, requires a shuffle

+
.distinct + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+

Scalding Approximate Approach

+
def scaldingApproxWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Double] = { + import com.twitter.algebird.HyperLogLogAggregator + val aggregator = HyperLogLogAggregator.sizeAggregator(bits = 12) + input 
+
+ +
+

HyperLogLog expects bytes input

+
.map(_.item.getBytes(Charsets.UTF_8)) 
+
+ +
+

Aggregate globally into a Double

.aggregate(aggregator) + .toTypedPipe + } + 
+ +
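The bits parameter trades memory for accuracy. A back-of-the-envelope sketch using the standard HyperLogLog error formula (an estimate, not a guarantee):

val bits = 12
val registers = 1 << bits                  // 4096 registers
val stdError = 1.04 / math.sqrt(registers) // ≈ 0.016, i.e. roughly 1.6% error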
+

Scio Exact Approach

+
def scio(input: SCollection[Rating]): SCollection[Long] = { + input + .map(_.item) + .distinct + .count + } + 
+
+ +
+

Scio Approximate Approach

+
def scioApprox(input: SCollection[Rating]): SCollection[Long] = { + input + .map(_.item) + .countApproxDistinct() + } + 
+
+ +
+

Spark Exact Approach

+
def spark(input: RDD[Rating]): Long = { + input + .map(_.item) + .distinct() + .count() + } + 
+
+ +
+

Spark Approximate Approach

+
def sparkApprox(input: RDD[Rating]): Long = { + input + .map(_.item) + .countApproxDistinct() + } + +} 
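Spark's countApproxDistinct defaults to a relative standard deviation of 0.05; if memory allows, a hypothetical variant can tighten it:

input.map(_.item).countApproxDistinct(relativeSD = 0.01) // more precise, more memory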
+
+ + + + + + \ No newline at end of file diff --git a/CountUsers.scala.html b/CountUsers.scala.html new file mode 100644 index 0000000..5c088e2 --- /dev/null +++ b/CountUsers.scala.html @@ -0,0 +1,1992 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object CountUsers { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .filter(_.user == "Smith") + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { + import com.twitter.algebird.Aggregator.count + input 
+
+ +
+

Aggregate globally into a single Long

+
.aggregate(count(_.user == "Smith")) + .toTypedPipe + } + + def scio(input: SCollection[Rating]): SCollection[Long] = { + input + .filter(_.user == "Smith") + .count + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { + import com.twitter.algebird.Aggregator.count + input 
+
+ +
+

Aggregate globally into a single Long

+
.aggregate(count(_.user == "Smith")) + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): Long = { + input + .filter(_.user == "Smith"
+
+ +
+

count is an action and collects data back to the driver node

+
.count() + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Long = { + import com.twitter.algebird.Aggregator.count + import com.twitter.algebird.spark._ + input.algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(count(_.user == "Smith")) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/DistinctItems.scala.html b/DistinctItems.scala.html new file mode 100644 index 0000000..922a9a8 --- /dev/null +++ b/DistinctItems.scala.html @@ -0,0 +1,1942 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object DistinctItems { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[String] = { + input + .map(_.item) + .distinct + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[String] = { + input + .map(_.item) + .distinct + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): RDD[String] = { + input + .map(_.item) + .distinct() + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/FieldStatistics.scala.html b/FieldStatistics.scala.html new file mode 100644 index 0000000..ba0d075 --- /dev/null +++ b/FieldStatistics.scala.html @@ -0,0 +1,2020 @@ + + + + + + + +
+

Input is a collection of case classes

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object FieldStatistics { + + case class User(age: Int, income: Double, score: Double) + case class Stats(max: Double, min: Double, mean: Double, stddev: Double) + case class UserStats(age: Stats, income: Stats, score: Stats) + 
+
+ +
+

Algebird Aggregator

+
def aggregator = { + import com.twitter.algebird._ + 
+
+ +
+

Create 3 Aggregators on age field with different logic

+
 
+
+ +
+

The first 2 are of type Aggregator[User, _, Int] which means it takes User as input and +generates Int as output. The last one is of type Aggregator[User, _, Moments], +where Moments include count, mean, standard deviation, etc. The input User is prepared +with a User => Int function _.age.

+
val maxAgeOp = Aggregator.max[Int].composePrepare[User](_.age) + val minAgeOp = Aggregator.min[Int].composePrepare[User](_.age) + val momentsAgeOp = Moments.aggregator.composePrepare[User](_.age) + 
+
+ +
+

Create 3 Aggregators on income field with different logic

+
val maxIncomeOp = Aggregator.max[Double].composePrepare[User](_.income) + val minIncomeOp = Aggregator.min[Double].composePrepare[User](_.income) + val momentsIncomeOp = Moments.aggregator.composePrepare[User](_.income) + 
+
+ +
+

Create 3 Aggregators on score field with different logic

+
val maxScoreOp = Aggregator.max[Double].composePrepare[User](_.score) + val minScoreOp = Aggregator.min[Double].composePrepare[User](_.score) + val momentsScoreOp = Moments.aggregator.composePrepare[User](_.score) + 
+
+ +
+

Apply the 9 Aggregators on the same input and present the resulting Tuple9 as UserStats.

+
MultiAggregator( + maxAgeOp, + minAgeOp, + momentsAgeOp, + maxIncomeOp, + minIncomeOp, + momentsIncomeOp, + maxScoreOp, + minScoreOp, + momentsScoreOp + ).andThenPresent { t => + val (maxAge, minAge, mAge, maxIncome, minIncome, mIncome, maxScore, minScore, mScore) = t + UserStats( + age = Stats(maxAge, minAge, mAge.mean, mAge.stddev), + income = Stats(maxIncome, minIncome, mIncome.mean, mIncome.stddev), + score = Stats(maxScore, minScore, mScore.mean, mScore.stddev) + ) + } + } + 
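Because Algebird Aggregators also run over plain Scala collections, the composite can be sanity-checked locally (hypothetical values, not from the original):

val stats = aggregator(Seq(User(25, 50000.0, 0.5), User(35, 60000.0, 0.7)))
// stats.age ≈ Stats(35.0, 25.0, 30.0, 5.0): max, min, mean and population stddev of the ages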
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[User]): TypedPipe[UserStats] = + input.aggregate(aggregator) + 
+
+ +
+

Scio

+
def scio(input: SCollection[User]): SCollection[UserStats] = + input.aggregate(aggregator) + 
+
+ +
+

Spark

+
def spark(input: RDD[User]): UserStats = { 
+
+ +
+

Compute each field separately, potentially inefficient if the input is not cached

+
val s1 = input.map(_.age).stats() + val s2 = input.map(_.income).stats() + val s3 = input.map(_.score).stats() + UserStats( + age = Stats(s1.max, s1.min, s1.mean, s1.stdev), + income = Stats(s2.max, s2.min, s2.mean, s2.stdev), + score = Stats(s3.max, s3.min, s3.mean, s3.stdev) + ) + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkAlgebird(input: RDD[User]): UserStats = { + import com.twitter.algebird.spark._ + input.algebird.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/InvertedIndex.scala.html b/InvertedIndex.scala.html new file mode 100644 index 0000000..1cc9cdd --- /dev/null +++ b/InvertedIndex.scala.html @@ -0,0 +1,1977 @@ + + + + + + + +
+

Build inverted index from a corpus of text documents

+
 
+
+ +
+

Input is a collection of (id, text)

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object InvertedIndex { + + case class Document(id: Int, text: String) + case class Posting(word: String, ids: Seq[Int]) + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Document]): TypedPipe[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group and convert document IDs per key to List[Int]

+
.group + .toList + .map(Posting.tupled) + } + 
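A quick local check of the tokenizer shared by all three implementations (a sketch; the regex keeps apostrophes so contractions survive):

"Don't stop believing".split("[^a-zA-Z']+").toSeq // Seq("Don't", "stop", "believing")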
+
+ +
+

Scio

+
def scio(input: SCollection[Document]): SCollection[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group document IDs per key into Iterable[Int]

+
.groupByKey + .map(kv => Posting(kv._1, kv._2.toSeq)) + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Document]): RDD[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group document IDs per key into Iterable[Int]

+
.groupByKey() + .map(kv => Posting(kv._1, kv._2.toSeq)) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/JoinLogAndMetadata.scala.html b/JoinLogAndMetadata.scala.html new file mode 100644 index 0000000..b28af3e --- /dev/null +++ b/JoinLogAndMetadata.scala.html @@ -0,0 +1,2126 @@ + + + + + + + +
+

Compute average age of users who listened to a track by joining log event and user metadata.

+
    +
• LHS input is a large collection of (user, track, timestamp).
  • +
  • RHS input is a small collection of (user, age).
  • +

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.{LogEvent, UserMeta} +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object JoinLogAndMetadata { + 
+
+ +
+

Scalding Naive Approach

+
def scaldingNaive( + left: TypedPipe[LogEvent], + right: TypedPipe[UserMeta] + ): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue + left + .groupBy(_.user
+
+ +
+

Join as (user, (LogEvent, UserMeta))

+
.join(right.groupBy(_.user)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Map into (track, age)

+
.map { + case (logEvent, userMeta) => + (logEvent.track, userMeta.age.toDouble) + } + .group 
+
+ +
+

Aggregate average age per track

.aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+ +
+

Scalding with Hash Join

+

hashJoin replicates the smaller RHS to all mappers on the LHS

+
def scaldingHashJoin( + left: TypedPipe[LogEvent], + right: TypedPipe[UserMeta] + ): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue
+
+ +
+

Map out fields to avoid shuffling large objects

+
val lhs = left.map(e => (e.user, e.track)) 
+
+ +
+

Force to disk to avoid repeating the same computation on each mapper on the LHS

+
val rhs = right.map(u => (u.user, u.age.toDouble)).forceToDisk + + lhs + .hashJoin(rhs) + .values + .group + .aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+
+ +
+

Scio Naive Approach

+
def scioNaive( + left: SCollection[LogEvent], + right: SCollection[UserMeta] + ): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 
+
+ +
+

Join as (user, (track, age))

+
lhs + .join(rhs
+
+ +
+

Drop user key to make track the new key in (track, age)

+
.values 
+
+ +
+

Aggregate average age per track

.aggregateByKey(AveragedValue.aggregator) + } + 
+ +
+

Scio with Side Input

+

Side input makes RHS available on all workers

+
def scioSideInput( + left: SCollection[LogEvent], + right: SCollection[UserMeta] + ): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue
+
+ +
+

Convert RHS to a side input of Map[String, Double]

+
val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput
+
+ +
+

Replicate RHS to each worker

left + .withSideInputs(rhs) 
+ +
+

Access side input via the context

+
.map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } 
+
+ +
+

Convert back to regular SCollection

.toSCollection + .aggregateByKey(AveragedValue.aggregator) + } + 
+ +
+

Scio with Hash Join

+

hashJoin is a shortcut for the side input approach

+
def scioHashJoin( + left: SCollection[LogEvent], + right: SCollection[UserMeta] + ): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) + lhs + .hashJoin(rhs) + .values + .aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+

Spark Naive Approach

+
def sparkNaive(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 
+
+ +
+

Join as (user, (track, age))

+
lhs + .join(rhs
+
+ +
+

Drop user key to make track the new key in (track, age)

.values + .algebird 
+ +
+

Aggregate average age per track

.aggregateByKey(AveragedValue.aggregator) + } + 
+ +
+

Spark with Broadcast Variable

+
def sparkBroadcast(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + import com.twitter.algebird.AveragedValue
+
+ +
+

Retrieve SparkContext for creating broadcast variable

+
val sc = left.context
+
+ +
+

Collect RHS to driver memory and broadcast back to workers

+
val map = right.map(u => (u.user, u.age.toDouble)).collectAsMap() + val b = sc.broadcast(map) + + left 
+
+ +
+

In-memory lookup on each worker

+
.map(e => (e.track, b.value.getOrElse(e.user, 0.0))) + .algebird + .aggregateByKey(AveragedValue.aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/JoinLogs.scala.html b/JoinLogs.scala.html new file mode 100644 index 0000000..43c5b22 --- /dev/null +++ b/JoinLogs.scala.html @@ -0,0 +1,2032 @@ + + + + + + + +
+

Given two log datasets of play-track and save-track events, compute tracks that a user saved +after playing them within a session.

+
 
+
+ +
+

Inputs are collections of (user, item, timestamp).

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.LogEvent +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object JoinLogs { + + val gapDuration = 3600000 // 1 hour in milliseconds
+
+ +
+

Detect if a pair of (event type, LogEvent) tuples matches a play-then-save sequence

+
def detectPlaySaveSequence(pair: Seq[(String, LogEvent)]): Option[String] = { + val Seq(first, second) = pair + if (first._1 == "play" && second._1 == "save" && first._2.track == second._2.track && + second._2.timestamp - first._2.timestamp <= gapDuration) { + Some(first._2.track) + } else { + None + } + } + 
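A local sketch of the detector (hypothetical events; assuming LogEvent(user, track, timestamp) as the field accessors suggest):

val play = ("play", LogEvent("u1", "track1", 1000L))
val save = ("save", LogEvent("u1", "track1", 2000L))
detectPlaySaveSequence(Seq(play, save)) // Some("track1"): saved within the gap
detectPlaySaveSequence(Seq(save, play)) // None: wrong order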
+
+ +
+

Scalding

+
def scalding( + playEvents: TypedPipe[LogEvent], + saveEvents: TypedPipe[LogEvent] + ): TypedPipe[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))).group + val saves = saveEvents.map(e => (e.user, ("save", e))).group + + plays + .cogroup(saves) { (user, p, s) => 
+
+ +
+

Iterables of play and save events for the user

+
(p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + .toTypedPipe + } + 
+
+ +
+

Scio

+
def scio( + playEvents: SCollection[LogEvent], + saveEvents: SCollection[LogEvent] + ): SCollection[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) + + plays + .cogroup(saves
+
+ +
+

Iterables of play and save events for the user

+
.flatMapValues { + case (p, s) => + (p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + } + 
+
+ +
+

Spark

+
def spark(playEvents: RDD[LogEvent], saveEvents: RDD[LogEvent]): RDD[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) + + plays + .cogroup(saves) + .flatMapValues { + case (p, s) => 
+
+ +
+

Iterables of play and save events for the user

+
(p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/MaxItemPerUser.scala.html b/MaxItemPerUser.scala.html new file mode 100644 index 0000000..9cc019c --- /dev/null +++ b/MaxItemPerUser.scala.html @@ -0,0 +1,2038 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object MaxItemPerUser { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Reduce items per key by keeping the one with the higher score from each pair of inputs

+
.reduce((x, y) => if (x.score > y.score) x else y) + .values + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + input + .groupBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score

+
.aggregate(maxBy(_.score)) + .values + } + 
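maxBy can likewise be exercised on a local collection (a hedged sketch; Rating(user, item, score) is assumed from the field accessors):

import com.twitter.algebird.Aggregator.maxBy
val best = maxBy { x: Rating => x.score }
best(Seq(Rating("u1", "i1", 1.0), Rating("u1", "i2", 2.5))) // the i2 rating wins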
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute the top item per key as an Iterable[Rating]

+
.topByKey(1, Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + input + .keyBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(maxBy { x: Rating => + x.score + }) + .values + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input + .keyBy(_.user
+
+ +
+

Reduce items per key by keeping the one with the higher score from each pair of inputs

+
.reduceByKey((x, y) => if (x.score > y.score) x else y) + .values + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(maxBy { x: Rating => + x.score + }) + .values + } + 
+
+ +
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue

+
.topByKey(1)(Ordering.by(_.score)) + .flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/MinItemPerUser.scala.html b/MinItemPerUser.scala.html new file mode 100644 index 0000000..a729555 --- /dev/null +++ b/MinItemPerUser.scala.html @@ -0,0 +1,2038 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object MinItemPerUser { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Reduce items per key by keeping the one with the lower score from each pair of inputs

+
.reduce((x, y) => if (x.score < y.score) x else y) + .values + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { + import com.twitter.algebird.Aggregator.minBy + input + .groupBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score

+
.aggregate(minBy(_.score)) + .values + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute the top item per key as an Iterable[Rating] with a reverse comparator

+
.topByKey(1, Ordering.by(-_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { + import com.twitter.algebird.Aggregator.minBy + input + .keyBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(minBy { x: Rating => + x.score + }) + .values + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input + .keyBy(_.user
+
+ +
+

Reduce items per key by keeping the one with the lower score from each pair of inputs

+
.reduceByKey((x, y) => if (x.score < y.score) x else y) + .values + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.minBy + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(minBy { x: Rating => + x.score + }) + .values + } + 
+
+ +
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue and a reverse comparator

+
.topByKey(1)(Ordering.by(-_.score)) + .flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/Sessions.scala.html b/Sessions.scala.html new file mode 100644 index 0000000..2f8792d --- /dev/null +++ b/Sessions.scala.html @@ -0,0 +1,2048 @@ + + + + + + + +
+

Input is a collection of log events

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.LogEvent +import com.spotify.scio.extra.Iterators._ +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD +import org.joda.time.Instant + +import scala.collection.mutable + +object Sessions { + + val gapDuration = 3600000 // 1 hour in milliseconds + + case class Session(user: String, duration: Long, numItems: Int) + 
+
+ +
+

Wrapper for Iterator[LogEvent] that groups items into sessions

+
class SessionIterator(self: Iterator[LogEvent]) extends Iterator[Seq[LogEvent]] { 
+
+ +
+

BufferedIterator allows peeking ahead

+
private val bi = self.buffered + override def hasNext: Boolean = bi.hasNext + override def next(): Seq[LogEvent] = { + val buf = mutable.Buffer(bi.next()) + var last = buf.head.timestamp
+
+ +
+

Consume subsequent events until a gap is detected

+
while (bi.hasNext && bi.head.timestamp - last < gapDuration) { + val n = bi.next() + buf.append(n) + last = n.timestamp + } + buf + } + } + 
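A small local sketch of the iterator on pre-sorted events (hypothetical data; assuming LogEvent(user, track, timestamp)):

val events = Seq(
  LogEvent("u1", "a", 0L),
  LogEvent("u1", "b", 1000L),    // same session, one second later
  LogEvent("u1", "c", 10000000L) // new session, gap exceeds one hour
)
new SessionIterator(events.iterator).toSeq.map(_.map(_.track))
// Seq(Seq("a", "b"), Seq("c"))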
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[LogEvent]): TypedPipe[Session] = { + input + .groupBy(_.user
+
+ +
+

sortBy uses Hadoop secondary sort to sort values during the shuffle

+
.sortBy(_.timestamp
+
+ +
+

Iterate over values lazily and group items into sessions

+
.mapValueStream(new SessionIterator(_)) + .toTypedPipe 
+
+ +
+

Map over each (user, session items)

+
.map { + case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[LogEvent]): SCollection[Session] = { + input 
+
+ +
+

Values in groupBy are sorted by timestamp

+
.timestampBy(e => new Instant(e.timestamp)) 
+
+ +
+

No secondary sort in Scio, shuffle all items

+
.groupBy(_.user) + .flatMapValues { + _.iterator 
+
+ +
+

Generic version of SessionIterator from scio-extra

+
.timeSeries(_.timestamp) + .session(gapDuration) + } 
+
+ +
+

Map over each (user, session items)

+
.map { + case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + 
+
+ +
+

Spark

+
def spark(input: RDD[LogEvent]): RDD[Session] = { + input 
+
+ +
+

No secondary sort in Spark, shuffle all items

+
.groupBy(_.user) + .flatMapValues { + _ 
+
+ +
+

Order of values after shuffle is not guaranteed

+
.toList + .sortBy(_.timestamp) + .iterator 
+
+ +
+

Generic version of SessionIterator from scio-extra

+
.timeSeries(_.timestamp) + .session(gapDuration) + } 
+
+ +
+

Map over each (user, session items)

+
.map { + case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/Statistics.scala.html b/Statistics.scala.html new file mode 100644 index 0000000..bccb0ba --- /dev/null +++ b/Statistics.scala.html @@ -0,0 +1,1993 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object Statistics { + + case class Stats(max: Double, min: Double, sum: Double, count: Long, mean: Double, stddev: Double) + 
+
+ +
+

Algebird Aggregator

+
def aggregator = { + import com.twitter.algebird._ + 
+
+ +
+

Create 4 Aggregators with different logic

+
 
+
+ +
+

The first 3 are of type Aggregator[Rating, _, Double], which means they take Rating as +input and generate Double as output. The last one is of type +Aggregator[Rating, _, Moments], where Moments includes count, mean, standard deviation, +etc. The input Rating is prepared with a Rating => Double function _.score.

+
val maxOp = Aggregator.max[Double].composePrepare[Rating](_.score) + val minOp = Aggregator.min[Double].composePrepare[Rating](_.score) + val sumOp = Aggregator.prepareMonoid[Rating, Double](_.score) + val momentsOp = Moments.aggregator.composePrepare[Rating](_.score) + 
+
+ +
+

Apply the 4 Aggregators on the same input and present the resulting Tuple4 of +(Double, Double, Double, Moments) as Stats

+
MultiAggregator(maxOp, minOp, sumOp, momentsOp) + .andThenPresent { + case (max, min, sum, moments) => + Stats(max, min, sum, moments.count, moments.mean, moments.stddev) + } + } + 
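The Moments piece can be probed locally (a sketch; Algebird reports the population standard deviation):

import com.twitter.algebird.Moments
val m = Moments.aggregator(Seq(1.0, 2.0, 3.0))
// m.count == 3, m.mean == 2.0, m.stddev ≈ 0.816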
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Stats] = + input.aggregate(aggregator) + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Stats] = { + input + .map(_.score) + .stats + .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioAlgebird(input: SCollection[Rating]): SCollection[Stats] = + input.aggregate(aggregator) + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): Stats = { + val s = input.map(_.score).stats() + Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev) + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkAlgebird(input: RDD[Rating]): Stats = { + import com.twitter.algebird.spark._ + input.algebird.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/SumPerItem.scala.html b/SumPerItem.scala.html new file mode 100644 index 0000000..aad9e67 --- /dev/null +++ b/SumPerItem.scala.html @@ -0,0 +1,2001 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object SumPerItem { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .groupBy(_.item) + .mapValues(_.score
+
+ +
+

Sum per key with an implicit Semigroup[Double]

+
.sum + .toTypedPipe + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.Aggregator.prepareMonoid + input + .groupBy(_.item
+
+ +
+

Aggregate per key with an aggregator that converts Rating to Double via +_.score before reduce

+
.aggregate(prepareMonoid(_.score)) + .toTypedPipe + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .sumByKey + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): RDD[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .reduceByKey(_ + _) + } + 
+
+ +
+

Spark with Algebird Semigroup

+
def sparkWithAlgebird1(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + input + .map(x => (x.item, x.score)) + .algebird 
+
+ +
+

Sum per key with an implicit Semigroup[Double]

+
.sumByKey + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird2(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.Aggregator.prepareMonoid + import com.twitter.algebird.spark._ + input + .keyBy(_.item) + .algebird 
+
+ +
+

Aggregate per key with an aggregator that converts Rating to Double via +_.score before reduce. Explicit type due to type inference limitation.

+
.aggregateByKey(prepareMonoid { x: Rating => + x.score + }) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/TopItems.scala.html b/TopItems.scala.html new file mode 100644 index 0000000..d788649 --- /dev/null +++ b/TopItems.scala.html @@ -0,0 +1,2069 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object TopItems { + + val topK = 100
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .group 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sum 
+
+ +
+

Group all elements with a single key Unit

.groupAll 
+ +
+

Take top K with a priority queue

+
.sortedReverseTake(topK)(Ordering.by(_._2)) 
+
+ +
+

Drop Unit key

+
.values 
+
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
+
+ +
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) + .group 
+
+ +
+

Sum values with an implicit Semigroup[Double]

.sum + .toTypedPipe 
+ +
+

Aggregate globally into a single Seq[(String, Double)]

.aggregate(aggregator) + .toTypedPipe 
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
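The aggregator itself is easy to probe on a local collection (sketch):

import com.twitter.algebird.Aggregator.sortedReverseTake
val top2 = sortedReverseTake[(String, Double)](2)(Ordering.by(_._2))
top2(Seq(("a", 1.0), ("b", 3.0), ("c", 2.0))) // Seq(("b", 3.0), ("c", 2.0))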
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with an implicit Semigroup[Double]

.sumByKey 
+ +
+

Compute top K as an Iterable[(String, Double)]

+
.top(topK, Ordering.by(_._2)) 
+
+ +
+

Flatten result Iterable[(String, Double)]

+
.flatten + } + 
+
+ +
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with an implicit Semigroup[Double]

.sumByKey 
+ +
+

Aggregate globally into a single Seq[(String, Double)]

.aggregate(aggregator) 
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
+
+ +
+

Spark

+
def spark(input: RDD[Rating]): Seq[(String, Double)] = { + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with addition

+
.reduceByKey(_ + _) 
+
+ +
+

top is an action and collects data back to the driver node

+
.top(topK)(Ordering.by(_._2)) + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Seq[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + import com.twitter.algebird.spark._ + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with addition

+
.reduceByKey(_ + _) + .algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/TopItemsPerUser.scala.html b/TopItemsPerUser.scala.html new file mode 100644 index 0000000..edc301c --- /dev/null +++ b/TopItemsPerUser.scala.html @@ -0,0 +1,2028 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object TopItemsPerUser { + + val topK = 100
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Take top K per group with a priority queue

+
.sortedReverseTake(topK)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Seq[Rating]

+
.flatten + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute top K per key

+
.topByKey(topK, Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+

Spark Naive Approach

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input 
+
+ +
+

groupBy shuffles all data, inefficient

+
.groupBy(_.user
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Convert grouped values to a List[Rating] and sort on a single node, inefficient

+
.flatMap(_.toList.sortBy(-_.score).take(topK)) + } + 
+
+ +
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + import com.twitter.algebird.spark._ + val aggregator = sortedReverseTake[Rating](topK)(Ordering.by(_.score)) + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a Seq[Rating]

.aggregateByKey(aggregator) 
+ +
+

Flatten result Seq[Rating]

+
.flatMap(_._2) + } + 
+
+ +
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue

+
.topByKey(topK)(Ordering.by(_.score)) 
+
+ +
+

Flatten result Seq[Rating]

+
.flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/WordCount.scala.html b/WordCount.scala.html new file mode 100644 index 0000000..f9c51c4 --- /dev/null +++ b/WordCount.scala.html @@ -0,0 +1,1976 @@ + + + + + + + +
+

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object WordCount { + 
+
+ +
+

Scalding

+
def scalding(input: TypedPipe[String]): TypedPipe[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

groupBy is lazy

+
.groupBy(identity
+
+ +
+

Operations like size after groupBy can be lifted into the map phase

+
.size + .toTypedPipe + } + 
+
+ +
+

Scio

+
def scio(input: SCollection[String]): SCollection[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) + .countByValue + } + 
+
+ +
+

Spark Transformation

+
def sparkTransformation(input: RDD[String]): RDD[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

There is no countByValue transformation in Spark, although it is equivalent to mapping +each element to an initial count of 1 and reducing with addition

+
.map((_, 1L)) 
+
+ +
+

reduceByKey can lift function into the map phase

+
.reduceByKey(_ + _) + } + 
+
+ +
+

Spark Action

+
def sparkAction(input: RDD[String]): Seq[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

countByValue is an action and collects data back to the driver node

+
.countByValue() + .toSeq + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000..755c278 --- /dev/null +++ b/index.html @@ -0,0 +1,26 @@ + + +Codestin Search App + +### com.spotify.bdrc.pipeline +- [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally +- [TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items Globally +- [Statistics.scala](Statistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala)) - Compute Basic Descriptive Statistics +- [CountUsers.scala](CountUsers.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala)) - Count the Number of Items of a Given User +- [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item +- [JoinLogs.scala](JoinLogs.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala)) - Join Two Log Datasets and Compute Action Sequence +- [Sessions.scala](Sessions.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala)) - Compute Session Duration and Number of Items from Log Data +- [DistinctItems.scala](DistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala)) - Compute Collection of Distinct Items +- [WordCount.scala](WordCount.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala)) - Classic Word Count +- [MinItemPerUser.scala](MinItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User +- [CountDistinctItems.scala](CountDistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala)) - Count Number of Distinct Items +- [Count.scala](Count.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/Count.scala)) - Count Number of Items +- [AverageScorePerItem.scala](AverageScorePerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala)) - Compute Average Score per Item +- [FieldStatistics.scala](FieldStatistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala)) - 
Compute Basic Descriptive Statistics for Each Field +- [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User +- [InvertedIndex.scala](InvertedIndex.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala)) - Build Inverted Index +- [JoinLogAndMetadata.scala](JoinLogAndMetadata.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala)) - Join Log and Metadata Datasets + + + + From 200989587780b29c56066f94d8939ef19b67eeeb Mon Sep 17 00:00:00 2001 From: Filipe Regadas Date: Fri, 13 Dec 2019 15:38:01 -0500 Subject: [PATCH 09/11] updated site --- index.html | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/index.html b/index.html index 755c278..2e299c5 100644 --- a/index.html +++ b/index.html @@ -3,23 +3,23 @@ Codestin Search App ### com.spotify.bdrc.pipeline -- [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally -- [TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items Globally -- [Statistics.scala](Statistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala)) - Compute Basic Descriptive Statistics -- [CountUsers.scala](CountUsers.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala)) - Count the Number of Items of a Given User -- [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item -- [JoinLogs.scala](JoinLogs.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala)) - Join Two Log Datasets and Compute Action Sequence -- [Sessions.scala](Sessions.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala)) - Compute Session Duration and Number of Items from Log Data -- [DistinctItems.scala](DistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala)) - Compute Collection of Distinct Items -- [WordCount.scala](WordCount.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala)) - Classic Word Count -- [MinItemPerUser.scala](MinItemPerUser.scala.html) 
([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User -- [CountDistinctItems.scala](CountDistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala)) - Count Number of Distinct Items -- [Count.scala](Count.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/Count.scala)) - Count Number of Items -- [AverageScorePerItem.scala](AverageScorePerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala)) - Compute Average Score per Item -- [FieldStatistics.scala](FieldStatistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala)) - Compute Basic Descriptive Statistics for Each Field -- [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User -- [InvertedIndex.scala](InvertedIndex.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala)) - Build Inverted Index -- [JoinLogAndMetadata.scala](JoinLogAndMetadata.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/big-data-rosetta-code/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala)) - Join Log and Metadata Datasets +- [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally +- [TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items Globally +- [Statistics.scala](Statistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala)) - Compute Basic Descriptive Statistics +- [CountUsers.scala](CountUsers.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala)) - Count the Number of Items of a Given User +- [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item +- [JoinLogs.scala](JoinLogs.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala)) - Join Two Log Datasets and Compute Action Sequence +- [Sessions.scala](Sessions.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala)) - Compute Session Duration and Number of Items from Log Data +- [DistinctItems.scala](DistinctItems.scala.html) 
([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala)) - Compute Collection of Distinct Items +- [WordCount.scala](WordCount.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala)) - Classic Word Count +- [MinItemPerUser.scala](MinItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User +- [CountDistinctItems.scala](CountDistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala)) - Count Number of Distinct Items +- [Count.scala](Count.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Count.scala)) - Count Number of Items +- [AverageScorePerItem.scala](AverageScorePerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala)) - Compute Average Score per Item +- [FieldStatistics.scala](FieldStatistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala)) - Compute Basic Descriptive Statistics for Each Field +- [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User +- [InvertedIndex.scala](InvertedIndex.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala)) - Build Inverted Index +- [JoinLogAndMetadata.scala](JoinLogAndMetadata.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala)) - Join Log and Metadata Datasets From 32447a907eb13a571b3f05f87ec63d0ba8a24303 Mon Sep 17 00:00:00 2001 From: Filipe Regadas Date: Fri, 13 Dec 2019 15:47:36 -0500 Subject: [PATCH 10/11] updated site --- index.html | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/index.html b/index.html index 2e299c5..7e519a5 100644 --- a/index.html +++ b/index.html @@ -3,23 +3,23 @@ Codestin Search App ### com.spotify.bdrc.pipeline -- [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally -- [TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items Globally -- [Statistics.scala](Statistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala)) - Compute Basic Descriptive Statistics +- [AverageScorePerItem.scala](AverageScorePerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala)) - Compute Average Score per Item +- [Count.scala](Count.scala.html) 
([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Count.scala)) - Count Number of Items +- [CountDistinctItems.scala](CountDistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala)) - Count Number of Distinct Items - [CountUsers.scala](CountUsers.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala)) - Count the Number of Items of a Given User -- [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item -- [JoinLogs.scala](JoinLogs.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala)) - Join Two Log Datasets and Compute Action Sequence -- [Sessions.scala](Sessions.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala)) - Compute Session Duration and Number of Items from Log Data - [DistinctItems.scala](DistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala)) - Compute Collection of Distinct Items -- [WordCount.scala](WordCount.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala)) - Classic Word Count -- [MinItemPerUser.scala](MinItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User -- [CountDistinctItems.scala](CountDistinctItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala)) - Count Number of Distinct Items -- [Count.scala](Count.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Count.scala)) - Count Number of Items -- [AverageScorePerItem.scala](AverageScorePerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala)) - Compute Average Score per Item - [FieldStatistics.scala](FieldStatistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala)) - Compute Basic Descriptive Statistics for Each Field -- [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User - [InvertedIndex.scala](InvertedIndex.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala)) - Build Inverted Index - [JoinLogAndMetadata.scala](JoinLogAndMetadata.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala)) - Join Log and Metadata Datasets +- [JoinLogs.scala](JoinLogs.scala.html) 
([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala)) - Join Two Log Datasets and Compute Action Sequence
+- [MaxItemPerUser.scala](MaxItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala)) - Compute One Item with Max Score per User
+- [MinItemPerUser.scala](MinItemPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala)) - Compute One Item with Min Score per User
+- [Sessions.scala](Sessions.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala)) - Compute Session Duration and Number of Items from Log Data
+- [Statistics.scala](Statistics.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala)) - Compute Basic Descriptive Statistics
+- [SumPerItem.scala](SumPerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala)) - Compute the Sum of Scores per Item
+- [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally
+- [TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items per User
+- [WordCount.scala](WordCount.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala)) - Classic Word Count

From 46b47166d8779b00f99a220692bef3cbc076ff74 Mon Sep 17 00:00:00 2001
From: Kellen Dye
Date: Thu, 9 Nov 2023 13:41:23 -0500
Subject: [PATCH 11/11] updated site

---
 AverageScorePerItem.scala.html | 23 ++++++++------
 Count.scala.html               | 22 ++++++++-----
 CountDistinctItems.scala.html  | 22 ++++++++-----
 CountUsers.scala.html          | 17 +++++++----
 DistinctItems.scala.html       | 11 ++++---
 FieldStatistics.scala.html     | 39 +++++++++++++----------
 InvertedIndex.scala.html       | 15 +++++----
 JoinLogAndMetadata.scala.html  | 56 +++++++++++++++++++---------------
 JoinLogs.scala.html            | 43 ++++++++++++++------------
 MaxItemPerUser.scala.html      | 39 ++++++++++++-----------
 MinItemPerUser.scala.html      | 39 ++++++++++++-----------
 Sessions.scala.html            | 32 +++++++++----------
 Statistics.scala.html          | 37 +++++++++++++---------
 SumPerItem.scala.html          | 26 +++++++++-------
 TopItems.scala.html            | 30 ++++++++++--------
 TopItemsPerUser.scala.html     | 21 ++++++++-----
 WordCount.scala.html           | 16 ++++++----
 index.html                     | 20 +++++++++---
 18 files changed, 298 insertions(+), 210 deletions(-)

diff --git a/AverageScorePerItem.scala.html b/AverageScorePerItem.scala.html
index 546c951..ba15ce7 100644
--- a/AverageScorePerItem.scala.html
+++ b/AverageScorePerItem.scala.html
@@ -285,7 +285,8 @@ 
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { input .groupBy(_.user
@@ -310,7 +311,8 @@
-

Scalding with Algebird Aggregator

+
+

Scalding with Algebird Aggregator

def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { import com.twitter.algebird.AveragedValue input @@ -331,31 +333,33 @@
-

Scio

+
+

Scio

def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { input - .keyBy(_.user
+ .keyBy(_.user

Map into (sum, count)

-
.mapValues(x => (x.score, 1L)) 
+
.mapValues(x => (x.score, 1L)) 

Sum both per key with an implicit Semigroup[(Double, Long)]

-
.sumByKey 
+

Map (sum, count) into average

-
.mapValues(p => p._1 / p._2) +
.mapValues(p => p._1 / p._2) }  
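The (sum, count) encoding works because the pair combines associatively, which is what lets sumByKey pre-aggregate map-side before the shuffle. A minimal local sketch of the same arithmetic, in plain Scala (illustration only):

```scala
// Merging two partial (sum, count) pairs is associative, so it can
// happen before the shuffle; the average is computed only at the end.
val a = (10.0, 4L) // partial (sum of scores, number of ratings)
val b = (5.0, 1L)
val merged = (a._1 + b._1, a._2 + b._2) // (15.0, 5L)
val mean = merged._1 / merged._2        // 3.0
```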
-

Spark

+
+

Spark

Summon an Algebird Semigroup[(Double, Long)] with implicit argument

def spark(input: RDD[Rating])(implicit sg: Semigroup[(Double, Long)]): RDD[(String, Double)] = { input @@ -380,7 +384,8 @@
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkWithAlgebird(input: RDD[Rating]): RDD[(String, Double)] = { import com.twitter.algebird.AveragedValue import com.twitter.algebird.spark._ diff --git a/Count.scala.html b/Count.scala.html index bc3c436..f726ff1 100644 --- a/Count.scala.html +++ b/Count.scala.html @@ -284,7 +284,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { input .map(_ => 1L
@@ -299,7 +300,8 @@
-

Scalding with Algebird Aggregator

+
+

Scalding with Algebird Aggregator

def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { import com.twitter.algebird.Aggregator.size input @@ -310,37 +312,41 @@
-

Scio

+
+

Scio

def scio(input: SCollection[Rating]): SCollection[Long] = input.count  
-

Scio with Algebird Aggregator

+
+

Scio with Algebird Aggregator

def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { import com.twitter.algebird.Aggregator.size input - .aggregate(size) + .aggregate(size) }  
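Algebird aggregators also run on plain Scala collections, which makes them easy to unit-test. A quick sketch of what size computes (illustration only):

```scala
import com.twitter.algebird.Aggregator.size

// size maps every element to 1L and sums with the Long monoid,
// so applied locally it simply counts the elements.
val n: Long = size(Seq("a", "b", "c")) // 3L
```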
-

Spark

+
+

Spark

def spark(input: RDD[Rating]): Long = { input 

count is an action and collects data back to the driver node

-
.count +
.count }  
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkWithAlgebird(input: RDD[Rating]): Long = { import com.twitter.algebird.Aggregator.size import com.twitter.algebird.spark._ diff --git a/CountDistinctItems.scala.html b/CountDistinctItems.scala.html index b4852a0..b796e5f 100644 --- a/CountDistinctItems.scala.html +++ b/CountDistinctItems.scala.html @@ -285,7 +285,8 @@
-

Scalding Exact Approach

+
+

Scalding Exact Approach

def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { input .map(_.item
@@ -306,7 +307,8 @@
-

Scalding Approximate Approach

+
+

Scalding Approximate Approach

def scaldingApproxWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Double] = { import com.twitter.algebird.HyperLogLogAggregator val aggregator = HyperLogLogAggregator.sizeAggregator(bits = 12) @@ -327,10 +329,11 @@
-

Scio Exact Approach

+
+

Scio Exact Approach

def scio(input: SCollection[Rating]): SCollection[Long] = { input - .map(_.item) + .map(_.item) .distinct .count } @@ -338,17 +341,19 @@
-

Scio Approximate Approach

+
+

Scio Approximate Approach

def scioApprox(input: SCollection[Rating]): SCollection[Long] = { input - .map(_.item) + .map(_.item) .countApproxDistinct() }  
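The bits parameter in the HyperLogLog aggregator trades memory for accuracy: with 2^bits registers the typical relative error is about 1.04 / sqrt(2^bits). A small sketch of that arithmetic (illustration only):

```scala
// Expected HyperLogLog standard error for a given register-index width,
// e.g. the bits = 12 used in the Scalding approximate approach above.
def hllStdError(bits: Int): Double = 1.04 / math.sqrt(math.pow(2.0, bits))

hllStdError(12) // about 0.016, i.e. roughly 1.6% relative error
```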
-

Spark Exact Approach

+
+

Spark Exact Approach

def spark(input: RDD[Rating]): Long = { input .map(_.item) @@ -359,7 +364,8 @@
-

Spark Approximate Approach

+
+

Spark Approximate Approach

def sparkApprox(input: RDD[Rating]): Long = { input .map(_.item) diff --git a/CountUsers.scala.html b/CountUsers.scala.html index 5c088e2..2f18be6 100644 --- a/CountUsers.scala.html +++ b/CountUsers.scala.html @@ -284,7 +284,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { input .filter(_.user == "Smith") @@ -300,7 +301,8 @@
-

Scalding with Algebird Aggregator

+
+

Scalding with Algebird Aggregator

def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { import com.twitter.algebird.Aggregator.count input 
@@ -321,7 +323,8 @@
-

Scio with Algebird Aggregator

+
+

Scio with Algebird Aggregator

def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { import com.twitter.algebird.Aggregator.count input 
@@ -329,13 +332,14 @@

Aggregate globally into a single Long

-
.aggregate(count(_.user == "Smith")) +
.aggregate(count((_: Rating).user == "Smith")) }  
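count(pred) maps each element to 1L when the predicate holds and 0L otherwise, then sums; the explicit (_: Rating) annotation only helps type inference. The same aggregator can be sanity-checked on a local collection (illustration only):

```scala
import com.twitter.algebird.Aggregator.count

// Counting elements that satisfy a predicate, outside any pipeline.
val smiths = count((_: String) == "Smith")
val n: Long = smiths(Seq("Smith", "Jones", "Smith")) // 2L
```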
-

Spark

+
+

Spark

def spark(input: RDD[Rating]): Long = { input .filter(_.user == "Smith"
@@ -349,7 +353,8 @@
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkWithAlgebird(input: RDD[Rating]): Long = { import com.twitter.algebird.Aggregator.count import com.twitter.algebird.spark._ diff --git a/DistinctItems.scala.html b/DistinctItems.scala.html index 922a9a8..ca2a999 100644 --- a/DistinctItems.scala.html +++ b/DistinctItems.scala.html @@ -284,7 +284,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[String] = { input .map(_.item) @@ -294,17 +295,19 @@
-

Scio

+
+

Scio

def scio(input: SCollection[Rating]): SCollection[String] = { input - .map(_.item) + .map(_.item) .distinct }  
-

Spark

+
+

Spark

def spark(input: RDD[Rating]): RDD[String] = { input .map(_.item) diff --git a/FieldStatistics.scala.html b/FieldStatistics.scala.html index ba0d075..a28b29b 100644 --- a/FieldStatistics.scala.html +++ b/FieldStatistics.scala.html @@ -274,6 +274,7 @@

Input is a collection of case classes

package com.spotify.bdrc.pipeline +import com.spotify.scio.coders.Coder import com.spotify.scio.values.SCollection import com.twitter.scalding.TypedPipe import org.apache.spark.rdd.RDD @@ -283,14 +284,16 @@ case class User(age: Int, income: Double, score: Double) case class Stats(max: Double, min: Double, mean: Double, stddev: Double) case class UserStats(age: Stats, income: Stats, score: Stats) + + import com.twitter.algebird._ + implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments]  
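The momentsCoder line above is needed because Algebird's Moments is not a case class, so Scio cannot derive a coder for it and would otherwise fall back at runtime with a warning. The same pattern applies to any third-party type; a sketch with a hypothetical class:

```scala
import com.spotify.scio.coders.Coder

// Hypothetical third-party class with no derivable coder.
class LegacyStats(val n: Long, val mean: Double)

// Register an explicit Kryo fallback, as done for Moments above.
implicit val legacyStatsCoder: Coder[LegacyStats] = Coder.kryo[LegacyStats]
```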
-

Algebird Aggregator

-
def aggregator = { - import com.twitter.algebird._ - 
+
+

Algebird Aggregator

+
def aggregator = { 
@@ -338,33 +341,36 @@ minScoreOp, momentsScoreOp ).andThenPresent { t => - val (maxAge, minAge, mAge, maxIncome, minIncome, mIncome, maxScore, minScore, mScore) = t - UserStats( - age = Stats(maxAge, minAge, mAge.mean, mAge.stddev), - income = Stats(maxIncome, minIncome, mIncome.mean, mIncome.stddev), - score = Stats(maxScore, minScore, mScore.mean, mScore.stddev) - ) - } + val (maxAge, minAge, mAge, maxIncome, minIncome, mIncome, maxScore, minScore, mScore) = t + UserStats( + age = Stats(maxAge, minAge, mAge.mean, mAge.stddev), + income = Stats(maxIncome, minIncome, mIncome.mean, mIncome.stddev), + score = Stats(maxScore, minScore, mScore.mean, mScore.stddev) + ) + } }  
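MultiAggregator fuses several aggregators into one that runs in a single pass over the input, and andThenPresent reshapes the resulting tuple. A smaller sketch of the same pattern, with two aggregators instead of nine (illustration only):

```scala
import com.twitter.algebird.{Aggregator, MultiAggregator}

// Max and min computed in one pass, then presented as a range.
val maxOp = Aggregator.max[Double]
val minOp = Aggregator.min[Double]
val rangeOp = MultiAggregator(maxOp, minOp).andThenPresent { case (mx, mn) => mx - mn }

val range: Double = rangeOp(Seq(1.0, 4.0, 2.5)) // 3.0
```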
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[User]): TypedPipe[UserStats] = input.aggregate(aggregator)  
-

Scio

+
+

Scio

def scio(input: SCollection[User]): SCollection[UserStats] = - input.aggregate(aggregator) + input.aggregate(aggregator)  
-

Spark

+
+

Spark

def spark(input: RDD[User]): UserStats = { 
@@ -383,7 +389,8 @@
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkAlgebird(input: RDD[User]): UserStats = { import com.twitter.algebird.spark._ input.algebird.aggregate(aggregator) diff --git a/InvertedIndex.scala.html b/InvertedIndex.scala.html index 1cc9cdd..d5318ea 100644 --- a/InvertedIndex.scala.html +++ b/InvertedIndex.scala.html @@ -291,7 +291,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Document]): TypedPipe[Posting] = { input 
@@ -311,26 +312,28 @@
-

Scio

+
+

Scio

def scio(input: SCollection[Document]): SCollection[Posting] = { input 

Split text and output (word, document ID)

-
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 

Group document IDs per key into Iterable[Int]

-
.groupByKey - .map(kv => Posting(kv._1, kv._2.toSeq)) +
.groupByKey + .map(kv => Posting(kv._1, kv._2.toSeq)) }  
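The same logic can be simulated on plain collections to see the expected output shape (illustration only; Document and Posting here mirror the case classes used in this file):

```scala
case class Document(id: Int, text: String)
case class Posting(word: String, ids: Seq[Int])

val docs = Seq(Document(1, "red apple"), Document(2, "green apple"))
val postings = docs
  .flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id)))
  .groupBy(_._1)
  .map { case (w, pairs) => Posting(w, pairs.map(_._2)) }
// postings contains Posting("apple", Seq(1, 2)): the word appears in both documents.
```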
-

Spark

+
+

Spark

def spark(input: RDD[Document]): RDD[Posting] = { input 
diff --git a/JoinLogAndMetadata.scala.html b/JoinLogAndMetadata.scala.html index b28af3e..2ab9877 100644 --- a/JoinLogAndMetadata.scala.html +++ b/JoinLogAndMetadata.scala.html @@ -288,7 +288,8 @@
-

Scalding Naive Approach

+
+

Scalding Naive Approach

def scaldingNaive( left: TypedPipe[LogEvent], right: TypedPipe[UserMeta] @@ -310,9 +311,8 @@

Map into (track, age)

-
.map { - case (logEvent, userMeta) => - (logEvent.track, userMeta.age.toDouble) +
.map { case (logEvent, userMeta) => + (logEvent.track, userMeta.age.toDouble) } .group 
@@ -326,7 +326,8 @@
-

Scalding with Hash Join

+
+

Scalding with Hash Join

hashJoin replicates the smaller RHS to all mappers on the LHS

def scaldingHashJoin( left: TypedPipe[LogEvent], @@ -356,36 +357,38 @@
-

Scio Naive Approach

+
+

Scio Naive Approach

def scioNaive( left: SCollection[LogEvent], right: SCollection[UserMeta] ): SCollection[(String, Double)] = { import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) 
+ val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 

Join as (user, (track, age))

lhs - .join(rhs
+ .join(rhs

Drop user key to make track as new key in (track, age)

-
.values 
+
.values 

Aggregate average age per track

-
-

Scio with Side Input

+
+

Scio with Side Input

Side input makes RHS available on all workers

def scioSideInput( left: SCollection[LogEvent], @@ -397,49 +400,51 @@

Convert RHS to a side input of Map[String, Double]

-
val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput +
val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput  

Replicate RHS to each worker

+ .withSideInputs(rhs

Access side input via the context

-
.map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } 
+
.map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } 

Convert back to regular SCollection

-

Scio with Hash Join

+
+

Scio with Hash Join

hashJoin is a shortcut for the side input approach

def scioHashJoin( left: SCollection[LogEvent], right: SCollection[UserMeta] ): SCollection[(String, Double)] = { import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) lhs - .hashJoin(rhs) - .values - .aggregateByKey(AveragedValue.aggregator) + .hashJoin(rhs) + .values + .aggregateByKey(AveragedValue.aggregator) }  
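All three map-side variants here (hashJoin, side input, broadcast) boil down to the same idea: keep the small side in memory and turn the join into a per-element lookup. In miniature, on plain collections (illustration only):

```scala
// RHS small enough to replicate to every worker.
val ages = Map("u1" -> 25.0, "u2" -> 32.0)
// LHS: (user, track) log events.
val events = Seq(("u1", "track-a"), ("u2", "track-b"), ("u3", "track-c"))

// The "join" is a lookup; the large side is never shuffled.
val joined = events.flatMap { case (user, track) =>
  ages.get(user).map(age => (track, age))
}
// Seq(("track-a", 25.0), ("track-b", 32.0))
// u3 has no metadata and is dropped, matching inner-join semantics.
```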
-

Spark Naive Approach

+
+

Spark Naive Approach

def sparkNaive(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { import com.twitter.algebird.spark._ import com.twitter.algebird.AveragedValue @@ -467,7 +472,8 @@
-

Spark with Broadcast Variable

+
+

Spark with Broadcast Variable

def sparkBroadcast(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { import com.twitter.algebird.spark._ import com.twitter.algebird.AveragedValue @@ -483,7 +489,7 @@

Collect RHS to driver memory and broadcast back to workers

val map = right.map(u => (u.user, u.age.toDouble)).collectAsMap() - val b = sc.broadcast(map) + val b = sc.broadcast(map) left 
diff --git a/JoinLogs.scala.html b/JoinLogs.scala.html index 43c5b22..0c801c4 100644 --- a/JoinLogs.scala.html +++ b/JoinLogs.scala.html @@ -295,8 +295,10 @@

Detect whether a pair of (event type, LogEvent) tuples matches a play-then-save sequence

def detectPlaySaveSequence(pair: Seq[(String, LogEvent)]): Option[String] = { val Seq(first, second) = pair - if (first._1 == "play" && second._1 == "save" && first._2.track == second._2.track && - second._2.timestamp - first._2.timestamp <= gapDuration) { + if ( + first._1 == "play" && second._1 == "save" && first._2.track == second._2.track && + second._2.timestamp - first._2.timestamp <= gapDuration + ) { Some(first._2.track) } else { None @@ -306,7 +308,8 @@
-

Scalding

+
+

Scalding

def scalding( playEvents: TypedPipe[LogEvent], saveEvents: TypedPipe[LogEvent] @@ -339,7 +342,8 @@
-

Scio

+
+

Scio

def scio( playEvents: SCollection[LogEvent], saveEvents: SCollection[LogEvent] @@ -348,32 +352,32 @@

Map inputs to key-values and add event type information

-
val plays = playEvents.map(e => (e.user, ("play", e))) - val saves = saveEvents.map(e => (e.user, ("save", e))) +
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) plays - .cogroup(saves
+ .cogroup(saves

Iterables of play and save events for the user

-
.flatMapValues { - case (p, s) => - (p ++ s).toList - .sortBy(_._2.timestamp
+
.flatMapValues { case (p, s) => + (p ++ s).toList + .sortBy(_._2.timestamp

Neighboring pairs

-
.sliding(2) - .flatMap(detectPlaySaveSequence) +
.sliding(2) + .flatMap(detectPlaySaveSequence) } }  
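The sliding(2) scan is easy to check on plain data: sort one user's events by time, then examine neighboring pairs. A local sketch with a hypothetical 5-unit gap threshold (illustration only):

```scala
// One user's (event type, timestamp) events, deliberately out of order.
val events = Seq(("save", 3L), ("play", 1L), ("play", 9L))
val gapDuration = 5L // hypothetical threshold

val playThenSave = events
  .sortBy(_._2)
  .sliding(2)
  .count {
    case Seq((a, t1), (b, t2)) => a == "play" && b == "save" && t2 - t1 <= gapDuration
    case _                     => false
  }
// playThenSave == 1: ("play", 1L) followed by ("save", 3L) within the gap.
```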
-

Spark

+
+

Spark

def spark(playEvents: RDD[LogEvent], saveEvents: RDD[LogEvent]): RDD[(String, String)] = { 
@@ -384,20 +388,19 @@ plays .cogroup(saves) - .flatMapValues { - case (p, s) => 
+ .flatMapValues { case (p, s) => 

Iterables of play and save events for the user

-
(p ++ s).toList - .sortBy(_._2.timestamp
+
(p ++ s).toList + .sortBy(_._2.timestamp

Neighboring pairs

-
.sliding(2) - .flatMap(detectPlaySaveSequence) +
.sliding(2) + .flatMap(detectPlaySaveSequence) } } diff --git a/MaxItemPerUser.scala.html b/MaxItemPerUser.scala.html index 9cc019c..253db7e 100644 --- a/MaxItemPerUser.scala.html +++ b/MaxItemPerUser.scala.html @@ -284,7 +284,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { input .groupBy(_.user
@@ -299,7 +300,8 @@
-

Scalding with Algebird Aggregator

+
+

Scalding with Algebird Aggregator

def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { import com.twitter.algebird.Aggregator.maxBy input @@ -315,20 +317,21 @@
-

Scio

+
+

Scio

def scio(input: SCollection[Rating]): SCollection[Rating] = { input - .keyBy(_.user
+ .keyBy(_.user

Compute top one item per key as an Iterable[Rating]

-
.topByKey(1, Ordering.by(_.score)) 
+
.topByKey(1)(Ordering.by(_.score)) 

Drop user key

-
.values 
+
.values 
@@ -339,26 +342,26 @@
-

Scio with Algebird Aggregator

+
+

Scio with Algebird Aggregator

def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { import com.twitter.algebird.Aggregator.maxBy input - .keyBy(_.user
+ .keyBy(_.user

Aggregate per key into a single Rating based on Double value via _.score. Explicit type due to type inference limitation.

-
.aggregateByKey(maxBy { x: Rating => - x.score - }) - .values +
.aggregateByKey(maxBy { x: Rating => x.score }) + .values }  
-

Spark

+
+

Spark

def spark(input: RDD[Rating]): RDD[Rating] = { input .keyBy(_.user
@@ -373,7 +376,8 @@
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { import com.twitter.algebird.Aggregator.maxBy import com.twitter.algebird.spark._ @@ -385,16 +389,15 @@

Aggregate per key into a single Rating based on Double value via _.score. Explicit type due to type inference limitation.

-
.aggregateByKey(maxBy { x: Rating => - x.score - }) +
.aggregateByKey(maxBy { x: Rating => x.score }) .values }  
-

Spark with MLLib

+
+

Spark with MLLib

def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ input diff --git a/MinItemPerUser.scala.html b/MinItemPerUser.scala.html index a729555..48d6a59 100644 --- a/MinItemPerUser.scala.html +++ b/MinItemPerUser.scala.html @@ -284,7 +284,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { input .groupBy(_.user
@@ -299,7 +300,8 @@
-

Scalding with Algebird Aggregator

+
+

Scalding with Algebird Aggregator

def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { import com.twitter.algebird.Aggregator.minBy input @@ -315,20 +317,21 @@
-

Scio

+
+

Scio

def scio(input: SCollection[Rating]): SCollection[Rating] = { input - .keyBy(_.user
+ .keyBy(_.user

Compute top one item per key as an Iterable[Rating] with a reverse comparator

-
.topByKey(1, Ordering.by(-_.score)) 
+
.topByKey(1)(Ordering.by(-_.score)) 

Drop user key

-
.values 
+
.values 
@@ -339,26 +342,26 @@
-

Scio with Algebird Aggregator

+
+

Scio with Algebird Aggregator

def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { import com.twitter.algebird.Aggregator.minBy input - .keyBy(_.user
+ .keyBy(_.user

Aggregate per key into a single Rating based on Double value via _.score. Explicit type due to type inference limitation.

-
.aggregateByKey(minBy { x: Rating => - x.score - }) - .values +
.aggregateByKey(minBy { x: Rating => x.score }) + .values }  
-

Spark

+
+

Spark

def spark(input: RDD[Rating]): RDD[Rating] = { input .keyBy(_.user
@@ -373,7 +376,8 @@
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { import com.twitter.algebird.Aggregator.minBy import com.twitter.algebird.spark._ @@ -385,16 +389,15 @@

Aggregate per key into a single Rating based on Double value via _.score. Explicit type due to type inference limitation.

-
.aggregateByKey(minBy { x: Rating => - x.score - }) +
.aggregateByKey(minBy { x: Rating => x.score }) .values }  
-

Spark with MLLib

+
+

Spark with MLLib

def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ input diff --git a/Sessions.scala.html b/Sessions.scala.html index 2f8792d..58fde72 100644 --- a/Sessions.scala.html +++ b/Sessions.scala.html @@ -320,7 +320,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[LogEvent]): TypedPipe[Session] = { input .groupBy(_.user
@@ -339,29 +340,29 @@

Map over each (user, session items)

-
.map { - case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) +
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) } }  
-

Scio

+
+

Scio

def scio(input: SCollection[LogEvent]): SCollection[Session] = { input 

Values in groupBy are sorted by timestamp

-
.timestampBy(e => new Instant(e.timestamp)) 
+
.timestampBy(e => new Instant(e.timestamp)) 

No secondary sort in Scio, so shuffle all items

-
.groupBy(_.user) - .flatMapValues { +
.groupBy(_.user) + .flatMapValues { _.iterator 
@@ -374,16 +375,16 @@

Map over each (user, session items)

-
.map { - case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) +
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) } }  
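Once a user's events are sorted, the session summary is just first/last arithmetic. A local sketch with a simplified stand-in for the Session class (illustration only):

```scala
// Simplified stand-in for the Session case class used above.
case class Session(user: String, duration: Long, numItems: Int)

val timestamps = Seq(100L, 130L, 190L) // one user's sorted event times
val session = Session("u1", timestamps.last - timestamps.head, timestamps.size)
// Session("u1", 90L, 3)
```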
-

Spark

+
+

Spark

def spark(input: RDD[LogEvent]): RDD[Session] = { input 
@@ -397,7 +398,7 @@

Order of values after shuffle is not guaranteed

-
.toList +
.toList .sortBy(_.timestamp) .iterator 
@@ -411,9 +412,8 @@

Map over each (user, session items)

-
.map { - case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) +
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) } } diff --git a/Statistics.scala.html b/Statistics.scala.html index bccb0ba..5ce9cd9 100644 --- a/Statistics.scala.html +++ b/Statistics.scala.html @@ -275,6 +275,7 @@
package com.spotify.bdrc.pipeline import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.coders.Coder import com.spotify.scio.values.SCollection import com.twitter.scalding.TypedPipe import org.apache.spark.rdd.RDD @@ -282,14 +283,16 @@ object Statistics { case class Stats(max: Double, min: Double, sum: Double, count: Long, mean: Double, stddev: Double) + + import com.twitter.algebird._ + implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments]  
-

Algebird Aggregator

-
def aggregator = { - import com.twitter.algebird._ - 
+
+

Algebird Aggregator

+
def aggregator = { 
@@ -313,41 +316,44 @@

Apply 4 Aggregators on the same input, present result tuple 4 of (Double, Double, Double, Moments) as Stats

MultiAggregator(maxOp, minOp, sumOp, momentsOp) - .andThenPresent { - case (max, min, sum, moments) => - Stats(max, min, sum, moments.count, moments.mean, moments.stddev) + .andThenPresent { case (max, min, sum, moments) => + Stats(max, min, sum, moments.count, moments.mean, moments.stddev) } }  
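Moments is what makes a single-pass stddev possible: it accumulates count, mean, and higher moments in a mergeable structure. A local sketch, assuming Algebird's Moments.aggregator (illustration only):

```scala
import com.twitter.algebird.Moments

// Accumulate two observations; count/mean/stddev fall out at the end.
val m = Moments.aggregator(Seq(1.0, 3.0))
(m.count, m.mean, m.stddev) // (2, 2.0, 1.0)
```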
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[Stats] = input.aggregate(aggregator)  
-

Scio

+
+

Scio

def scio(input: SCollection[Rating]): SCollection[Stats] = { input - .map(_.score) + .map(_.score) .stats - .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) + .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) }  
-

Scio with Algebird Aggregator

+
+

Scio with Algebird Aggregator

def scioAlgebird(input: SCollection[Rating]): SCollection[Stats] = - input.aggregate(aggregator) + input.aggregate(aggregator)  
-

Spark

+
+

Spark

def spark(input: RDD[Rating]): Stats = { val s = input.map(_.score).stats() Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev) @@ -356,7 +362,8 @@
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkAlgebird(input: RDD[Rating]): Stats = { import com.twitter.algebird.spark._ input.algebird.aggregate(aggregator) diff --git a/SumPerItem.scala.html b/SumPerItem.scala.html index aad9e67..07ba283 100644 --- a/SumPerItem.scala.html +++ b/SumPerItem.scala.html @@ -284,7 +284,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { input .groupBy(_.item) @@ -300,7 +301,8 @@
-

Scalding with Algebird Aggregator

+
+

Scalding with Algebird Aggregator

def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { import com.twitter.algebird.Aggregator.prepareMonoid input @@ -317,17 +319,19 @@
-

Scio

+
+

Scio

def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { input - .map(x => (x.item, x.score)) - .sumByKey + .map(x => (x.item, x.score)) + .sumByKey }  
-

Spark

+
+

Spark

def spark(input: RDD[Rating]): RDD[(String, Double)] = { input .map(x => (x.item, x.score)) @@ -337,7 +341,8 @@
-

Spark with Algebird Semigroup

+
+

Spark with Algebird Semigroup

def sparkWithAlgebird1(input: RDD[Rating]): RDD[(String, Double)] = { import com.twitter.algebird.spark._ input @@ -353,7 +358,8 @@
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkWithAlgebird2(input: RDD[Rating]): RDD[(String, Double)] = { import com.twitter.algebird.Aggregator.prepareMonoid import com.twitter.algebird.spark._ @@ -365,9 +371,7 @@

Aggregate per key with an aggregator that converts UserItemData to Double via _.score before reduce. Explicit type due to type inference limitation.

-
.aggregateByKey(prepareMonoid { x: Rating => - x.score - }) +
.aggregateByKey(prepareMonoid { x: Rating => x.score }) } } 
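prepareMonoid builds an aggregator from a prepare function plus the implicit Monoid of its result, here Monoid[Double]. A local sketch (illustration only):

```scala
import com.twitter.algebird.Aggregator.prepareMonoid

// Prepare each element into a Double, then reduce with Monoid[Double] (sum).
val totalLength = prepareMonoid { s: String => s.length.toDouble }
val total: Double = totalLength(Seq("ab", "c")) // 3.0
```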
diff --git a/TopItems.scala.html b/TopItems.scala.html index d788649..131e142 100644 --- a/TopItems.scala.html +++ b/TopItems.scala.html @@ -286,7 +286,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { input .map(x => (x.item, x.score)) @@ -321,7 +322,8 @@
-

Scalding with Algebird Aggregator

+
+

Scalding with Algebird Aggregator

def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { import com.twitter.algebird.Aggregator.sortedReverseTake val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) @@ -349,20 +351,21 @@
-

Scio

+
+

Scio

def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { input - .map(x => (x.item, x.score)) 
+ .map(x => (x.item, x.score)) 

Sum values with an implicit Semigroup[Double]

- +

Compute top K as an Iterable[(String, Double)]

-
.top(topK, Ordering.by(_._2)) 
+
.top(topK)(Ordering.by(_._2)) 
@@ -373,22 +376,23 @@
-

Scio with Algebird Aggregator

+
+

Scio with Algebird Aggregator

def scioWithAlgebird(input: SCollection[Rating]): SCollection[(String, Double)] = { import com.twitter.algebird.Aggregator.sortedReverseTake val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) input - .map(x => (x.item, x.score)) 
+ .map(x => (x.item, x.score)) 

Sum values with an implicit Semigroup[Double]

- +

Aggregate globally into a single Seq[(String, Double)]

- +
@@ -399,7 +403,8 @@
-

Spark

+
+

Spark

def spark(input: RDD[Rating]): Seq[(String, Double)] = { input .map(x => (x.item, x.score)) 
@@ -418,7 +423,8 @@
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkWithAlgebird(input: RDD[Rating]): Seq[(String, Double)] = { import com.twitter.algebird.Aggregator.sortedReverseTake import com.twitter.algebird.spark._ diff --git a/TopItemsPerUser.scala.html b/TopItemsPerUser.scala.html index edc301c..85e2c5b 100644 --- a/TopItemsPerUser.scala.html +++ b/TopItemsPerUser.scala.html @@ -286,7 +286,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { input .groupBy(_.user
@@ -310,20 +311,21 @@
-

Scio

+
+

Scio

def scio(input: SCollection[Rating]): SCollection[Rating] = { input - .keyBy(_.user
+ .keyBy(_.user

Compute top K per key

-
.topByKey(topK, Ordering.by(_.score)) 
+
.topByKey(topK)(Ordering.by(_.score)) 

Drop user key

-
.values 
+
.values 
@@ -334,7 +336,8 @@
-

Spark Naive Approach

+
+

Spark Naive Approach

def spark(input: RDD[Rating]): RDD[Rating] = { input 
@@ -357,7 +360,8 @@
-

Spark with Algebird Aggregator

+
+

Spark with Algebird Aggregator

def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { import com.twitter.algebird.Aggregator.sortedReverseTake import com.twitter.algebird.spark._ @@ -380,7 +384,8 @@
-

Spark with MLLib

+
+

Spark with MLLib

def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ input diff --git a/WordCount.scala.html b/WordCount.scala.html index f9c51c4..c1ab905 100644 --- a/WordCount.scala.html +++ b/WordCount.scala.html @@ -283,7 +283,8 @@
-

Scalding

+
+

Scalding

def scalding(input: TypedPipe[String]): TypedPipe[(String, Long)] = { input .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
@@ -303,17 +304,19 @@
-

Scio

+
+

Scio

def scio(input: SCollection[String]): SCollection[(String, Long)] = { input - .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) - .countByValue + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) + .countByValue }  
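The tokenizer regex is shared by all three implementations; it splits on runs of anything that is not a letter or an apostrophe. On a plain string (illustration only):

```scala
val words = "It's a test, isn't it?".split("[^a-zA-Z']+").filter(_.nonEmpty)
// Array("It's", "a", "test", "isn't", "it")
```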
-

Spark Transformation

+
+

Spark Transformation

def sparkTransformation(input: RDD[String]): RDD[(String, Long)] = { input .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
@@ -333,7 +336,8 @@
-

Spark Action

+
+

Spark Action

def sparkAction(input: RDD[String]): Seq[(String, Long)] = { input .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
diff --git a/index.html b/index.html
index 7e519a5..94e6dde 100644
--- a/index.html
+++ b/index.html
@@ -1,7 +1,12 @@
-Codestin Search App
-
+<head>
+<link media="all" rel="stylesheet"
+      href="https://bootswatch.com/4/spacelab/bootstrap.css" />
+<title>Codestin Search App</title>
+</head>
+<body>
+<textarea hidden id="sourceTA">
### com.spotify.bdrc.pipeline
- [AverageScorePerItem.scala](AverageScorePerItem.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala)) - Compute Average Score per Item
- [Count.scala](Count.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/Count.scala)) - Count Number of Items
@@ -20,7 +25,14 @@
- [TopItems.scala](TopItems.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala)) - Compute Top K Items Globally
- [TopItemsPerUser.scala](TopItemsPerUser.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala)) - Compute Top K Items per User
- [WordCount.scala](WordCount.scala.html) ([source](https://github.com/spotify/big-data-rosetta-code/blob/master/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala)) - Classic Word Count
-
+
+
- + +