diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index c436f0a..0000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,8 +0,0 @@ -version: 2 -updates: - - package-ecosystem: github-actions - directory: "/" - schedule: - interval: daily - time: "04:00" - open-pull-requests-limit: 10 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index e713d2d..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: CI -on: [push, pull_request] - -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4.1.1 - - name: Cache sbt - uses: coursier/cache-action@v6 - - name: Java 8 setup - uses: olafurpg/setup-scala@v14 - with: - java-version: 1.8.0 - - run: sbt test diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 9484ce8..0000000 --- a/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -.bsp -.idea -target diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/.scalafmt.conf b/.scalafmt.conf deleted file mode 100644 index dea9466..0000000 --- a/.scalafmt.conf +++ /dev/null @@ -1,29 +0,0 @@ -version = "3.5.9" -maxColumn = 100 - -binPack.literalArgumentLists = true - -continuationIndent { - callSite = 2 - defnSite = 2 -} - -newlines { - alwaysBeforeMultilineDef = false - sometimesBeforeColonInMethodReturnType = true -} - -verticalMultiline { - newlineAfterImplicitKW = true - newlineBeforeImplicitKW = true -} - -docstrings = JavaDoc - -project.git = false - -rewrite { - rules = [PreferCurlyFors, RedundantBraces, RedundantParens, SortImports] - redundantBraces.generalExpressions = false - redundantBraces.maxLines = 1 -} diff --git a/AverageScorePerItem.scala.html b/AverageScorePerItem.scala.html new file mode 100644 index 0000000..ba15ce7 --- /dev/null +++ b/AverageScorePerItem.scala.html @@ -0,0 +1,2035 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.algebird.Semigroup +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object AverageScorePerItem { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .groupBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Sum both per key with an implicit Semigroup[(Double, Long)]

+
.sum 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + .toTypedPipe + } + 
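The (sum, count) pair works because Algebird derives a Semigroup for tuples from the Semigroups of the element types, combining pairs element-wise. A minimal local sketch of that behavior, with made-up values:

import com.twitter.algebird.Semigroup

val sg = implicitly[Semigroup[(Double, Long)]]
// Two partial results: (sum of scores, number of scores)
val combined = sg.plus((7.0, 2L), (3.0, 1L)) // (10.0, 3L)
// Final average
val avg = combined._1 / combined._2 // ~3.33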
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue + input + .groupBy(_.user
+
+ +
+

Map values into Double

+
.mapValues(_.score
+
+ +
+

Aggregate average per key

+
.aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .keyBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Sum both per key with an implicit Semigroup[(Double, Long)]

+
.sumByKey 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + } + 
+
+ +
+
+

Spark

+

Summon an Algebird Semigroup[(Double, Long)] via an implicit argument

+
def spark(input: RDD[Rating])(implicit sg: Semigroup[(Double, Long)]): RDD[(String, Double)] = { + input + .keyBy(_.user
+
+ +
+

Map into (sum, count)

+
.mapValues(x => (x.score, 1L)) 
+
+ +
+

Reduce both per key with plus = (T, T) => T where T is (Double, Long)

+
.reduceByKey(sg.plus) // plus: (T, T) => T where T is (Double, Long) 
+
+ +
+

Map (sum, count) into average

+
.mapValues(p => p._1 / p._2) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.AveragedValue + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .mapValues(_.score
+
+ +
+

Map values into Double

+
.algebird 
+
+ +
+

Aggregate average per key

+
.aggregateByKey(AveragedValue.aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/Count.scala.html b/Count.scala.html new file mode 100644 index 0000000..f726ff1 --- /dev/null +++ b/Count.scala.html @@ -0,0 +1,1989 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object Count { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { + import com.twitter.algebird.Aggregator.size + input + .aggregate(size) + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Long] = + input.count
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { + import com.twitter.algebird.Aggregator.size + input + .aggregate(size) + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): Long = { + input 
+
+ +
+

count is an action and collects data back to the driver node

+
.count + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Long = { + import com.twitter.algebird.Aggregator.size + import com.twitter.algebird.spark._ + input.algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(size) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/CountDistinctItems.scala.html b/CountDistinctItems.scala.html new file mode 100644 index 0000000..b796e5f --- /dev/null +++ b/CountDistinctItems.scala.html @@ -0,0 +1,2003 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.google.common.base.Charsets +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object CountDistinctItems { + 
+
+ +
+
+

Scalding Exact Approach

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .map(_.item
+
+ +
+

Remove duplicates, requires a shuffle

+
.distinct + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+
+

Scalding Approximate Approach

+
def scaldingApproxWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Double] = { + import com.twitter.algebird.HyperLogLogAggregator + val aggregator = HyperLogLogAggregator.sizeAggregator(bits = 12) + input 
+
+ +
+

HyperLogLog expects bytes as input

+
.map(_.item.getBytes(Charsets.UTF_8)) 
+
+ +
+

Aggregate globally into a Double

+
.aggregate(aggregator) + .toTypedPipe + } + 
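The bits parameter trades memory for accuracy: HyperLogLog's standard error is roughly 1.04 / sqrt(2^bits), so bits = 12 (4096 registers) gives about 1.6% error. A quick sanity check of that estimate in plain Scala:

// Approximate standard error for a given register-count exponent
def hllError(bits: Int): Double = 1.04 / math.sqrt(1 << bits)

hllError(12) // ~0.016, i.e. about 1.6%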
+
+ +
+
+

Scio Exact Approach

+
def scio(input: SCollection[Rating]): SCollection[Long] = { + input + .map(_.item) + .distinct + .count + } + 
+
+ +
+
+

Scio Approximate Approach

+
def scioApprox(input: SCollection[Rating]): SCollection[Long] = { + input + .map(_.item) + .countApproxDistinct() + } + 
+
+ +
+
+

Spark Exact Approach

+
def spark(input: RDD[Rating]): Long = { + input + .map(_.item) + .distinct() + .count() + } + 
+
+ +
+
+

Spark Approximate Approach

+
def sparkApprox(input: RDD[Rating]): Long = { + input + .map(_.item) + .countApproxDistinct() + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/CountUsers.scala.html b/CountUsers.scala.html new file mode 100644 index 0000000..2f18be6 --- /dev/null +++ b/CountUsers.scala.html @@ -0,0 +1,1997 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object CountUsers { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { + input + .filter(_.user == "Smith") + .map(_ => 1L
+
+ +
+

Sum with an implicit Semigroup[Long]

+
.sum + .toTypedPipe + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { + import com.twitter.algebird.Aggregator.count + input 
+
+ +
+

Aggregate globally into a single Long

+
.aggregate(count(_.user == "Smith")) + .toTypedPipe + } + + def scio(input: SCollection[Rating]): SCollection[Long] = { + input + .filter(_.user == "Smith") + .count + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { + import com.twitter.algebird.Aggregator.count + input 
+
+ +
+

Aggregate globally into a single Long

+
.aggregate(count((_: Rating).user == "Smith")) + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): Long = { + input + .filter(_.user == "Smith"
+
+ +
+

count is an action and collects data back to the driver node

+
.count() + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Long = { + import com.twitter.algebird.Aggregator.count + import com.twitter.algebird.spark._ + input.algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(count(_.user == "Smith")) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/DistinctItems.scala.html b/DistinctItems.scala.html new file mode 100644 index 0000000..ca2a999 --- /dev/null +++ b/DistinctItems.scala.html @@ -0,0 +1,1945 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object DistinctItems { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[String] = { + input + .map(_.item) + .distinct + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[String] = { + input + .map(_.item) + .distinct + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): RDD[String] = { + input + .map(_.item) + .distinct() + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/FieldStatistics.scala.html b/FieldStatistics.scala.html new file mode 100644 index 0000000..a28b29b --- /dev/null +++ b/FieldStatistics.scala.html @@ -0,0 +1,2027 @@ + + + + + + + +
+

Input is a collection of case classes

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.coders.Coder +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object FieldStatistics { + + case class User(age: Int, income: Double, score: Double) + case class Stats(max: Double, min: Double, mean: Double, stddev: Double) + case class UserStats(age: Stats, income: Stats, score: Stats) + + import com.twitter.algebird._ + implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments] + 
+
+ +
+
+

Algebird Aggregator

+
def aggregator = { 
+
+ +
+

Create 3 Aggregators on the age field with different logic

+
 
+
+ +
+

The first 2 are of type Aggregator[User, _, Int], which means they take User as input and +generate Int as output. The last one is of type Aggregator[User, _, Moments], +where Moments includes count, mean, standard deviation, etc. The input User is prepared +with a User => Int function, _.age.

+
val maxAgeOp = Aggregator.max[Int].composePrepare[User](_.age) + val minAgeOp = Aggregator.min[Int].composePrepare[User](_.age) + val momentsAgeOp = Moments.aggregator.composePrepare[User](_.age) + 
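composePrepare simply pre-applies a function to each input element before aggregation, turning an Aggregator[Int, _, Int] into an Aggregator[User, _, Int]. A minimal standalone sketch (sample users made up):

val maxAge = Aggregator.max[Int].composePrepare[User](_.age)
maxAge(Seq(User(25, 50000.0, 0.8), User(35, 70000.0, 0.6))) // 35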
+
+ +
+

Create 3 Aggregators on the income field with different logic

+
val maxIncomeOp = Aggregator.max[Double].composePrepare[User](_.income) + val minIncomeOp = Aggregator.min[Double].composePrepare[User](_.income) + val momentsIncomeOp = Moments.aggregator.composePrepare[User](_.income) + 
+
+ +
+

Create 3 Aggregators on the score field with different logic

+
val maxScoreOp = Aggregator.max[Double].composePrepare[User](_.score) + val minScoreOp = Aggregator.min[Double].composePrepare[User](_.score) + val momentsScoreOp = Moments.aggregator.composePrepare[User](_.score) + 
+
+ +
+

Apply the 9 Aggregators on the same input, present the result Tuple9 as UserStats.

+
MultiAggregator( + maxAgeOp, + minAgeOp, + momentsAgeOp, + maxIncomeOp, + minIncomeOp, + momentsIncomeOp, + maxScoreOp, + minScoreOp, + momentsScoreOp + ).andThenPresent { t => + val (maxAge, minAge, mAge, maxIncome, minIncome, mIncome, maxScore, minScore, mScore) = t + UserStats( + age = Stats(maxAge, minAge, mAge.mean, mAge.stddev), + income = Stats(maxIncome, minIncome, mIncome.mean, mIncome.stddev), + score = Stats(maxScore, minScore, mScore.mean, mScore.stddev) + ) + } + } + 
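An Aggregator is also a function from a collection of inputs to a single output, so the composite above can be sanity-checked on local data before running a pipeline. A sketch with made-up users:

val users = Seq(
  User(age = 25, income = 50000.0, score = 0.8),
  User(age = 35, income = 70000.0, score = 0.6)
)
// Apply the Aggregator[User, _, UserStats] directly to a local collection
val stats: UserStats = aggregator(users)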
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[User]): TypedPipe[UserStats] = + input.aggregate(aggregator) + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[User]): SCollection[UserStats] = + input.aggregate(aggregator) + 
+
+ +
+
+

Spark

+
def spark(input: RDD[User]): UserStats = { 
+
+ +
+

Compute each field separately, potentially inefficient if the input is not cached

+
val s1 = input.map(_.age).stats() + val s2 = input.map(_.income).stats() + val s3 = input.map(_.score).stats() + UserStats( + age = Stats(s1.max, s1.min, s1.mean, s1.stdev), + income = Stats(s2.max, s2.min, s2.mean, s2.stdev), + score = Stats(s3.max, s3.min, s3.mean, s3.stdev) + ) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkAlgebird(input: RDD[User]): UserStats = { + import com.twitter.algebird.spark._ + input.algebird.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/InvertedIndex.scala.html b/InvertedIndex.scala.html new file mode 100644 index 0000000..d5318ea --- /dev/null +++ b/InvertedIndex.scala.html @@ -0,0 +1,1980 @@ + + + + + + + +
+

Build an inverted index from a corpus of text documents

+
 
+
+ +
+

Input is a collection of (id, text)

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object InvertedIndex { + + case class Document(id: Int, text: String) + case class Posting(word: String, ids: Seq[Int]) + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Document]): TypedPipe[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group and convert document IDs per key to List[Int]

+
.group + .toList + .map(Posting.tupled) + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Document]): SCollection[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group document IDs per key into Iterable[Int]

+
.groupByKey + .map(kv => Posting(kv._1, kv._2.toSeq)) + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Document]): RDD[Posting] = { + input 
+
+ +
+

Split text and output (word, document ID)

+
.flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) 
+
+ +
+

Group document IDs per key into Iterable[Int]

+
.groupByKey() + .map(kv => Posting(kv._1, kv._2.toSeq)) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/JoinLogAndMetadata.scala.html b/JoinLogAndMetadata.scala.html new file mode 100644 index 0000000..2ab9877 --- /dev/null +++ b/JoinLogAndMetadata.scala.html @@ -0,0 +1,2132 @@ + + + + + + + +
+

Compute the average age of users who listened to a track by joining log events with user metadata.

+

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.{LogEvent, UserMeta} +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object JoinLogAndMetadata { + 
+
+ +
+
+

Scalding Naive Approach

+
def scaldingNaive( + left: TypedPipe[LogEvent], + right: TypedPipe[UserMeta] + ): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue + left + .groupBy(_.user
+
+ +
+

Join as (user, (LogEvent, UserMeta))

+
.join(right.groupBy(_.user)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Map into (track, age)

+
.map { case (logEvent, userMeta) => + (logEvent.track, userMeta.age.toDouble) + } + .group 
+
+ +
+

Aggregate average age per track

+
.aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+
+ +
+
+

Scalding with Hash Join

+

hashJoin replicates the smaller RHS to all mappers on the LHS

+
def scaldingHashJoin( + left: TypedPipe[LogEvent], + right: TypedPipe[UserMeta] + ): TypedPipe[(String, Double)] = { + import com.twitter.algebird.AveragedValue
+
+ +
+

Map out fields to avoid shuffling large objects

+
val lhs = left.map(e => (e.user, e.track)) 
+
+ +
+

Force to disk to avoid repeating the same computation on each mapper on the LHS

+
val rhs = right.map(u => (u.user, u.age.toDouble)).forceToDisk + + lhs + .hashJoin(rhs) + .values + .group + .aggregate(AveragedValue.aggregator) + .toTypedPipe + } + 
+
+ +
+
+

Scio Naive Approach

+
def scioNaive( + left: SCollection[LogEvent], + right: SCollection[UserMeta] + ): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 
+
+ +
+

Join as (user, (track, age))

+
lhs + .join(rhs
+
+ +
+

Drop user key to make track the new key in (track, age)

+
.values 
+
+ +
+

Aggregate average age per track

+
.aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+
+

Scio with Side Input

+

Side input makes RHS available on all workers

+
def scioSideInput( + left: SCollection[LogEvent], + right: SCollection[UserMeta] + ): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue
+
+ +
+

Convert RHS to a side input of Map[String, Double]

+
val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput
+
+ +
+

Replicate RHS to each worker

+
left + .withSideInputs(rhs
+
+ +
+

Access side input via the context

+
.map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } 
+
+ +
+

Convert back to regular SCollection

+
.toSCollection + .aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+
+

Scio with Hash Join

+

hashJoin is a shortcut for the side input approach

+
def scioHashJoin( + left: SCollection[LogEvent], + right: SCollection[UserMeta] + ): SCollection[(String, Double)] = { + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) + lhs + .hashJoin(rhs) + .values + .aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+
+

Spark Naive Approach

+
def sparkNaive(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + import com.twitter.algebird.AveragedValue + val lhs = left.map(e => (e.user, e.track)) + val rhs = right.map(u => (u.user, u.age.toDouble)) 
+
+ +
+

Join as (user, (track, age))

+
lhs + .join(rhs
+
+ +
+

Drop user key to make track the new key in (track, age)

+
.values + .algebird 
+
+ +
+

Aggregate average age per track

+
.aggregateByKey(AveragedValue.aggregator) + } + 
+
+ +
+
+

Spark with Broadcast Variable

+
def sparkBroadcast(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + import com.twitter.algebird.AveragedValue
+
+ +
+

Retrieve SparkContext for creating broadcast variable

+
val sc = left.context
+
+ +
+

Collect RHS to driver memory and broadcast back to workers

+
val map = right.map(u => (u.user, u.age.toDouble)).collectAsMap() + val b = sc.broadcast(map) + + left 
+
+ +
+

In-memory lookup on each worker

+
.map(e => (e.track, b.value.getOrElse(e.user, 0.0))) + .algebird + .aggregateByKey(AveragedValue.aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/JoinLogs.scala.html b/JoinLogs.scala.html new file mode 100644 index 0000000..0c801c4 --- /dev/null +++ b/JoinLogs.scala.html @@ -0,0 +1,2035 @@ + + + + + + + +
+

Given two log datasets of play-track and save-track events, compute the tracks that a user saved +after playing them within a session.

+
 
+
+ +
+

Inputs are collections of (user, item, timestamp).

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.LogEvent +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object JoinLogs { + + val gapDuration = 3600000
+
+ +
+

Detect whether a pair of (event type, LogEvent) tuples matches a play-then-save sequence

+
def detectPlaySaveSequence(pair: Seq[(String, LogEvent)]): Option[String] = { + val Seq(first, second) = pair + if ( + first._1 == "play" && second._1 == "save" && first._2.track == second._2.track && + second._2.timestamp - first._2.timestamp <= gapDuration + ) { + Some(first._2.track) + } else { + None + } + } + 
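A quick local check of the matcher on hand-built, illustrative events, assuming the record shape LogEvent(user, track, timestamp):

val play = LogEvent("alice", "track1", 1000L)
val save = LogEvent("alice", "track1", 2000L)
detectPlaySaveSequence(Seq(("play", play), ("save", save))) // Some("track1")
detectPlaySaveSequence(Seq(("save", save), ("play", play))) // None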
+
+ +
+
+

Scalding

+
def scalding( + playEvents: TypedPipe[LogEvent], + saveEvents: TypedPipe[LogEvent] + ): TypedPipe[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))).group + val saves = saveEvents.map(e => (e.user, ("save", e))).group + + plays + .cogroup(saves) { (user, p, s) => 
+
+ +
+

Iterables of play and save events for the user

+
(p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio( + playEvents: SCollection[LogEvent], + saveEvents: SCollection[LogEvent] + ): SCollection[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) + + plays + .cogroup(saves
+
+ +
+

Iterables of play and save events for the user

+
.flatMapValues { case (p, s) => + (p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + } + 
+
+ +
+
+

Spark

+
def spark(playEvents: RDD[LogEvent], saveEvents: RDD[LogEvent]): RDD[(String, String)] = { 
+
+ +
+

Map inputs to key-values and add event type information

+
val plays = playEvents.map(e => (e.user, ("play", e))) + val saves = saveEvents.map(e => (e.user, ("save", e))) + + plays + .cogroup(saves) + .flatMapValues { case (p, s) => 
+
+ +
+

Iterables of play and save events for the user

+
(p ++ s).toList + .sortBy(_._2.timestamp
+
+ +
+

Neighboring pairs

+
.sliding(2) + .flatMap(detectPlaySaveSequence) + } + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index d645695..0000000 --- a/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. 
Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/MaxItemPerUser.scala.html b/MaxItemPerUser.scala.html new file mode 100644 index 0000000..253db7e --- /dev/null +++ b/MaxItemPerUser.scala.html @@ -0,0 +1,2041 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object MaxItemPerUser { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Reduce items per key by picking the one with the higher score from each pair of inputs

+
.reduce((x, y) => if (x.score > y.score) x else y) + .values + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + input + .groupBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score

+
.aggregate(maxBy(_.score)) + .values + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute the top item per key as an Iterable[Rating]

+
.topByKey(1)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + input + .keyBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(maxBy { x: Rating => x.score }) + .values + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input + .keyBy(_.user
+
+ +
+

Reduce items per key by picking the one with the higher score from each pair of inputs

+
.reduceByKey((x, y) => if (x.score > y.score) x else y) + .values + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.maxBy + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(maxBy { x: Rating => x.score }) + .values + } + 
+
+ +
+
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue

+
.topByKey(1)(Ordering.by(_.score)) + .flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/MinItemPerUser.scala.html b/MinItemPerUser.scala.html new file mode 100644 index 0000000..48d6a59 --- /dev/null +++ b/MinItemPerUser.scala.html @@ -0,0 +1,2041 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object MinItemPerUser { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Reduce items per key by picking the one with the lower score from each pair of inputs

+
.reduce((x, y) => if (x.score < y.score) x else y) + .values + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { + import com.twitter.algebird.Aggregator.minBy + input + .groupBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score

+
.aggregate(minBy(_.score)) + .values + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute the top item per key as an Iterable[Rating] with a reverse comparator

+
.topByKey(1)(Ordering.by(-_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { + import com.twitter.algebird.Aggregator.minBy + input + .keyBy(_.user
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(minBy { x: Rating => x.score }) + .values + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input + .keyBy(_.user
+
+ +
+

Reduce items per key by picking the one with the lower score from each pair of inputs

+
.reduceByKey((x, y) => if (x.score < y.score) x else y) + .values + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.minBy + import com.twitter.algebird.spark._ + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a single Rating based on Double value via _.score. Explicit +type due to type inference limitation.

+
.aggregateByKey(minBy { x: Rating => x.score }) + .values + } + 
+
+ +
+
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue and a reverse comparator

+
.topByKey(1)(Ordering.by(-_.score)) + .flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/NOTICE b/NOTICE deleted file mode 100644 index a86b457..0000000 --- a/NOTICE +++ /dev/null @@ -1,2 +0,0 @@ -Big Data Rosetta Code -Copyright 2016 Spotify AB diff --git a/README.md b/README.md deleted file mode 100644 index 856a380..0000000 --- a/README.md +++ /dev/null @@ -1,29 +0,0 @@ -big-data-rosetta-code -===================== - -[![Build Status](https://img.shields.io/github/actions/workflow/status/spotify/big-data-rosetta-code/.github/workflows/ci.yml)](https://github.com/spotify/big-data-rosetta-code/actions?query=workflow%3ACI) -[![GitHub license](https://img.shields.io/github/license/spotify/big-data-rosetta-code.svg)](./LICENSE) - -Code snippets for solving common big data problems on various platforms. Inspired by [Rosetta Code](http://rosettacode.org/). - -For examples rended side by side with comments see: - -http://spotify.github.io/big-data-rosetta-code/ - -Currently the following are covered: - -- [Scalding](https://github.com/twitter/scalding) -- [Scio](https://github.com/spotify/scio) -- [Spark](https://github.com/apache/spark) - -# Topics - -- [src/main/scala/com/spotify/bdrc/scala](./src/main/scala/com/spotify/bdrc/scala) Scala tricks for data processing -- [src/main/scala/com/spotify/bdrc/pipeline](./src/main/scala/com/spotify/bdrc/pipeline) Data pipeline snippets -- [src/test/scala/com/spotify/bdrc/testing](./src/test/scala/com/spotify/bdrc/testing) Examples for pipeline testing - -# License - -Copyright 2016 Spotify AB. - -Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 diff --git a/Sessions.scala.html b/Sessions.scala.html new file mode 100644 index 0000000..58fde72 --- /dev/null +++ b/Sessions.scala.html @@ -0,0 +1,2048 @@ + + + + + + + +
+

Input is a collection of log events

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.LogEvent +import com.spotify.scio.extra.Iterators._ +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD +import org.joda.time.Instant + +import scala.collection.mutable + +object Sessions { + + val gapDuration = 3600000 + + case class Session(user: String, duration: Long, numItems: Int) + 
+
+ +
+

Wrapper for Iterator[LogEvent] that groups items into sessions

+
class SessionIterator(self: Iterator[LogEvent]) extends Iterator[Seq[LogEvent]] { 
+
+ +
+

BufferedIterator allows peeking ahead

+
private val bi = self.buffered + override def hasNext: Boolean = bi.hasNext + override def next(): Seq[LogEvent] = { + val buf = mutable.Buffer(bi.next()) + var last = buf.head.timestamp
+
+ +
+

Consume subsequent events until a gap is detected

+
while (bi.hasNext && bi.head.timestamp - last < gapDuration) { + val n = bi.next() + buf.append(n) + last = n.timestamp + } + buf + } + } + 
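A local sketch of the iterator, assuming the record shape LogEvent(user, track, timestamp) and events already sorted by timestamp; the third event arrives more than gapDuration after the second, so it starts a new session:

val events = Seq(
  LogEvent("alice", "track1", 0L),
  LogEvent("alice", "track2", 1000L),
  LogEvent("alice", "track3", gapDuration + 2000L)
)
new SessionIterator(events.iterator).toList
// List(Seq(<track1 event>, <track2 event>), Seq(<track3 event>))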
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[LogEvent]): TypedPipe[Session] = { + input + .groupBy(_.user
+
+ +
+

sortBy uses Hadoop secondary sort to sort values within each key during the shuffle

+
.sortBy(_.timestamp
+
+ +
+

Iterate over values lazily and group items into sessions

+
.mapValueStream(new SessionIterator(_)) + .toTypedPipe 
+
+ +
+

Map over each (user, session items)

+
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[LogEvent]): SCollection[Session] = { + input 
+
+ +
+

Values in groupBy are sorted by timestamp

+
.timestampBy(e => new Instant(e.timestamp)) 
+
+ +
+

No secondary sort in Scio, shuffle all items

+
.groupBy(_.user) + .flatMapValues { + _.iterator 
+
+ +
+

Generic version of SessionIterator from scio-extra

+
.timeSeries(_.timestamp) + .session(gapDuration) + } 
+
+ +
+

Map over each (user, session items)

+
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[LogEvent]): RDD[Session] = { + input 
+
+ +
+

No secondary sort in Spark, shuffle all items

+
.groupBy(_.user) + .flatMapValues { + _ 
+
+ +
+

Order of values after shuffle is not guaranteed

+
.toList + .sortBy(_.timestamp) + .iterator 
+
+ +
+

Generic version of SessionIterator from scio-extra

+
.timeSeries(_.timestamp) + .session(gapDuration) + } 
+
+ +
+

Map over each (user, session items)

+
.map { case (user, items) => + Session(user, items.last.timestamp - items.head.timestamp, items.size) + } + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/Statistics.scala.html b/Statistics.scala.html new file mode 100644 index 0000000..5ce9cd9 --- /dev/null +++ b/Statistics.scala.html @@ -0,0 +1,2000 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.coders.Coder +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object Statistics { + + case class Stats(max: Double, min: Double, sum: Double, count: Long, mean: Double, stddev: Double) + + import com.twitter.algebird._ + implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments] + 
+
+ +
+
+

Algebird Aggregator

+
def aggregator = { 
+
+ +
+

Create 4 Aggregators with different logic

+
 
+
+ +
+

The first 3 are of type Aggregator[Rating, _, Double], which means they take Rating as +input and generate Double as output. The last one is of type +Aggregator[Rating, _, Moments], where Moments includes count, mean, standard deviation, +etc. The input Rating is prepared with a Rating => Double function, _.score.

+
val maxOp = Aggregator.max[Double].composePrepare[Rating](_.score) + val minOp = Aggregator.min[Double].composePrepare[Rating](_.score) + val sumOp = Aggregator.prepareMonoid[Rating, Double](_.score) + val momentsOp = Moments.aggregator.composePrepare[Rating](_.score) + 
+
+ +
+

Apply the 4 Aggregators on the same input, present the result Tuple4 of +(Double, Double, Double, Moments) as Stats

+
MultiAggregator(maxOp, minOp, sumOp, momentsOp) + .andThenPresent { case (max, min, sum, moments) => + Stats(max, min, sum, moments.count, moments.mean, moments.stddev) + } + } + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Stats] = + input.aggregate(aggregator) + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Stats] = { + input + .map(_.score) + .stats + .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioAlgebird(input: SCollection[Rating]): SCollection[Stats] = + input.aggregate(aggregator) + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): Stats = { + val s = input.map(_.score).stats() + Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkAlgebird(input: RDD[Rating]): Stats = { + import com.twitter.algebird.spark._ + input.algebird.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/SumPerItem.scala.html b/SumPerItem.scala.html new file mode 100644 index 0000000..07ba283 --- /dev/null +++ b/SumPerItem.scala.html @@ -0,0 +1,2005 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object SumPerItem { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .groupBy(_.item) + .mapValues(_.score
+
+ +
+

Sum per key with an implicit Semigroup[Double]

+
.sum + .toTypedPipe + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.Aggregator.prepareMonoid + input + .groupBy(_.item
+
+ +
+

Aggregate per key with an aggregator that converts Rating to Double via +_.score before the reduce

+
.aggregate(prepareMonoid(_.score)) + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .sumByKey + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): RDD[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .reduceByKey(_ + _) + } + 
+
+ +
+
+

Spark with Algebird Semigroup

+
def sparkWithAlgebird1(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.spark._ + input + .map(x => (x.item, x.score)) + .algebird 
+
+ +
+

Sum per key with an implicit Semigroup[Double]

+
.sumByKey + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird2(input: RDD[Rating]): RDD[(String, Double)] = { + import com.twitter.algebird.Aggregator.prepareMonoid + import com.twitter.algebird.spark._ + input + .keyBy(_.item) + .algebird 
+
+ +
+

Aggregate per key with an aggregator that converts Rating to Double via +_.score before the reduce. Explicit type due to a type inference limitation.

+
.aggregateByKey(prepareMonoid { x: Rating => x.score }) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/TopItems.scala.html b/TopItems.scala.html new file mode 100644 index 0000000..131e142 --- /dev/null +++ b/TopItems.scala.html @@ -0,0 +1,2075 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object TopItems { + + val topK = 100
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + input + .map(x => (x.item, x.score)) + .group 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sum 
+
+ +
+

Group all elements with a single key Unit

+
.groupAll 
+
+ +
+

Take top K with a priority queue

+
.sortedReverseTake(topK)(Ordering.by(_._2)) 
+
+ +
+

Drop Unit key

+
.values 
+
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
+
+ +
+
+

Scalding with Algebird Aggregator

+
def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) + .group 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sum + .toTypedPipe 
+
+ +
+

Aggregate globally into a single Seq[(String, Double)]

+
.aggregate(aggregator
+
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
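The same aggregator works on local collections, which makes the top-K logic easy to unit test. A sketch with made-up pairs:

import com.twitter.algebird.Aggregator.sortedReverseTake

val top2 = sortedReverseTake[(String, Double)](2)(Ordering.by(_._2))
top2(Seq(("a", 1.0), ("b", 3.0), ("c", 2.0)))
// Seq(("b", 3.0), ("c", 2.0)), the two highest-scoring pairs in descending order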
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sumByKey 
+
+ +
+

Compute top K as an Iterable[(String, Double)]

+
.top(topK)(Ordering.by(_._2)) 
+
+ +
+

Flatten result Iterable[(String, Double)]

+
.flatten + } + 
+
+ +
+
+

Scio with Algebird Aggregator

+
def scioWithAlgebird(input: SCollection[Rating]): SCollection[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with an implicit Semigroup[Double]

+
.sumByKey 
+
+ +
+

Aggregate globally into a single Seq[(String, Double)]

+
.aggregate(aggregator
+
+ +
+

Flatten result Seq[(String, Double)]

+
.flatten + } + 
+
+ +
+
+

Spark

+
def spark(input: RDD[Rating]): Seq[(String, Double)] = { + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with addition

+
.reduceByKey(_ + _) 
+
+ +
+

top is an action and collects data back to the driver node

+
.top(topK)(Ordering.by(_._2)) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): Seq[(String, Double)] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + import com.twitter.algebird.spark._ + val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2)) + input + .map(x => (x.item, x.score)) 
+
+ +
+

Sum values with addition

+
.reduceByKey(_ + _) + .algebird 
+
+ +
+

aggregate is an action and collects data back to the driver node

+
.aggregate(aggregator) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/TopItemsPerUser.scala.html b/TopItemsPerUser.scala.html new file mode 100644 index 0000000..85e2c5b --- /dev/null +++ b/TopItemsPerUser.scala.html @@ -0,0 +1,2033 @@ + + + + + + + +
+

Input is a collection of (user, item, score)

+
package com.spotify.bdrc.pipeline + +import com.spotify.bdrc.util.Records.Rating +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object TopItemsPerUser { + + val topK = 100
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { + input + .groupBy(_.user
+
+ +
+

Take top K per group with a priority queue

+
.sortedReverseTake(topK)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Seq[Rating]

+
.flatten + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[Rating]): SCollection[Rating] = { + input + .keyBy(_.user
+
+ +
+

Compute top K per key

+
.topByKey(topK)(Ordering.by(_.score)) 
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Flatten result Iterable[Rating]

+
.flatten + } + 
+
+ +
+
+

Spark Naive Approach

+
def spark(input: RDD[Rating]): RDD[Rating] = { + input 
+
+ +
+

groupBy shuffles all data, inefficient

+
.groupBy(_.user
+
+ +
+

Drop user key

+
.values 
+
+ +
+

Convert grouped values to a List[Rating] and sort on a single node, inefficient

+
.flatMap(_.toList.sortBy(-_.score).take(topK)) + } + 
+
+ +
+
+

Spark with Algebird Aggregator

+
def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { + import com.twitter.algebird.Aggregator.sortedReverseTake + import com.twitter.algebird.spark._ + val aggregator = sortedReverseTake[Rating](topK)(Ordering.by(_.score)) + input + .keyBy(_.user) + .algebird 
+
+ +
+

Aggregate per key into a Seq[Rating]

+
.aggregateByKey(aggregator
+
+ +
+

Flatten result Seq[Rating]

+
.flatMap(_._2) + } + 
+
+ +
+
+

Spark with MLLib

+
def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { + import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ + input + .keyBy(_.user
+
+ +
+

From spark-mllib, compute top K per key with a priority queue

+
.topByKey(topK)(Ordering.by(_.score)) 
+
+ +
+

Flatten result Seq[Rating]

+
.flatMap(_._2) + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/WordCount.scala.html b/WordCount.scala.html new file mode 100644 index 0000000..c1ab905 --- /dev/null +++ b/WordCount.scala.html @@ -0,0 +1,1980 @@ + + + + + + + +
+

+
package com.spotify.bdrc.pipeline + +import com.spotify.scio.values.SCollection +import com.twitter.scalding.TypedPipe +import org.apache.spark.rdd.RDD + +object WordCount { + 
+
+ +
+
+

Scalding

+
def scalding(input: TypedPipe[String]): TypedPipe[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

groupBy is lazy

+
.groupBy(identity
+
+ +
+

Operations like size after groupBy can be lifted into the map phase

+
.size + .toTypedPipe + } + 
+
+ +
+
+

Scio

+
def scio(input: SCollection[String]): SCollection[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) + .countByValue + } + 
+
+ +
+
+

Spark Transformation

+
def sparkTransformation(input: RDD[String]): RDD[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

There is no countByValue transformation in Spark, although it is equivalent to mapping +each element to an initial count of 1 and reducing with addition

+
.map((_, 1L)) 
+
+ +
+

reduceByKey can lift the function into the map phase

+
.reduceByKey(_ + _) + } + 
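The equivalence is easy to see on plain Scala collections: map each word to a count of 1, then sum the counts per key. A local sketch:

Seq("a", "b", "a")
  .map((_, 1L))
  .groupBy(_._1)
  .map { case (w, counts) => (w, counts.map(_._2).sum) }
// Map(a -> 2, b -> 1)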
+
+ +
+
+

Spark Action

+
def sparkAction(input: RDD[String]): Seq[(String, Long)] = { + input + .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty)) 
+
+ +
+

countByValue is an action and collects data back to the driver node

+
.countByValue() + .toSeq + } + +} 
+
+ + + + + + \ No newline at end of file diff --git a/build.sbt b/build.sbt deleted file mode 100644 index 16095cc..0000000 --- a/build.sbt +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -import com.github.sbt.git.SbtGit.GitKeys.gitRemoteRepo -import _root_.io.regadas.sbt.SbtSoccoKeys._ - -organization := "com.spotify" -name := "big-data-rosetta-code" -version := "0.1.0-SNAPSHOT" - -val scioVersion = "0.13.5" -val scaldingVersion = "0.17.4" -val sparkVersion = "3.5.0" -val algebirdVersion = "0.13.10" -val scalacheckVersion = "1.17.0" -val scalameterVersion = "0.19" -val scalatestVersion = "3.2.17" -val scalatestPlusVersion = "3.2.17.0" - -scalaVersion := "2.12.18" -scalacOptions ++= Seq( - "-target:jvm-1.8", - "-deprecation", - "-feature", - "-unchecked", - "-language:higherKinds" -) -javacOptions ++= Seq("-source", "1.8", "-target", "1.8") - -resolvers += "Cascading libraries" at "https://conjars.wensel.net/repo/" -libraryDependencies ++= Seq( - "com.spotify" %% "scio-core" % scioVersion, - "com.spotify" %% "scio-extra" % scioVersion, - "com.spotify" %% "scio-test" % scioVersion % "test", - "com.twitter" %% "scalding-core" % scaldingVersion, - "com.twitter" %% "algebird-spark" % algebirdVersion, - "org.apache.spark" %% "spark-core" % sparkVersion, - "org.apache.spark" %% "spark-mllib" % sparkVersion, - "org.scalatest" %% "scalatest" % scalatestVersion % "test", - "org.scalatestplus" %% "scalacheck-1-17" % scalatestPlusVersion % "test", - "org.scalacheck" %% "scalacheck" % scalacheckVersion % "test", - "com.storm-enroute" %% "scalameter" % scalameterVersion % "test" -) - -val scalaMeterFramework = new TestFramework( - "org.scalameter.ScalaMeterFramework" -) -testFrameworks += scalaMeterFramework -testOptions += Tests.Argument(scalaMeterFramework, "-silent") -Test / parallelExecution := false -logBuffered := false - -soccoOnCompile := true -soccoPackage := List( - "com.spotify.scio:http://spotify.github.io/scio/api", - "com.twitter.algebird:http://twitter.github.io/algebird/api", - "com.twitter.scalding:http://twitter.github.io/scalding/api", - "org.apache.spark:http://spark.apache.org/docs/latest/api/scala" -) -addCompilerPlugin(("io.regadas" %% "socco-ng" % "0.1.4").cross(CrossVersion.full)) -makeSite := makeSite.dependsOn(Compile / compile).value -gitRemoteRepo := "git@github.com:spotify/big-data-rosetta-code.git" - -enablePlugins(SbtSoccoPlugin) -enablePlugins(GhpagesPlugin) diff --git a/catalog-info.yaml b/catalog-info.yaml deleted file mode 100644 index 37286dd..0000000 --- a/catalog-info.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: backstage.io/v1alpha1 -kind: Resource -metadata: - name: big-data-rosetta-code -spec: - type: resource - owner: flatmap diff --git a/index.html b/index.html new file mode 100644 index 0000000..94e6dde --- /dev/null +++ b/index.html @@ -0,0 +1,38 @@ + + + + +Codestin Search App + + + +
+ + + + diff --git a/make-site.sh b/make-site.sh deleted file mode 100755 index 771514b..0000000 --- a/make-site.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -sbt makeSite ghpagesPushSite diff --git a/project/build.properties b/project/build.properties deleted file mode 100644 index e8a1e24..0000000 --- a/project/build.properties +++ /dev/null @@ -1 +0,0 @@ -sbt.version=1.9.7 diff --git a/project/plugins.sbt b/project/plugins.sbt deleted file mode 100644 index 572c523..0000000 --- a/project/plugins.sbt +++ /dev/null @@ -1,3 +0,0 @@ -addSbtPlugin("com.github.sbt" % "sbt-ghpages" % "0.8.0") -addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2") -addSbtPlugin("io.regadas" % "sbt-socco" % "0.1.5") diff --git a/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala b/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala deleted file mode 100644 index b0d8336..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/AverageScorePerItem.scala +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Compute Average Score per Item -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.algebird.Semigroup -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object AverageScorePerItem { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - input - .groupBy(_.user) - // Map into (sum, count) - .mapValues(x => (x.score, 1L)) - // Sum both per key with an implicit `Semigroup[(Double, Long)]` - .sum - // Map (sum, count) into average - .mapValues(p => p._1 / p._2) - .toTypedPipe - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - import com.twitter.algebird.AveragedValue - input - .groupBy(_.user) - // Map values into `Double` - .mapValues(_.score) - // Aggregate average per key - .aggregate(AveragedValue.aggregator) - .toTypedPipe - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { - input - .keyBy(_.user) - // Map into (sum, count) - .mapValues(x => (x.score, 1L)) - // Sum both per key with an implicit `Semigroup[(Double, Long)]` - .sumByKey - // Map (sum, count) into average - .mapValues(p => p._1 / p._2) - } - - // ## Spark - // Summon an Algebird `Semigroup[(Double, Long)]` with implicit argument - def spark(input: RDD[Rating])(implicit sg: Semigroup[(Double, Long)]): RDD[(String, Double)] = { - input - .keyBy(_.user) - // Map into (sum, count) - .mapValues(x => (x.score, 1L)) - // Reduce both per key with `plus = (T, T) => T` where `T` is `(Double, Long)` - .reduceByKey(sg.plus) // plus: (T, T) => T where T is (Double, Long) - // Map (sum, count) into average - .mapValues(p => p._1 / p._2) - } - - // ## Spark with Algebird `Aggregator` - def 
sparkWithAlgebird(input: RDD[Rating]): RDD[(String, Double)] = { - import com.twitter.algebird.AveragedValue - import com.twitter.algebird.spark._ - input - .keyBy(_.user) - // Map values into `Double` - .mapValues(_.score) - .algebird - // Aggregate average per key - .aggregateByKey(AveragedValue.aggregator) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/BloomFilterSetDifference.scala b/src/main/scala/com/spotify/bdrc/pipeline/BloomFilterSetDifference.scala deleted file mode 100644 index 1c269f3..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/BloomFilterSetDifference.scala +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2017 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.algebird._ -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -/** - * Filter LHS by removing items that exist in the RHS using a Bloom Filter. - * - * Inputs are collections of strings. - */ -object BloomFilterSetDifference { - - def scalding(lhs: TypedPipe[String], rhs: TypedPipe[String]): TypedPipe[String] = { - val width = BloomFilter.optimalWidth(1000, 0.01).get - val numHashes = BloomFilter.optimalNumHashes(1000, width) - lhs - .cross(rhs.aggregate(BloomFilterAggregator(numHashes, width))) - // Keep items that are not in the RHS Bloom filter - .filter { case (s, bf) => !bf.contains(s).isTrue } - .keys - } - - def scio(lhs: SCollection[String], rhs: SCollection[String]): SCollection[String] = { - val width = BloomFilter.optimalWidth(1000, 0.01).get - val numHashes = BloomFilter.optimalNumHashes(1000, width) - lhs - .cross(rhs.aggregate(BloomFilterAggregator[String](numHashes, width))) - // Keep items that are not in the RHS Bloom filter - .filter { case (s, bf) => !bf.contains(s).isTrue } - .keys - } - - def spark(lhs: RDD[String], rhs: RDD[String]): RDD[String] = { - import com.twitter.algebird.spark._ - val width = BloomFilter.optimalWidth(1000, 0.01).get - val numHashes = BloomFilter.optimalNumHashes(1000, width) - val bf = rhs.algebird.aggregate(BloomFilterAggregator(numHashes, width)) - // Keep items that are not in the RHS Bloom filter - lhs.filter(s => !bf.contains(s).isTrue) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/Count.scala b/src/main/scala/com/spotify/bdrc/pipeline/Count.scala deleted file mode 100644 index 1203a3c..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/Count.scala +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Count Number of Items -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object Count { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { - input - .map(_ => 1L) - // Sum with an implicit `Semigroup[Long]` - .sum - .toTypedPipe - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { - import com.twitter.algebird.Aggregator.size - input - .aggregate(size) - .toTypedPipe - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Long] = - input.count - - // ## Scio with Algebird `Aggregator` - def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { - import com.twitter.algebird.Aggregator.size - input - .aggregate(size) - } - - // ## Spark - def spark(input: RDD[Rating]): Long = { - input - // `count` is an action and collects data back to the driver node - .count - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird(input: RDD[Rating]): Long = { - import com.twitter.algebird.Aggregator.size - import com.twitter.algebird.spark._ - input.algebird - // `aggregate` is an action and collects data back to the driver node - .aggregate(size) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala b/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala deleted file mode 100644 index c4e9e6e..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/CountDistinctItems.scala +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Count Number of Distinct Items -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.google.common.base.Charsets -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object CountDistinctItems { - - // ## Scalding Exact Approach - def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { - input - .map(_.item) - // Remove duplicates, requires a shuffle - .distinct - .map(_ => 1L) - // Sum with an implicit `Semigroup[Long]` - .sum - .toTypedPipe - } - - // ## Scalding Approximate Approach - def scaldingApproxWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Double] = { - import com.twitter.algebird.HyperLogLogAggregator - val aggregator = HyperLogLogAggregator.sizeAggregator(bits = 12) - input - // `HyperLogLog` expects bytes input - .map(_.item.getBytes(Charsets.UTF_8)) - // Aggregate globally into a `Double` - .aggregate(aggregator) - .toTypedPipe - } - - // ## Scio Exact Approach - def scio(input: SCollection[Rating]): SCollection[Long] = { - input - .map(_.item) - .distinct - .count - } - - // ## Scio Approximate Approach - def scioApprox(input: SCollection[Rating]): SCollection[Long] = { - input - .map(_.item) - .countApproxDistinct() - } - - // ## Spark Exact Approach - def spark(input: RDD[Rating]): Long = { - input - .map(_.item) - .distinct() - .count() - } - - // ## Spark Approximate Approach - def sparkApprox(input: RDD[Rating]): Long = { - input - .map(_.item) - .countApproxDistinct() - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala b/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala deleted file mode 100644 index e7a1458..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/CountUsers.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Count the Number of Items of a Given User -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object CountUsers { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Long] = { - input - .filter(_.user == "Smith") - .map(_ => 1L) - // Sum with an implicit `Semigroup[Long]` - .sum - .toTypedPipe - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Long] = { - import com.twitter.algebird.Aggregator.count - input - // Aggregate globally into a single `Long` - .aggregate(count(_.user == "Smith")) - .toTypedPipe - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Long] = { - input - .filter(_.user == "Smith") - .count - } - - // ## Scio with Algebird `Aggregator` - def scioWithAlgebird(input: SCollection[Rating]): SCollection[Long] = { - import com.twitter.algebird.Aggregator.count - input - // Aggregate globally into a single `Long` - .aggregate(count((_: Rating).user == "Smith")) - } - - // ## Spark - def spark(input: RDD[Rating]): Long = { - input - .filter(_.user == "Smith") - // `count` is an action and collects data back to the driver node - .count() - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird(input: RDD[Rating]): Long = { - import com.twitter.algebird.Aggregator.count - import com.twitter.algebird.spark._ - input.algebird - // `aggregate` is an action and collects data back to the driver node - .aggregate(count(_.user == "Smith")) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala b/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala deleted file mode 100644 index 295fe4b..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/DistinctItems.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute Collection of Distinct Items -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object DistinctItems { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[String] = { - input - .map(_.item) - .distinct - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[String] = { - input - .map(_.item) - .distinct - } - - // ## Spark - def spark(input: RDD[Rating]): RDD[String] = { - input - .map(_.item) - .distinct() - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala b/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala deleted file mode 100644 index 5a3a08e..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/FieldStatistics.scala +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Compute Basic Descriptive Statistics for Each Field -// Input is a collection of case classes -package com.spotify.bdrc.pipeline - -import com.spotify.scio.coders.Coder -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object FieldStatistics { - - case class User(age: Int, income: Double, score: Double) - case class Stats(max: Double, min: Double, mean: Double, stddev: Double) - case class UserStats(age: Stats, income: Stats, score: Stats) - - import com.twitter.algebird._ - implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments] - - // ## Algebird `Aggregator` - def aggregator = { - // Create 3 `Aggregator`s on `age` field with different logic - - // The first 2 are of type `Aggregator[User, _, Int]` which means it takes `User` as input and - // generates `Int` as output. The last one is of type `Aggregator[User, _, Moments]`, - // where `Moments` include count, mean, standard deviation, etc. The input `User` is prepared - // with a `User => Int` function `_.age`. - val maxAgeOp = Aggregator.max[Int].composePrepare[User](_.age) - val minAgeOp = Aggregator.min[Int].composePrepare[User](_.age) - val momentsAgeOp = Moments.aggregator.composePrepare[User](_.age) - - // Create 3 `Aggregator`s on `income` field with different logic - val maxIncomeOp = Aggregator.max[Double].composePrepare[User](_.income) - val minIncomeOp = Aggregator.min[Double].composePrepare[User](_.income) - val momentsIncomeOp = Moments.aggregator.composePrepare[User](_.income) - - // Create 3 `Aggregator`s on `score` field with different logic - val maxScoreOp = Aggregator.max[Double].composePrepare[User](_.score) - val minScoreOp = Aggregator.min[Double].composePrepare[User](_.score) - val momentsScoreOp = Moments.aggregator.composePrepare[User](_.score) - - // Apply 9 `Aggregator`s on the same input, present result tuple 9 as `UserStats`. 
- MultiAggregator( - maxAgeOp, - minAgeOp, - momentsAgeOp, - maxIncomeOp, - minIncomeOp, - momentsIncomeOp, - maxScoreOp, - minScoreOp, - momentsScoreOp - ).andThenPresent { t => - val (maxAge, minAge, mAge, maxIncome, minIncome, mIncome, maxScore, minScore, mScore) = t - UserStats( - age = Stats(maxAge, minAge, mAge.mean, mAge.stddev), - income = Stats(maxIncome, minIncome, mIncome.mean, mIncome.stddev), - score = Stats(maxScore, minScore, mScore.mean, mScore.stddev) - ) - } - } - - // ## Scalding - def scalding(input: TypedPipe[User]): TypedPipe[UserStats] = - input.aggregate(aggregator) - - // ## Scio - def scio(input: SCollection[User]): SCollection[UserStats] = - input.aggregate(aggregator) - - // ## Spark - def spark(input: RDD[User]): UserStats = { - // Compute each field separately, potentially inefficient if input is not cached - val s1 = input.map(_.age).stats() - val s2 = input.map(_.income).stats() - val s3 = input.map(_.score).stats() - UserStats( - age = Stats(s1.max, s1.min, s1.mean, s1.stdev), - income = Stats(s2.max, s2.min, s2.mean, s2.stdev), - score = Stats(s3.max, s3.min, s3.mean, s3.stdev) - ) - } - - // ## Spark with Algebird `Aggregator` - def sparkAlgebird(input: RDD[User]): UserStats = { - import com.twitter.algebird.spark._ - input.algebird.aggregate(aggregator) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/FindMedian.scala b/src/main/scala/com/spotify/bdrc/pipeline/FindMedian.scala deleted file mode 100644 index e8f2b74..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/FindMedian.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -/** - * Compute the median of a collection of numbers. - */ -object FindMedian { - - // Computing the exact median is very expensive as it requires sorting and counting elements. - // QTree is a compact data structure for approximate quantile and range queries. 
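FindMedian leaves a "TODO: exact version" at the end of the file. A hedged sketch of what that exact approach could look like in Spark, shown here ahead of the QTree-based approximate implementations that follow; the helper name `exactMedian` is illustrative and it assumes a non-empty input:

  import org.apache.spark.rdd.RDD

  // Exact median via a total sort plus rank lookups. The full shuffle and the
  // extra lookup jobs are exactly the cost the comment above warns about.
  def exactMedian(input: RDD[Long]): Double = {
    val n = input.count() // assumes n > 0
    val byRank = input.sortBy(identity).zipWithIndex().map(_.swap) // (rank, value)
    if (n % 2 == 1) byRank.lookup(n / 2).head.toDouble
    else (byRank.lookup(n / 2 - 1).head + byRank.lookup(n / 2).head) / 2.0
  }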
- - def scalding(input: TypedPipe[Long]): TypedPipe[(Double, Double)] = { - import com.twitter.algebird._ - input - .aggregate(QTreeAggregator[Long](0.5)) - .map(i => (i.lower.lower, i.upper.upper)) - } - - def scio(input: SCollection[Long]): SCollection[(Double, Double)] = { - import com.twitter.algebird._ - input - .aggregate(QTreeAggregator[Long](0.5)) - .map(i => (i.lower.lower, i.upper.upper)) - } - - def spark(input: RDD[Long]): (Double, Double) = { - import com.twitter.algebird._ - import com.twitter.algebird.spark._ - val i = input.algebird.aggregate(QTreeAggregator[Long](0.5)) - (i.lower.lower, i.upper.upper) - } - - // TODO: exact version - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala b/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala deleted file mode 100644 index 55f9edd..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/InvertedIndex.scala +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2017 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Build Inverted Index -// Build inverted index from a corpus of text documents - -// Input is a collection of (id, text) -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object InvertedIndex { - - case class Document(id: Int, text: String) - case class Posting(word: String, ids: Seq[Int]) - - // ## Scalding - def scalding(input: TypedPipe[Document]): TypedPipe[Posting] = { - input - // Split text and output (word, document ID) - .flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) - // Group and convert document IDs per key to `List[Int]` - .group - .toList - .map(Posting.tupled) - } - - // ## Scio - def scio(input: SCollection[Document]): SCollection[Posting] = { - input - // Split text and output (word, document ID) - .flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) - // Group document IDs per key into `Iterable[Int]` - .groupByKey - .map(kv => Posting(kv._1, kv._2.toSeq)) - } - - // ## Spark - def spark(input: RDD[Document]): RDD[Posting] = { - input - // Split text and output (word, document ID) - .flatMap(d => d.text.split("[^a-zA-Z']+").map(w => (w, d.id))) - // Group document IDs per key into `Iterable[Int]` - .groupByKey() - .map(kv => Posting(kv._1, kv._2.toSeq)) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala b/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala deleted file mode 100644 index 0ac595b..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/JoinLogAndMetadata.scala +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Join Log and Metadata Datasets -// Compute average age of users who listened to a track by joining log event and user metadata. -// -// - LHS input is a large collection of (user, track, timestamp). -// - RHS input is a small collection of (user, age). -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.{LogEvent, UserMeta} -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object JoinLogAndMetadata { - - // ## Scalding Naive Approach - def scaldingNaive( - left: TypedPipe[LogEvent], - right: TypedPipe[UserMeta] - ): TypedPipe[(String, Double)] = { - import com.twitter.algebird.AveragedValue - left - .groupBy(_.user) - // Join as (user, (LogEvent, UserMeta)) - .join(right.groupBy(_.user)) - // Drop user key - .values - // Map into (track, age) - .map { case (logEvent, userMeta) => - (logEvent.track, userMeta.age.toDouble) - } - .group - // Aggregate average age per track - .aggregate(AveragedValue.aggregator) - .toTypedPipe - } - - // ## Scalding with Hash Join - // `hashJoin` replicates the smaller RHS to all mappers on the LHS - def scaldingHashJoin( - left: TypedPipe[LogEvent], - right: TypedPipe[UserMeta] - ): TypedPipe[(String, Double)] = { - import com.twitter.algebird.AveragedValue - - // Map out fields to avoid shuffling large objects - val lhs = left.map(e => (e.user, e.track)) - // Force to disk to avoid repeating the same computation on each mapper on the LHS - val rhs = right.map(u => (u.user, u.age.toDouble)).forceToDisk - - lhs - .hashJoin(rhs) - .values - .group - .aggregate(AveragedValue.aggregator) - .toTypedPipe - } - - // ## Scio Naive Approach - def scioNaive( - left: SCollection[LogEvent], - right: SCollection[UserMeta] - ): SCollection[(String, Double)] = { - import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) - // Join as (user, (track, age)) - lhs - .join(rhs) - // Drop user key to make track the new key in (track, age) - .values - // Aggregate average age per track - .aggregateByKey(AveragedValue.aggregator) - } - - // ## Scio with Side Input - // Side input makes RHS available on all workers - def scioSideInput( - left: SCollection[LogEvent], - right: SCollection[UserMeta] - ): SCollection[(String, Double)] = { - import com.twitter.algebird.AveragedValue - - // Convert RHS to a side input of `Map[String, Double]` - val rhs = right.map(u => (u.user, u.age.toDouble)).asMapSideInput - - // Replicate RHS to each worker - left - .withSideInputs(rhs) - // Access side input via the context - .map { case (e, sideContext) => (e.track, sideContext(rhs).getOrElse(e.user, 0.0)) } - // Convert back to regular SCollection - .toSCollection - .aggregateByKey(AveragedValue.aggregator) - } - - // ## Scio with Hash Join - // `hashJoin` is a shortcut to the side input approach - def scioHashJoin( - left: SCollection[LogEvent], - right: SCollection[UserMeta] - ): SCollection[(String, Double)] = { - import com.twitter.algebird.AveragedValue - val lhs = left.map(e => 
(e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) - lhs - .hashJoin(rhs) - .values - .aggregateByKey(AveragedValue.aggregator) - } - - // ## Spark Naive Approach - def sparkNaive(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { - import com.twitter.algebird.spark._ - import com.twitter.algebird.AveragedValue - val lhs = left.map(e => (e.user, e.track)) - val rhs = right.map(u => (u.user, u.age.toDouble)) - // Join as (user, (track, age)) - lhs - .join(rhs) - // Drop user key to make track the new key in (track, age) - .values - .algebird - // Aggregate average age per track - .aggregateByKey(AveragedValue.aggregator) - } - - // ## Spark with Broadcast Variable - def sparkBroadcast(left: RDD[LogEvent], right: RDD[UserMeta]): RDD[(String, Double)] = { - import com.twitter.algebird.spark._ - import com.twitter.algebird.AveragedValue - - // Retrieve `SparkContext` for creating broadcast variable - val sc = left.context - - // Collect RHS to driver memory and broadcast back to workers - val map = right.map(u => (u.user, u.age.toDouble)).collectAsMap() - val b = sc.broadcast(map) - - left - // In-memory lookup on each worker - .map(e => (e.track, b.value.getOrElse(e.user, 0.0))) - .algebird - .aggregateByKey(AveragedValue.aggregator) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala b/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala deleted file mode 100644 index 9273a4d..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/JoinLogs.scala +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Join Two Log Datasets and Compute Action Sequence -// Given two log datasets of play track and save track events, compute tracks that a user saved -// after playing in a session. - -// Inputs are collections of (user, item, timestamp). 
-package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.LogEvent -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object JoinLogs { - - val gapDuration = 3600000 - - // Detect if a pair of (event type, LogEvent) tuples matches a play-then-save sequence - def detectPlaySaveSequence(pair: Seq[(String, LogEvent)]): Option[String] = pair match { - // `sliding(2)` emits a single-element window when a user has only one event, so match on arity - case Seq(first, second) - if first._1 == "play" && second._1 == "save" && first._2.track == second._2.track && - second._2.timestamp - first._2.timestamp <= gapDuration => - Some(first._2.track) - case _ => None - } - - // ## Scalding - def scalding( - playEvents: TypedPipe[LogEvent], - saveEvents: TypedPipe[LogEvent] - ): TypedPipe[(String, String)] = { - // Map inputs to key-values and add event type information - val plays = playEvents.map(e => (e.user, ("play", e))).group - val saves = saveEvents.map(e => (e.user, ("save", e))).group - - plays - .cogroup(saves) { (user, p, s) => - // `Iterable`s of play and save events for the user - (p ++ s).toList - .sortBy(_._2.timestamp) - // Neighboring pairs - .sliding(2) - .flatMap(detectPlaySaveSequence) - } - .toTypedPipe - } - - // ## Scio - def scio( - playEvents: SCollection[LogEvent], - saveEvents: SCollection[LogEvent] - ): SCollection[(String, String)] = { - // Map inputs to key-values and add event type information - val plays = playEvents.map(e => (e.user, ("play", e))) - val saves = saveEvents.map(e => (e.user, ("save", e))) - - plays - .cogroup(saves) - // `Iterable`s of play and save events for the user - .flatMapValues { case (p, s) => - (p ++ s).toList - .sortBy(_._2.timestamp) - // Neighboring pairs - .sliding(2) - .flatMap(detectPlaySaveSequence) - } - } - - // ## Spark - def spark(playEvents: RDD[LogEvent], saveEvents: RDD[LogEvent]): RDD[(String, String)] = { - // Map inputs to key-values and add event type information - val plays = playEvents.map(e => (e.user, ("play", e))) - val saves = saveEvents.map(e => (e.user, ("save", e))) - - plays - .cogroup(saves) - .flatMapValues { case (p, s) => - // `Iterable`s of play and save events for the user - (p ++ s).toList - .sortBy(_._2.timestamp) - // Neighboring pairs - .sliding(2) - .flatMap(detectPlaySaveSequence) - } - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala b/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala deleted file mode 100644 index a3baaab..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/MaxItemPerUser.scala +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute One Item with Max Score per User -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object MaxItemPerUser { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { - input - .groupBy(_.user) - // Reduce items per key by picking the side with higher score for each pair of input - .reduce((x, y) => if (x.score > y.score) x else y) - .values - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { - import com.twitter.algebird.Aggregator.maxBy - input - .groupBy(_.user) - // Aggregate per key into a single `Rating` based on `Double` value via `_.score` - .aggregate(maxBy(_.score)) - .values - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Rating] = { - input - .keyBy(_.user) - // Compute top one item per key as an `Iterable[Rating]` - .topByKey(1)(Ordering.by(_.score)) - // Drop user key - .values - // Flatten result `Iterable[Rating]` - .flatten - } - - // ## Scio with Algebird `Aggregator` - def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { - import com.twitter.algebird.Aggregator.maxBy - input - .keyBy(_.user) - // Aggregate per key into a single `Rating` based on `Double` value via `_.score`. Explicit - // type due to type inference limitation. - .aggregateByKey(maxBy { x: Rating => x.score }) - .values - } - - // ## Spark - def spark(input: RDD[Rating]): RDD[Rating] = { - input - .keyBy(_.user) - // Reduce items per key by picking the side with higher score for each pair of input - .reduceByKey((x, y) => if (x.score > y.score) x else y) - .values - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { - import com.twitter.algebird.Aggregator.maxBy - import com.twitter.algebird.spark._ - input - .keyBy(_.user) - .algebird - // Aggregate per key into a single `Rating` based on `Double` value via `_.score`. Explicit - // type due to type inference limitation. - .aggregateByKey(maxBy { x: Rating => x.score }) - .values - } - - // ## Spark with MLLib - def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { - import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ - input - .keyBy(_.user) - // From `spark-mllib`, compute top K per key with a priority queue - .topByKey(1)(Ordering.by(_.score)) - .flatMap(_._2) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala b/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala deleted file mode 100644 index c2f323a..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/MinItemPerUser.scala +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute One Item with Min Score per User -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object MinItemPerUser { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = { - input - .groupBy(_.user) - // Reduce items per key by picking the side with lower score for each pair of input - .reduce((x, y) => if (x.score < y.score) x else y) - .values - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[Rating] = { - import com.twitter.algebird.Aggregator.minBy - input - .groupBy(_.user) - // Aggregate per key into a single `Rating` based on `Double` value via `_.score` - .aggregate(minBy(_.score)) - .values - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Rating] = { - input - .keyBy(_.user) - // Compute top one item per key as an `Iterable[Rating]` with a reverse comparator - .topByKey(1)(Ordering.by(-_.score)) - // Drop user key - .values - // Flatten result `Iterable[Rating]` - .flatten - } - - // ## Scio with Algebird `Aggregator` - def scioWithAlgebird(input: SCollection[Rating]): SCollection[Rating] = { - import com.twitter.algebird.Aggregator.minBy - input - .keyBy(_.user) - // Aggregate per key into a single `Rating` based on `Double` value via `_.score`. Explicit - // type due to type inference limitation. - .aggregateByKey(minBy { x: Rating => x.score }) - .values - } - - // ## Spark - def spark(input: RDD[Rating]): RDD[Rating] = { - input - .keyBy(_.user) - // Reduce items per key by picking the side with lower score for each pair of input - .reduceByKey((x, y) => if (x.score < y.score) x else y) - .values - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = { - import com.twitter.algebird.Aggregator.minBy - import com.twitter.algebird.spark._ - input - .keyBy(_.user) - .algebird - // Aggregate per key into a single `Rating` based on `Double` value via `_.score`. Explicit - // type due to type inference limitation. - .aggregateByKey(minBy { x: Rating => x.score }) - .values - } - - // ## Spark with MLLib - def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = { - import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ - input - .keyBy(_.user) - // From `spark-mllib`, compute top K per key with a priority queue and a reverse comparator - .topByKey(1)(Ordering.by(-_.score)) - .flatMap(_._2) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/PageRank.scala b/src/main/scala/com/spotify/bdrc/pipeline/PageRank.scala deleted file mode 100644 index d675460..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/PageRank.scala +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -/** - * Classic PageRank. - * - * Input is a collection of (source URL, destination URL). - */ -object PageRank { - - val iterations = 10 - val dampingFactor = 0.85 - - def scalding(input: TypedPipe[(String, String)]): TypedPipe[(String, Double)] = { - val links = input.group.toList // (src URL, list of dst URL) - var ranks = input.keys.distinct.map((_, 1.0)) // (src URL, 1.0) - - for (i <- 1 to iterations) { - val contribs = links - .join(ranks) - .toTypedPipe - .values - // re-distribute rank of src URL among collection of dst URLs - .flatMap { case (urls, rank) => - val size = urls.size - urls.map((_, rank / size)) - } - ranks = contribs.group.sum - .mapValues((1 - dampingFactor) + dampingFactor * _) - .toTypedPipe - } - - ranks - } - - def scio(input: SCollection[(String, String)]): SCollection[(String, Double)] = { - val links = input.groupByKey - var ranks = links.mapValues(_ => 1.0) - - for (i <- 1 to iterations) { - val contribs = links - .join(ranks) - .values - .flatMap { case (urls, rank) => - val size = urls.size - urls.map((_, rank / size)) - } - ranks = contribs.sumByKey - .mapValues((1 - dampingFactor) + dampingFactor * _) - } - - ranks - } - - def spark(input: RDD[(String, String)]): RDD[(String, Double)] = { - val links = input - .groupByKey() // (src URL, iterable of dst URL) - .cache() // links is reused in every iteration - var ranks = links.mapValues(_ => 1.0) // (src URL, 1.0) - - for (i <- 1 to iterations) { - val contribs = links - .join(ranks) - .values - // re-distribute rank of src URL among collection of dst URLs - .flatMap { case (urls, rank) => - val size = urls.size - urls.map((_, rank / size)) - } - ranks = contribs - .reduceByKey(_ + _) - .mapValues((1 - dampingFactor) + dampingFactor * _) - } - - ranks - } -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala b/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala deleted file mode 100644 index 4d89beb..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/Sessions.scala +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute Session Duration and Number of Items from Log Data -// Input is a collection of log events -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.LogEvent -import com.spotify.scio.extra.Iterators._ -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD -import org.joda.time.Instant - -import scala.collection.mutable - -object Sessions { - - val gapDuration = 3600000 - - case class Session(user: String, duration: Long, numItems: Int) - - // Wrapper for `Iterator[LogEvent]` that groups items into sessions - class SessionIterator(self: Iterator[LogEvent]) extends Iterator[Seq[LogEvent]] { - // `BufferedIterator` allows peek ahead - private val bi = self.buffered - override def hasNext: Boolean = bi.hasNext - override def next(): Seq[LogEvent] = { - val buf = mutable.Buffer(bi.next()) - var last = buf.head.timestamp - - // Consume subsequent events until a gap is detected - while (bi.hasNext && bi.head.timestamp - last < gapDuration) { - val n = bi.next() - buf.append(n) - last = n.timestamp - } - buf - } - } - - // ## Scalding - def scalding(input: TypedPipe[LogEvent]): TypedPipe[Session] = { - input - .groupBy(_.user) - // `sortBy` uses Hadoop secondary sort to sort keys during shuffle - .sortBy(_.timestamp) - // Iterate over values lazily and group items into sessions - .mapValueStream(new SessionIterator(_)) - .toTypedPipe - // Map over each (user, session items) - .map { case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) - } - } - - // ## Scio - def scio(input: SCollection[LogEvent]): SCollection[Session] = { - input - // Values in `groupBy` are sorted by timestamp - .timestampBy(e => new Instant(e.timestamp)) - // No secondary sort in Scio, shuffle all items - .groupBy(_.user) - .flatMapValues { - _.iterator - // Generic version of `SessionIterator` from `scio-extra` - .timeSeries(_.timestamp) - .session(gapDuration) - } - // Map over each (user, session items) - .map { case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) - } - } - - // ## Spark - def spark(input: RDD[LogEvent]): RDD[Session] = { - input - // No secondary sort in Spark, shuffle all items - .groupBy(_.user) - .flatMapValues { - _ - // Order of values after shuffle is not guaranteed - .toList - .sortBy(_.timestamp) - .iterator - // Generic version of `SessionIterator` from `scio-extra` - .timeSeries(_.timestamp) - .session(gapDuration) - } - // Map over each (user, session items) - .map { case (user, items) => - Session(user, items.last.timestamp - items.head.timestamp, items.size) - } - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala b/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala deleted file mode 100644 index 8fa15ff..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/Statistics.scala +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -// Example: Compute Basic Descriptive Statistics -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.coders.Coder -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object Statistics { - - case class Stats(max: Double, min: Double, sum: Double, count: Long, mean: Double, stddev: Double) - - import com.twitter.algebird._ - implicit val momentsCoder: Coder[Moments] = Coder.kryo[Moments] - - // ## Algebird `Aggregator` - def aggregator = { - // Create 4 `Aggregator`s with different logic - - // The first 3 are of type `Aggregator[Rating, _, Double]` which means it takes `Rating` as - // input and generates `Double` as output. The last one is of type - // `Aggregator[Rating, _, Moments]`, where `Moments` include count, mean, standard deviation, - // etc. The input `Rating` is prepared with a `Rating => Double` function `_.score`. - val maxOp = Aggregator.max[Double].composePrepare[Rating](_.score) - val minOp = Aggregator.min[Double].composePrepare[Rating](_.score) - val sumOp = Aggregator.prepareMonoid[Rating, Double](_.score) - val momentsOp = Moments.aggregator.composePrepare[Rating](_.score) - - // Apply 4 `Aggregator`s on the same input, present result tuple 4 of - // `(Double, Double, Double, Moments)` as `Stats` - MultiAggregator(maxOp, minOp, sumOp, momentsOp) - .andThenPresent { case (max, min, sum, moments) => - Stats(max, min, sum, moments.count, moments.mean, moments.stddev) - } - } - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[Stats] = - input.aggregate(aggregator) - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[Stats] = { - input - .map(_.score) - .stats - .map(s => Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev)) - } - - // ## Scio with Algebird `Aggregator` - def scioAlgebird(input: SCollection[Rating]): SCollection[Stats] = - input.aggregate(aggregator) - - // ## Spark - def spark(input: RDD[Rating]): Stats = { - val s = input.map(_.score).stats() - Stats(s.max, s.min, s.sum, s.count, s.mean, s.stdev) - } - - // ## Spark with Algebird `Aggregator` - def sparkAlgebird(input: RDD[Rating]): Stats = { - import com.twitter.algebird.spark._ - input.algebird.aggregate(aggregator) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala b/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala deleted file mode 100644 index 14bf627..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/SumPerItem.scala +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -// Example: Compute the Sum of Scores per Item -// Input is a collection of (user, item, score) -package com.spotify.bdrc.pipeline - -import com.spotify.bdrc.util.Records.Rating -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -object SumPerItem { - - // ## Scalding - def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - input - .groupBy(_.item) - .mapValues(_.score) - // Sum per key with an implicit `Semigroup[Double]` - .sum - .toTypedPipe - } - - // ## Scalding with Algebird `Aggregator` - def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = { - import com.twitter.algebird.Aggregator.prepareMonoid - input - .groupBy(_.item) - // Aggregate per key with an aggregator that converts `Rating` to `Double` via - // `_.score` before reduce - .aggregate(prepareMonoid(_.score)) - .toTypedPipe - } - - // ## Scio - def scio(input: SCollection[Rating]): SCollection[(String, Double)] = { - input - .map(x => (x.item, x.score)) - .sumByKey - } - - // ## Spark - def spark(input: RDD[Rating]): RDD[(String, Double)] = { - input - .map(x => (x.item, x.score)) - .reduceByKey(_ + _) - } - - // ## Spark with Algebird `Semigroup` - def sparkWithAlgebird1(input: RDD[Rating]): RDD[(String, Double)] = { - import com.twitter.algebird.spark._ - input - .map(x => (x.item, x.score)) - .algebird - // Sum per key with an implicit `Semigroup[Double]` - .sumByKey - } - - // ## Spark with Algebird `Aggregator` - def sparkWithAlgebird2(input: RDD[Rating]): RDD[(String, Double)] = { - import com.twitter.algebird.Aggregator.prepareMonoid - import com.twitter.algebird.spark._ - input - .keyBy(_.item) - .algebird - // Aggregate per key with an aggregator that converts `Rating` to `Double` via - // `_.score` before reduce. Explicit type due to type inference limitation. - .aggregateByKey(prepareMonoid { x: Rating => x.score }) - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/TfIdf.scala b/src/main/scala/com/spotify/bdrc/pipeline/TfIdf.scala deleted file mode 100644 index 86e2c06..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/TfIdf.scala +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package com.spotify.bdrc.pipeline - -import com.spotify.scio.values.SCollection -import com.twitter.scalding.TypedPipe -import org.apache.spark.rdd.RDD - -/** - * Compute TF-IDF for a set of documents. - * - * Input is a Seq of (doc, text). 
- */ -object TfIdf { - - case class Score(term: String, doc: String, score: Double) - - def scalding(input: Seq[(String, TypedPipe[String])]): TypedPipe[Score] = { - val numDocs = input.size - - val docToTerms = input - .map { case (doc, pipe) => - pipe - .flatMap(_.split("\\W+").filter(_.nonEmpty)) - .map(t => (doc, t.toLowerCase)) - } - // union input collections - .reduce(_ ++ _) // (d, t) - - val docToTermAndFreq = docToTerms - .groupBy(identity) - .size - .toTypedPipe - .map { case ((d, t), tf) => (d, (t, tf)) } - - val termToDfN = docToTerms.distinct.values - .groupBy(identity) - .size // (t, df) - .mapValues(_.toDouble / numDocs) // (t, df/N) - - docToTerms.keys - .groupBy(identity) - .size // (d, |d|) - .join(docToTermAndFreq) - .toTypedPipe - .map { case (d, (dLen, (t, tf))) => (t, (d, tf.toDouble / dLen)) } // (t, (d, tf/|d|)) - .join(termToDfN) - .toTypedPipe - .map { case (t, ((d, tfd), dfN)) => Score(t, d, tfd * math.log(1 / dfN)) } - } - - def scio(input: Seq[(String, SCollection[String])]): SCollection[Score] = { - val numDocs = input.size - - val docToTerms = input - .map { case (doc, pipe) => - pipe - .flatMap(_.split("\\W+").filter(_.nonEmpty)) - .map(t => (doc, t.toLowerCase)) - } - // union input collections - .reduce(_ ++ _) // (d, t) - - val docToTermAndCFreq = docToTerms - // equivalent to .countByValue, written out as map + reduceByKey - .map((_, 1L)) - .reduceByKey(_ + _) - .map { case ((d, t), tf) => (d, (t, tf)) } - - val termToDfN = docToTerms.distinct.values - // equivalent to .countByValue, written out as map + reduceByKey - .map((_, 1L)) - .reduceByKey(_ + _) // (t, df) - .mapValues(_.toDouble / numDocs) // (t, df/N) - - docToTerms.keys - // equivalent to .countByValue, written out as map + reduceByKey - .map((_, 1L)) - .reduceByKey(_ + _) // (d, |d|) - .join(docToTermAndCFreq) - .map { case (d, (dLen, (t, tf))) => (t, (d, tf.toDouble / dLen)) } // (t, (d, tf/|d|)) - .join(termToDfN) - .map { case (t, ((d, tfd), dfN)) => Score(t, d, tfd * math.log(1 / dfN)) } - } - - /** Spark implementation using transformations to keep computation distributed. */ - def sparkTransformations(input: Seq[(String, RDD[String])]): RDD[Score] = { - val numDocs = input.size - - val docToTerms = input - .map { case (doc, pipe) => - pipe - .flatMap(_.split("\\W+").filter(_.nonEmpty)) - .map(t => (doc, t.toLowerCase)) - } - // union input collections - .reduce(_ ++ _) // (d, t) - .cache() // docToTerms is reused 3 times - - val docToTermAndCFreq = docToTerms - // equivalent to .countByValue but returns RDD instead of Map - .map((_, 1L)) - .reduceByKey(_ + _) - .map { case ((d, t), tf) => (d, (t, tf)) } - - val termToDfN = docToTerms - .distinct() - .values - // equivalent to .countByValue but returns RDD instead of Map - .map((_, 1L)) - .reduceByKey(_ + _) // (t, df) - .mapValues(_.toDouble / numDocs) // (t, df/N) - - docToTerms.keys - // equivalent to .countByValue but returns RDD instead of Map - .map((_, 1L)) - .reduceByKey(_ + _) // (d, |d|) - .join(docToTermAndCFreq) - .map { case (d, (dLen, (t, tf))) => (t, (d, tf.toDouble / dLen)) } // (t, (d, tf/|d|)) - .join(termToDfN) - .map { case (t, ((d, tfd), dfN)) => Score(t, d, tfd * math.log(1 / dfN)) } - } - - /** Spark implementation using actions to compute some steps on the driver node. 
*/ - def sparkActions(input: Seq[(String, RDD[String])]): Seq[Score] = { - val numDocs = input.size - - val docToTerms = input - .map { case (doc, pipe) => - pipe - .flatMap(_.split("\\W+").filter(_.nonEmpty)) - .map(t => (doc, t.toLowerCase)) - } - .reduce(_ ++ _) // (d, t) - .cache() // docToTerms is reused 3 times - - val docToTermAndCFreq = docToTerms - .countByValue() - // performed on driver node, grouped so that each document keeps all of its (t, tf) pairs - .toSeq - .map { case ((d, t), tf) => (d, (t, tf)) } - .groupBy(_._1) - .mapValues(_.map(_._2)) // (d, Seq[(t, tf)]) - - val termToDfN = docToTerms - .distinct() - .values - .countByValue() // (t, df) - // performed on driver node - .mapValues(_.toDouble / numDocs) // (t, df/N) - - docToTerms.keys - .countByValue() // (d, |d|) - // performed on driver node - .toSeq - .flatMap { case (d, dLen) => - docToTermAndCFreq(d).map { case (t, tf) => - val tfd = tf.toDouble / dLen // tf/|d| - val dfN = termToDfN(t) - Score(t, d, tfd * math.log(1 / dfN)) - } - } - } - -} diff --git a/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala b/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala deleted file mode 100644 index ad52209..0000000 --- a/src/main/scala/com/spotify/bdrc/pipeline/TopItems.scala +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2016 Spotify AB. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */
-
-// Example: Compute Top K Items Globally
-// Input is a collection of (user, item, score)
-package com.spotify.bdrc.pipeline
-
-import com.spotify.bdrc.util.Records.Rating
-import com.spotify.scio.values.SCollection
-import com.twitter.scalding.TypedPipe
-import org.apache.spark.rdd.RDD
-
-object TopItems {
-
-  val topK = 100
-
-  // ## Scalding
-  def scalding(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = {
-    input
-      .map(x => (x.item, x.score))
-      .group
-      // Sum values with an implicit `Semigroup[Double]`
-      .sum
-      // Group all elements with a single key `Unit`
-      .groupAll
-      // Take top K with a priority queue
-      .sortedReverseTake(topK)(Ordering.by(_._2))
-      // Drop `Unit` key
-      .values
-      // Flatten result `Seq[(String, Double)]`
-      .flatten
-  }
-
-  // ## Scalding with Algebird `Aggregator`
-  def scaldingWithAlgebird(input: TypedPipe[Rating]): TypedPipe[(String, Double)] = {
-    import com.twitter.algebird.Aggregator.sortedReverseTake
-    val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2))
-    input
-      .map(x => (x.item, x.score))
-      .group
-      // Sum values with an implicit `Semigroup[Double]`
-      .sum
-      .toTypedPipe
-      // Aggregate globally into a single `Seq[(String, Double)]`
-      .aggregate(aggregator)
-      // Flatten result `Seq[(String, Double)]`
-      .flatten
-  }
-
-  // ## Scio
-  def scio(input: SCollection[Rating]): SCollection[(String, Double)] = {
-    input
-      .map(x => (x.item, x.score))
-      // Sum values with an implicit `Semigroup[Double]`
-      .sumByKey
-      // Compute top K as an `Iterable[(String, Double)]`
-      .top(topK)(Ordering.by(_._2))
-      // Flatten result `Iterable[(String, Double)]`
-      .flatten
-  }
-
-  // ## Scio with Algebird `Aggregator`
-  def scioWithAlgebird(input: SCollection[Rating]): SCollection[(String, Double)] = {
-    import com.twitter.algebird.Aggregator.sortedReverseTake
-    val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2))
-    input
-      .map(x => (x.item, x.score))
-      // Sum values with an implicit `Semigroup[Double]`
-      .sumByKey
-      // Aggregate globally into a single `Seq[(String, Double)]`
-      .aggregate(aggregator)
-      // Flatten result `Seq[(String, Double)]`
-      .flatten
-  }
-
-  // ## Spark
-  def spark(input: RDD[Rating]): Seq[(String, Double)] = {
-    input
-      .map(x => (x.item, x.score))
-      // Sum values with addition
-      .reduceByKey(_ + _)
-      // `top` is an action and collects data back to the driver node
-      .top(topK)(Ordering.by(_._2))
-  }
-
-  // ## Spark with Algebird `Aggregator`
-  def sparkWithAlgebird(input: RDD[Rating]): Seq[(String, Double)] = {
-    import com.twitter.algebird.Aggregator.sortedReverseTake
-    import com.twitter.algebird.spark._
-    val aggregator = sortedReverseTake[(String, Double)](topK)(Ordering.by(_._2))
-    input
-      .map(x => (x.item, x.score))
-      // Sum values with addition
-      .reduceByKey(_ + _)
-      .algebird
-      // `aggregate` is an action and collects data back to the driver node
-      .aggregate(aggregator)
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala b/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala
deleted file mode 100644
index b294873..0000000
--- a/src/main/scala/com/spotify/bdrc/pipeline/TopItemsPerUser.scala
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-// Example: Compute Top K Items Per User
-// Input is a collection of (user, item, score)
-package com.spotify.bdrc.pipeline
-
-import com.spotify.bdrc.util.Records.Rating
-import com.spotify.scio.values.SCollection
-import com.twitter.scalding.TypedPipe
-import org.apache.spark.rdd.RDD
-
-object TopItemsPerUser {
-
-  val topK = 100
-
-  // ## Scalding
-  def scalding(input: TypedPipe[Rating]): TypedPipe[Rating] = {
-    input
-      .groupBy(_.user)
-      // Take top K per group with a priority queue
-      .sortedReverseTake(topK)(Ordering.by(_.score))
-      // Drop user key
-      .values
-      // Flatten result `Seq[Rating]`
-      .flatten
-  }
-
-  // ## Scio
-  def scio(input: SCollection[Rating]): SCollection[Rating] = {
-    input
-      .keyBy(_.user)
-      // Compute top K per key
-      .topByKey(topK)(Ordering.by(_.score))
-      // Drop user key
-      .values
-      // Flatten result `Iterable[Rating]`
-      .flatten
-  }
-
-  // ## Spark Naive Approach
-  def spark(input: RDD[Rating]): RDD[Rating] = {
-    input
-      // `groupBy` shuffles all data, inefficient
-      .groupBy(_.user)
-      // Drop user key
-      .values
-      // Convert grouped values to a `List[Rating]` and sort on a single node, inefficient
-      .flatMap(_.toList.sortBy(-_.score).take(topK))
-  }
-
-  // ## Spark with Algebird `Aggregator`
-  def sparkWithAlgebird(input: RDD[Rating]): RDD[Rating] = {
-    import com.twitter.algebird.Aggregator.sortedReverseTake
-    import com.twitter.algebird.spark._
-    val aggregator = sortedReverseTake[Rating](topK)(Ordering.by(_.score))
-    input
-      .keyBy(_.user)
-      .algebird
-      // Aggregate per key into a `Seq[Rating]`
-      .aggregateByKey(aggregator)
-      // Flatten result `Seq[Rating]`
-      .flatMap(_._2)
-  }
-
-  // ## Spark with MLlib
-  def sparkWithMllib(input: RDD[Rating]): RDD[Rating] = {
-    import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
-    input
-      .keyBy(_.user)
-      // From `spark-mllib`, compute top K per key with a priority queue
-      .topByKey(topK)(Ordering.by(_.score))
-      // Flatten result `Seq[Rating]`
-      .flatMap(_._2)
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/pipeline/TotalAndDistinctCount.scala b/src/main/scala/com/spotify/bdrc/pipeline/TotalAndDistinctCount.scala
deleted file mode 100644
index 561b899..0000000
--- a/src/main/scala/com/spotify/bdrc/pipeline/TotalAndDistinctCount.scala
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.pipeline
-
-import com.spotify.scio.values.SCollection
-import com.twitter.scalding.TypedPipe
-import org.apache.spark.rdd.RDD
-
-/**
- * Compute number of total and distinct items.
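- * For example, the input Seq("a", "b", "a") yields (3L, 2L): three occurrences in total,
- * two distinct items.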
- *
- * Input is a collection of items, e.g. the item field of (user, item, score) events.
- */
-object TotalAndDistinctCount {
-
-  def aggregator = {
-    import com.twitter.algebird._
-    // Exact total count, approximate unique count
-    val totalCount = Aggregator.size
-    val uniqueCount = Aggregator.approximateUniqueCount[String]
-    MultiAggregator(totalCount, uniqueCount)
-  }
-
-  def scaldingExact(input: TypedPipe[String]): TypedPipe[(Long, Long)] = {
-    input
-      .map((_, 1L))
-      .group
-      .sum // (item, count per item)
-      .toTypedPipe
-      // map onto a single key `Unit` so that the second sum is global
-      .map(kv => ((), (kv._2, 1L)))
-      .group
-      .sum // ((), (total count, distinct count))
-      .values
-  }
-
-  def scaldingApproximate(input: TypedPipe[String]): TypedPipe[(Long, Long)] =
-    input.aggregate(aggregator)
-
-  def scioExact(input: SCollection[String]): SCollection[(Long, Long)] = {
-    input
-      .map((_, 1L))
-      .sumByKey // (item, count per item)
-      // map onto a single key `Unit` so that the second sum is global
-      .map(kv => ((), (kv._2, 1L)))
-      .sumByKey // ((), (total count, distinct count))
-      .values
-  }
-
-  def scioApproximate(input: SCollection[String]): SCollection[(Long, Long)] =
-    input.aggregate(aggregator)
-
-  def sparkAlgebird(input: RDD[String]): RDD[(Long, Long)] = {
-    import com.twitter.algebird.spark._
-    input
-      .map((_, 1L))
-      .algebird
-      .sumByKey[String, Long] // (item, count per item)
-      // map onto a single key `Unit` so that the second sum is global
-      .map(kv => ((), (kv._2, 1L)))
-      .algebird
-      .sumByKey[Unit, (Long, Long)] // ((), (total count, distinct count))
-      .values
-  }
-
-  def sparkInMemory(input: RDD[String]): (Long, Long) = {
-    input.cache()
-    (input.count(), input.distinct().count())
-  }
-
-  def sparkApproximate(input: RDD[String]): (Long, Long) = {
-    input.cache()
-    (input.count(), input.countApproxDistinct())
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala b/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala
deleted file mode 100644
index 4eeb3d5..0000000
--- a/src/main/scala/com/spotify/bdrc/pipeline/WordCount.scala
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-// Example: Classic Word Count
-package com.spotify.bdrc.pipeline
-
-import com.spotify.scio.values.SCollection
-import com.twitter.scalding.TypedPipe
-import org.apache.spark.rdd.RDD
-
-object WordCount {
-
-  // ## Scalding
-  def scalding(input: TypedPipe[String]): TypedPipe[(String, Long)] = {
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      // `groupBy` is lazy
-      .groupBy(identity)
-      // Operations like `size` after `groupBy` can be lifted into the map phase
-      .size
-      .toTypedPipe
-  }
-
-  // ## Scio
-  def scio(input: SCollection[String]): SCollection[(String, Long)] = {
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      .countByValue
-  }
-
-  // ## Spark Transformation
-  def sparkTransformation(input: RDD[String]): RDD[(String, Long)] = {
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      // There is no `countByValue` transformation in Spark, but it is equivalent to mapping
-      // each element to an initial count of `1` and reducing with addition
-      .map((_, 1L))
-      // `reduceByKey` can lift the function into the map phase
-      .reduceByKey(_ + _)
-  }
-
-  // ## Spark Action
-  def sparkAction(input: RDD[String]): Seq[(String, Long)] = {
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      // `countByValue` is an action and collects data back to the driver node
-      .countByValue()
-      .toSeq
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/scala/Collections.scala b/src/main/scala/com/spotify/bdrc/scala/Collections.scala
deleted file mode 100644
index 68b27d0..0000000
--- a/src/main/scala/com/spotify/bdrc/scala/Collections.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright 2017 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.scala
-
-/**
- * Examples for working with Scala collections.
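- *
- * Note: these examples assume a Scala 2.12 collection library, where mapValues returns a lazy
- * view and scala.collection.breakOut is available; both were deprecated or removed in 2.13.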
- */
-object Collections {
-
-  def mapValues: Unit = {
-    val m = Map("a" -> 1, "b" -> 2, "c" -> 3)
-
-    // Native approach, inefficient since .toList, .map, .toMap each creates a copy
-    m.toList.map(t => (t._1, t._2 + 1)).toMap
-
-    // Better, one copy
-    m.map(kv => (kv._1, kv._2 + 1))
-
-    // Lazy version, no copy
-    m.mapValues(_ + 1)
-  }
-
-  def mergeMaps: Unit = {
-    val m1 = Map("a" -> 1.0, "b" -> 2.0, "c" -> 3.0)
-    val m2 = Map("a" -> 1.5, "b" -> 2.5, "d" -> 3.5)
-
-    // Native approach, inefficient since it creates many copies
-    val i = m1.keySet intersect m2.keySet
-    val m = i.map(k => k -> (m1(k) + m2(k))) // sum values of common keys
-    (m1 -- i) ++ (m2 -- i) ++ m // inefficient, creates 2 more temporary maps
-    m1 ++ m2 ++ m // slightly better, values from RHS overwrite those from LHS
-
-    // Slightly better but still creates a temporary set
-    (m1.keySet ++ m2.keySet).map(k => k -> (m1.getOrElse(k, 0.0) + m2.getOrElse(k, 0.0)))
-
-    // Better but slightly cryptic
-    m1 ++ m2.map { case (k, v) => k -> (v + m1.getOrElse(k, 0.0)) }
-  }
-
-  def listToMap: Unit = {
-    val l = List(1, 2, 3, 4, 5)
-
-    // Native approach, creates a temporary copy
-    l.map(x => "key" + x -> x).toMap
-
-    // Slightly better, using a mutable builder
-    val b = Map.newBuilder[String, Int]
-    l.foreach(x => b += "key" + x -> x)
-    b.result()
-
-    // Use implicits to automatically build for the target collection type Map[String, Int]
-    val m: Map[String, Int] = l.map(x => "key" + x -> x)(scala.collection.breakOut)
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/scala/FilterMessyData.scala b/src/main/scala/com/spotify/bdrc/scala/FilterMessyData.scala
deleted file mode 100644
index 4769e71..0000000
--- a/src/main/scala/com/spotify/bdrc/scala/FilterMessyData.scala
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.scala
-
-import scala.util.Try
-
-/**
- * Filter out messy data that may cause computation to fail.
- *
- * Input is a collection of case classes with messy values.
- */
-object FilterMessyData {
-
-  case class MessyData(user: String, gender: String, scores: Array[Double], favorites: Set[String])
-
-  /** Dummy method that may fail for invalid records. */
-  def compute(x: MessyData): String = "dummy_result"
-
-  /** Naive approach that checks every field accessed. */
-  def naive(input: Seq[MessyData]): Seq[String] = {
-    input
-      .filter { x =>
-        x.user != null && x.gender != null &&
-        x.scores != null && x.scores.nonEmpty &&
-        x.favorites != null && x.favorites.nonEmpty
-      }
-      .map(compute) // may still fail for unexpected cases
-  }
-
-  /**
-   * Smart approach that throws any failed records away.
-   *
-   * Try.toOption returns Some if the computation succeeds or None if it fails.
-   * Option[U] is implicitly converted to the TraversableOnce[U] that flatMap expects.
-   *
-   * WARNING: THIS APPROACH IGNORES ANY EXCEPTION AND IS POTENTIALLY UNSAFE.
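-   *
-   * Try catches every non-fatal exception, so e.g. a NumberFormatException caused by a genuine
-   * bug in compute would be silently dropped along with the truly bad records.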
-   */
-  def withUnsafeFlatMap(input: Seq[MessyData]): Seq[String] =
-    input
-      .flatMap(x => Try(compute(x)).toOption)
-
-  /**
-   * Smart approach that throws any failed records away.
-   *
-   * Try/catch block returns a Seq of one item if compute succeeds and Nil if it fails.
-   * This approach is safer since you have control over what exceptions to expect.
-   */
-  def withSafeFlatMap(input: Seq[MessyData]): Seq[String] = {
-    input
-      .flatMap { x =>
-        try {
-          Seq(compute(x))
-        } catch {
-          case _: NullPointerException => Nil
-        }
-      }
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/scala/HandlingOptions.scala b/src/main/scala/com/spotify/bdrc/scala/HandlingOptions.scala
deleted file mode 100644
index 5387469..0000000
--- a/src/main/scala/com/spotify/bdrc/scala/HandlingOptions.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.scala
-
-/**
- * Handling data with multiple Option[T]s more gracefully.
- *
- * Input is a collection of case classes with nested Option[T].
- */
-object HandlingOptions {
-
-  case class Metadata(track: Option[Track], audio: Option[Audio])
-  case class Track(id: String, name: String, artist: Option[Artist])
-  case class Artist(id: String, name: String)
-  case class Audio(tempo: Int, key: String)
-
-  /** Naive approach that checks every field accessed is defined. */
-  def naive(input: Seq[Metadata]): Seq[(String, Int)] = {
-    input
-      .filter(m => m.track.isDefined && m.track.get.artist.isDefined && m.audio.isDefined)
-      .map { m =>
-        // Option[T].get is safe since we already checked with Option[T].isDefined
-        (m.track.get.artist.get.id, m.audio.get.tempo)
-      }
-  }
-
-  /**
-   * Smart approach that uses for comprehension.
-   *
-   * For-comprehension extracts values from Options and yields Some if all Options are defined.
-   * It yields None if any of the Options is None.
-   */
-  def withFlatMap(input: Seq[Metadata]): Seq[(String, Int)] = {
-    input.flatMap { md =>
-      for {
-        tr <- md.track // extract Track from Option[Track]
-        ar <- tr.artist // extract Artist from Option[Artist]
-        au <- md.audio // extract Audio from Option[Audio]
-      } yield (ar.id, au.tempo)
-    }
-  }
-
-  /** The for-comprehension above translates to nested flatMaps. */
-  def withNestedFlatMap(input: Seq[Metadata]): Seq[(String, Int)] = {
-    input.flatMap { md =>
-      md.track.flatMap { tr =>
-        tr.artist.flatMap(ar => md.audio.map(au => (ar.id, au.tempo)))
-      }
-    }
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/scala/JavaPrimitives.scala b/src/main/scala/com/spotify/bdrc/scala/JavaPrimitives.scala
deleted file mode 100644
index ff1be50..0000000
--- a/src/main/scala/com/spotify/bdrc/scala/JavaPrimitives.scala
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.scala
-
-/**
- * Examples for working with Java primitives.
- */
-object JavaPrimitives {
-
-  // java.lang.Double is a boxed type (object) while double in Java is a primitive type.
-  // scala.Double can be either boxed or primitive depending on the context, e.g. it's boxed when
-  // used as a type parameter in a generic class but primitive when used in an array or on the
-  // stack.
-  // Due to type system limitations, M[java.lang.Double] and M[scala.Double] are incompatible types
-  // but they can be cast safely back and forth since both are implemented as Java boxed types.
-  import java.lang.{Double => JDouble}
-  import java.util.{List => JList}
-
-  import scala.collection.JavaConverters._
-
-  /**
-   * `xs.asScala` returns `mutable.Buffer[JDouble]` where `Buffer` is a sub-type of `Seq` but
-   * `JDouble` is not the same type as `Double` (`scala.Double`). Casting is safe because `JDouble`
-   * and `Double` are equivalent when used as type parameters (boxed objects). It's also cheaper
-   * than `.map(_.toDouble)` which creates a copy of the `Buffer`.
-   */
-  def jDoubleListToSeq(xs: JList[JDouble]): Seq[Double] = xs.asScala.asInstanceOf[Seq[Double]]
-
-  /**
-   * Array[Double] is more efficient since it's implemented as a Java primitive array. Arrays are
-   * also mutable so it's cheaper to pre-allocate and mutate elements. A Java iterator and a while
-   * loop are faster than `xs.asScala.asInstanceOf[Seq[Double]].toArray`.
-   */
-  def jDoubleListToArray(xs: JList[JDouble]): Array[Double] = {
-    val a = new Array[Double](xs.size())
-    var i = 0
-    val iterator = xs.iterator()
-    while (iterator.hasNext) {
-      a(i) = iterator.next()
-      i += 1
-    }
-    a
-  }
-
-}
diff --git a/src/main/scala/com/spotify/bdrc/util/Records.scala b/src/main/scala/com/spotify/bdrc/util/Records.scala
deleted file mode 100644
index 070cf0e..0000000
--- a/src/main/scala/com/spotify/bdrc/util/Records.scala
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.util
-
-object Records {
-
-  case class LogEvent(user: String, track: String, timestamp: Long)
-  case class Rating(user: String, item: String, score: Double)
-  case class UserMeta(user: String, age: Int)
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/bench/ForYieldBenchmark.scala b/src/test/scala/com/spotify/bdrc/bench/ForYieldBenchmark.scala
deleted file mode 100644
index 4fce4e5..0000000
--- a/src/test/scala/com/spotify/bdrc/bench/ForYieldBenchmark.scala
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.bench
-
-import java.lang.{Iterable => JIterable}
-
-import com.google.common.collect.Lists
-import org.scalameter.api._
-import org.scalameter.picklers.noPickler._
-
-import scala.collection.JavaConverters._
-
-/** Micro-benchmark for for/yield pattern. */
-object ForYieldBenchmark extends Bench.LocalTime {
-
-  val lSizes = Gen.enumeration("lSize")(1, 10, 100, 1000)
-  val rSizes = Gen.enumeration("rSize")(1, 10, 100, 1000)
-
-  def jIterable(i: Int): JIterable[String] =
-    Lists.newArrayList((0 until i).map("v%05d".format(_)): _*).asInstanceOf[JIterable[String]]
-
-  val inputs = for {
-    l <- lSizes
-    r <- rSizes
-  } yield (jIterable(l), jIterable(r))
-
-  performance of "Join" in {
-    measure method "forIterable" in {
-      using(inputs) in { p =>
-        for {
-          a <- p._1.asScala
-          b <- p._2.asScala
-        } yield ("key", (a, b))
-      }
-    }
-
-    // Iterator version is lazy and more efficient
-    measure method "forIterator" in {
-      using(inputs) in { p =>
-        val r = for {
-          a <- p._1.asScala.iterator
-          b <- p._2.asScala.iterator
-        } yield ("key", (a, b))
-        r.toIterable
-      }
-    }
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T01EndToEndTest.scala b/src/test/scala/com/spotify/bdrc/testing/T01EndToEndTest.scala
deleted file mode 100644
index 4f9b83f..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T01EndToEndTest.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.spotify.scio._
-import com.spotify.scio.io.TextIO
-import com.spotify.scio.testing.PipelineSpec
-
-object WordCount1 {
-  def main(cmdlineArgs: Array[String]): Unit = {
-    val (sc, args) = ContextAndArgs(cmdlineArgs)
-    sc.textFile(args("input"))
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      .countByValue
-      .map(kv => kv._1 + ": " + kv._2)
-      .saveAsTextFile(args("output"))
-    sc.run()
-  }
-}
-
-/**
- * Test an entire pipeline end-to-end
- *
- * Pros:
- * - Complete test of the entire pipeline
- * - Covers argument parsing and I/O handling
- * - May also reveal serialization issues
- *
- * Cons:
- * - Hard to handcraft input and expected data
- * - Hard to cover edge cases for complex pipelines
- * - Can be slow in some frameworks
- *
- * Supported in: Scalding, Scio
- *
- * Recommendation:
- * This is a good approach to test small and simple pipelines since it offers the best code
- * coverage.
- * It can also be used for pipelines with complex argument parsing and I/O handling,
- * e.g. ones with dynamic I/O based on arguments.
- *
- * Very complex pipelines with lots of steps may be broken down into smaller logical blocks and
- * tested separately using the transform test approach.
- */
-class T01EndToEndTest extends PipelineSpec {
-
-  val input = Seq("a b c d e", "a b a b")
-  val expected = Seq("a: 3", "b: 3", "c: 1", "d: 1", "e: 1")
-
-  "WordCount1" should "work" in {
-    JobTest[com.spotify.bdrc.testing.WordCount1.type]
-      .args("--input=in.txt", "--output=out.txt")
-      .input(TextIO("in.txt"), input)
-      .output(TextIO("out.txt"))(output => output should containInAnyOrder(expected))
-      .run()
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T02TransformTest.scala b/src/test/scala/com/spotify/bdrc/testing/T02TransformTest.scala
deleted file mode 100644
index 9a95a9a..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T02TransformTest.scala
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.spotify.scio._
-import com.spotify.scio.testing._
-import com.spotify.scio.values.SCollection
-
-object WordCount2 {
-  def main(cmdlineArgs: Array[String]): Unit = {
-    val (sc, args) = ContextAndArgs(cmdlineArgs)
-    val input = sc.textFile(args("input"))
-    val wc = countWords(input)
-    val output = formatOutput(wc)
-    output.saveAsTextFile(args("output"))
-  }
-
-  def countWords(input: SCollection[String]): SCollection[(String, Long)] =
-    input
-      .flatMap(_.split("[^a-zA-Z']+").filter(_.nonEmpty))
-      .countByValue
-
-  def formatOutput(input: SCollection[(String, Long)]): SCollection[String] =
-    input
-      .map(kv => kv._1 + ": " + kv._2)
-}
-
-/**
- * Test pipeline transforms
- *
- * Pros:
- * - Break down complex pipelines into smaller reusable pieces
- * - Easier to handcraft input and expected data than end-to-end test
- *
- * Cons:
- * - Does not cover argument parsing and I/O handling
- * - May disrupt pipeline logic flow if overused
- *
- * Supported in: Scalding, Scio, Spark
- *
- * Recommendation:
- * Complex pipelines can be broken into logical blocks and tested using this approach. Individual
- * transforms should have clear roles in the pipeline, e.g. parsing input, formatting output,
- * aggregating data, training model, predicting labels, etc. It should also be easy to craft input
- * and expected data for these transforms and cover all code paths and edge cases.
- *
- * The level of granularity of each transform is also important. A transform should be small enough
- * for readability but big enough to avoid disruption to the main pipeline flow. Things to
- * consider are: number of inputs and outputs, group or join operations, etc.
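- *
- * WordCount2 above is a concrete case: countWords (aggregation) and formatOutput (output
- * formatting) are exactly such blocks, and each is fed small handcrafted datasets in the
- * tests below.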
- */
-class TransformTest extends PipelineSpec {
-
-  val input = Seq("a b c d e", "a b a b")
-  val expected = Seq("a: 3", "b: 3", "c: 1", "d: 1", "e: 1")
-  val intermediate = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L))
-
-  "countWords" should "work" in {
-    runWithContext { sc =>
-      val in = sc.parallelize(input)
-      WordCount2.countWords(in) should containInAnyOrder(intermediate)
-    }
-  }
-
-  "formatOutput" should "work" in {
-    runWithContext { sc =>
-      val in = sc.parallelize(intermediate)
-      WordCount2.formatOutput(in) should containInAnyOrder(expected)
-    }
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T03FunctionTest.scala b/src/test/scala/com/spotify/bdrc/testing/T03FunctionTest.scala
deleted file mode 100644
index bbf88eb..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T03FunctionTest.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.spotify.scio._
-import org.scalatest.flatspec.AnyFlatSpec
-import org.scalatest.matchers.should.Matchers
-
-object WordCount3 {
-  def main(cmdlineArgs: Array[String]): Unit = {
-    val (sc, args) = ContextAndArgs(cmdlineArgs)
-    sc.textFile(args("input"))
-      .flatMap(split)
-      .countByValue
-      .map(format)
-      .saveAsTextFile(args("output"))
-  }
-
-  def split(input: String): Seq[String] = input.split("[^a-zA-Z']+").filter(_.nonEmpty)
-  def format(kv: (String, Long)): String = kv._1 + ": " + kv._2
-}
-
-/**
- * Test individual functions used in a pipeline
- *
- * Pros:
- * - Fastest to test
- * - Easy to cover edge cases
- *
- * Cons:
- * - Limited scope of coverage
- * - May disrupt pipeline logic flow if overused
- *
- * Supported in: any framework
- *
- * Recommendation:
- * This is recommended for commonly reused functions or those with complex business logic, e.g.
- * numerical computation, log cleanup and filtering, value group operations after groupByKey.
- *
- * The level of granularity of each function is also important. Typical candidates are multi-line
- * functions that are used more than once. Functions with complex logic that are hard to test at a
- * higher level (transform or end-to-end), e.g. user session analysis after grouping by user key,
- * can also be tested with this approach.
- */
-class FunctionTest extends AnyFlatSpec with Matchers {
-
-  "split" should "work" in {
-    WordCount3.split("a b,c d\te\n\nf") should equal(Seq("a", "b", "c", "d", "e", "f"))
-  }
-
-  "format" should "work" in {
-    WordCount3.format(("a", 10L)) should equal("a: 10")
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T04PropertyBasedTest.scala b/src/test/scala/com/spotify/bdrc/testing/T04PropertyBasedTest.scala
deleted file mode 100644
index d765d1f..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T04PropertyBasedTest.scala
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.google.common.collect.MinMaxPriorityQueue
-import org.scalacheck.Prop._
-import org.scalacheck.{Gen, Properties}
-import org.scalatest.propspec.AnyPropSpec
-import org.scalatest.matchers.should.Matchers
-import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks
-
-import scala.collection.JavaConverters._
-
-object Utils {
-
-  def top[T: Ordering](xs: Seq[T], num: Int): Seq[T] = {
-    if (xs.isEmpty) {
-      Seq.empty[T]
-    } else {
-      val size = math.min(num, xs.size)
-      val ord = implicitly[Ordering[T]]
-      MinMaxPriorityQueue
-        .orderedBy(ord.reverse)
-        .expectedSize(size)
-        .maximumSize(size)
-        .create[T](xs.asJava)
-        .asScala
-        .toSeq
-        .sorted(ord.reverse)
-    }
-  }
-
-  def split(input: String): Seq[String] =
-    input
-      .split("[^a-zA-Z']+")
-      .filter(_.nonEmpty)
-      .map(_.toLowerCase)
-
-  def cosineSim(v1: Seq[Double], v2: Seq[Double]): Double = {
-    require(v1.length == v2.length)
-    var s1 = 0.0
-    var s2 = 0.0
-    var dp = 0.0
-    var i = 0
-    while (i < v1.length) {
-      s1 += v1(i) * v1(i)
-      s2 += v2(i) * v2(i)
-      dp += v1(i) * v2(i)
-      i += 1
-    }
-    dp / math.sqrt(s1 * s2)
-  }
-
-}
-
-/**
- * Property-based testing using ScalaCheck
- *
- * http://scalacheck.org/
- *
- * Pros:
- * - No need to handcraft input data
- * - May reveal rare edge cases, e.g. null input, extreme values, empty lists
- *
- * Cons:
- * - Hard to test business logic
- * - Some properties may be hard to verify
- * - Can be slow for expensive computations
- *
- * Supported in: any framework
- *
- * Recommendation:
- * This is useful for functions with simple input and output types, especially those doing heavy
- * mathematical computation, e.g. linear algebra, hash functions, set operations.
- *
- * However, since input data are randomly generated based on type signature, it might produce edge
- * cases irrelevant to the business logic, e.g. Double.MinValue, strings with Unicode characters.
- * You might also have to construct your own generator if a certain distribution of input data is
- * expected, e.g. positive integers, strings from a finite set.
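- *
- * For instance, custom generators for those two cases might look like the following sketch
- * (the vocabulary values are hypothetical):
- *
- * {{{
- *   val genPositiveInt = Gen.choose(1, Int.MaxValue)
- *   val genWord = Gen.oneOf("rock", "pop", "jazz")
- *   forAll(genWord) { w => Utils.split(w) shouldBe Seq(w) }
- * }}}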
- *
- * See AlgebirdSpec.scala for more examples of testing Algebird features using ScalaCheck
- * https://github.com/spotify/scio/blob/master/scio-examples/src/test/scala/com/spotify/scio/examples/extra/AlgebirdSpec.scala
- */
-class PropertyBasedTest extends AnyPropSpec with ScalaCheckDrivenPropertyChecks with Matchers {
-
-  property("top") {
-    forAll { xs: Seq[Long] => Utils.top(xs, 5) shouldBe xs.sorted.reverse.take(5) }
-  }
-
-  property("split") {
-    forAll { line: String => Utils.split(line).forall(_.matches("[a-z']+")) }
-  }
-
-  // Generator for List[Double] of 100 doubles between -100.0 and 100.0
-  val genVector = Gen.listOfN(100, Gen.choose(-100.0, 100.0))
-
-  property("cosineSim") {
-    forAll(genVector, genVector) { (v1, v2) =>
-      val s1 = Utils.cosineSim(v1, v2)
-      val s2 = Utils.cosineSim(v2, v1)
-
-      s1 should (be >= -1.0 and be <= 1.0)
-      s1 shouldBe s2
-      Utils.cosineSim(v1, v1) shouldBe 1.0
-      Utils.cosineSim(v1, v1.map(-_)) shouldBe -1.0
-    }
-  }
-
-}
diff --git a/src/test/scala/com/spotify/bdrc/testing/T05MixedTest.scala b/src/test/scala/com/spotify/bdrc/testing/T05MixedTest.scala
deleted file mode 100644
index ba4abb2..0000000
--- a/src/test/scala/com/spotify/bdrc/testing/T05MixedTest.scala
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright 2016 Spotify AB.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package com.spotify.bdrc.testing
-
-import com.spotify.scio._
-import com.spotify.scio.io.TextIO
-import com.spotify.scio.testing.PipelineSpec
-import com.spotify.scio.values.SCollection
-
-object WordCount4 {
-  def main(cmdlineArgs: Array[String]): Unit = {
-    val (sc, args) = ContextAndArgs(cmdlineArgs)
-    val input = sc.textFile(args("input"))
-    val wc = countWords(input)
-    val output = formatOutput(wc)
-    output.saveAsTextFile(args("output"))
-    sc.run()
-  }
-
-  // transforms
-  def countWords(input: SCollection[String]): SCollection[(String, Long)] =
-    input.flatMap(split).countByValue
-  def formatOutput(input: SCollection[(String, Long)]): SCollection[String] =
-    input.map(format)
-
-  // functions
-  def split(input: String): Seq[String] = input.split("[^a-zA-Z']+").filter(_.nonEmpty)
-  def format(kv: (String, Long)): String = kv._1 + ": " + kv._2
-}
-
-/**
- * Mixed function, transform and end-to-end tests
- *
- * Property-based tests require an object that extends Properties and therefore are not included.
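- *
- * The transform and end-to-end tests below reuse the same input, intermediate and expected
- * fixtures, so the different levels of the suite stay consistent with one another.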
- */
-class MixedTest extends PipelineSpec {
-
-  val input = Seq("a b c d e", "a b a b")
-  val expected = Seq("a: 3", "b: 3", "c: 1", "d: 1", "e: 1")
-  val intermediate = Seq(("a", 3L), ("b", 3L), ("c", 1L), ("d", 1L), ("e", 1L))
-
-  // Function tests
-
-  "split" should "work" in {
-    WordCount4.split("a b,c d\te\n\nf") should equal(Seq("a", "b", "c", "d", "e", "f"))
-  }
-
-  "format" should "work" in {
-    WordCount4.format(("a", 10L)) should equal("a: 10")
-  }
-
-  // Transform tests
-
-  "countWords" should "work" in {
-    runWithContext { sc =>
-      val in = sc.parallelize(input)
-      WordCount4.countWords(in) should containInAnyOrder(intermediate)
-    }
-  }
-
-  "formatOutput" should "work" in {
-    runWithContext { sc =>
-      val in = sc.parallelize(intermediate)
-      WordCount4.formatOutput(in) should containInAnyOrder(expected)
-    }
-  }
-
-  // End-to-end test
-
-  "WordCount4" should "work" in {
-    JobTest[com.spotify.bdrc.testing.WordCount4.type]
-      .args("--input=in.txt", "--output=out.txt")
-      .input(TextIO("in.txt"), input)
-      .output(TextIO("out.txt"))(output => output should containInAnyOrder(expected))
-      .run()
-  }
-
-}