Thanks to visit codestin.com
Credit goes to github.com

Skip to content
This repository was archived by the owner on Jan 20, 2022. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ object Batcher {
override val currentBatch = BatchID(0L)
def batchOf(t: Timestamp) = currentBatch
def earliestTimeOf(batch: BatchID) = Timestamp.Min

// Every timestamp maps to the one batch of this batcher, so that batch
// runs from Timestamp.Min up to Timestamp.Max.
override def latestTimeOf(batch: BatchID) = Timestamp.Max

override def toInterval(b: BatchID): Interval[Timestamp] =
if(b == BatchID(0))
Intersection(
Expand Down Expand Up @@ -160,6 +163,9 @@ trait Batcher extends Serializable {
/** Returns the (inclusive) earliest time of the supplied batch. */
def earliestTimeOf(batch: BatchID): Timestamp

/**
 * Returns the (inclusive) latest time of the supplied batch, computed as
 * the `prev` of the earliest time of the next batch.
 */
def latestTimeOf(batch: BatchID): Timestamp = earliestTimeOf(batch.next).prev

/** Returns the current BatchID. */
def currentBatch: BatchID = batchOf(Timestamp.now)

Expand All @@ -176,7 +182,7 @@ trait Batcher extends Serializable {
*/
def enclosedBy(batchID: BatchID, other: Batcher): Iterable[BatchID] = {
val earliestInclusive = earliestTimeOf(batchID)
val latestInclusive = earliestTimeOf(batchID.next).prev
val latestInclusive = latestTimeOf(batchID)
BatchID.range(
other.batchOf(earliestInclusive),
other.batchOf(latestInclusive)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
Copyright 2014 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package com.twitter.summingbird.batch

/**
 * Decides which items are expired (REMOVED) before they are written.
 * T is often a (K, V) pair.
 */
trait PrunedSpace[-T] extends java.io.Serializable {
/**
 * Returns true when `item` should be removed instead of written, and
 * false when it should be kept.
 *
 * @param item the candidate item, often a (K, V) pair
 * @param writeTime the timestamp associated with the write
 */
def prune(item: T, writeTime: Timestamp): Boolean
}

object PrunedSpace extends java.io.Serializable {
  /** A pruner that keeps everything: `prune` answers false for every item and time. */
  val neverPruned: PrunedSpace[Any] = new PrunedSpace[Any] {
    override def prune(item: Any, writeTime: Timestamp): Boolean = false
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import com.twitter.summingbird.scalding._
import com.twitter.summingbird.scalding
import com.twitter.summingbird._
import com.twitter.summingbird.option._
import com.twitter.summingbird.batch.{ BatchID, Batcher, Timestamp, IteratorSums}
import com.twitter.summingbird.batch.{ BatchID, Batcher, Timestamp, IteratorSums, PrunedSpace}
import cascading.flow.FlowDef

import org.slf4j.LoggerFactory
Expand All @@ -45,6 +45,14 @@ trait BatchedStore[K, V] extends scalding.Store[K, V] { self =>
*/
def select(b: List[BatchID]): List[BatchID] = b


/**
 * Override this to set up store pruning. By default, no (key, value) pairs
 * are pruned. This is a housekeeping hook for permanently removing entries
 * that match a criterion: pairs for which `prune` returns true are filtered
 * out before the batch is written.
 */
def pruning: PrunedSpace[(K, V)] = PrunedSpace.neverPruned

/**
* For (firstNonZero - 1) we read empty. For all before we error on read. For all later, we proxy
* On write, we throw if batchID is less than firstNonZero
Expand All @@ -71,7 +79,9 @@ trait BatchedStore[K, V] extends scalding.Store[K, V] { self =>
// make sure we checkpoint to disk to avoid double computation:
val checked = if(batches.size > 1) lastVals.forceToDisk else lastVals
batches.foreach { batchID =>
val thisBatch = checked.filter { case (b, _) => b == batchID }
val thisBatch = checked.filter { case (b, kv) =>
(b == batchID) && !pruning.prune(kv, batcher.latestTimeOf(b))
}
writeLast(batchID, thisBatch.values)(flow, mode)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ package object scalding {

val ScaldingConfig = summingbird.batch.BatchConfig


// ALL DEPRECATION ALIASES BELOW HERE, NOTHING ELSE.
@deprecated("Use com.twitter.summingbird.batch.WaitingState", "0.3.2")
type WaitingState[T] = summingbird.batch.WaitingState[T]
@deprecated("Use com.twitter.summingbird.batch.PrepareState", "0.3.2")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,15 @@ import com.twitter.summingbird.scalding._
* For (firstNonZero - 1) we read empty. For all before we error on read. For all later, we proxy
* On write, we throw if batchID is less than firstNonZero
*/
class InitialBatchedStore[K,V](val firstNonZero: BatchID, val proxy: BatchedStore[K, V])
extends BatchedStore[K, V] {

def batcher = proxy.batcher
def ordering = proxy.ordering
// This one is dangerous and marked override because it has a default
override def select(b: List[BatchID]) = proxy.select(b)
def writeLast(batchID: BatchID, lastVals: TypedPipe[(K, V)])(implicit flowDef: FlowDef, mode: Mode) =
class InitialBatchedStore[K,V](val firstNonZero: BatchID, override val proxy: BatchedStore[K, V])
extends ProxyBatchedStore[K, V] {

override def writeLast(batchID: BatchID, lastVals: TypedPipe[(K, V)])(implicit flowDef: FlowDef, mode: Mode) =
if (batchID >= firstNonZero) proxy.writeLast(batchID, lastVals)
else sys.error("Earliest batch set at :" + firstNonZero + " but tried to write: " + batchID)

// Here is where we switch:
def readLast(exclusiveUB: BatchID, mode: Mode): Try[(BatchID, FlowProducer[TypedPipe[(K, V)]])] = {
override def readLast(exclusiveUB: BatchID, mode: Mode): Try[(BatchID, FlowProducer[TypedPipe[(K, V)]])] = {
if (exclusiveUB > firstNonZero) proxy.readLast(exclusiveUB, mode)
else if (exclusiveUB == firstNonZero) Right((firstNonZero.prev, Scalding.emptyFlowProducer[(K,V)]))
else Left(List("Earliest batch set at :" + firstNonZero + " but tried to read: " + exclusiveUB))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
Copyright 2013 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package com.twitter.summingbird.scalding.store

import cascading.flow.FlowDef
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird.scalding._
import com.twitter.scalding.{Mode, TypedPipe}

/**
 * A BatchedStore that forwards every member to `proxy`.
 *
 * Extend it and override a single member to adjust an existing store,
 * for instance to change its pruning, while leaving everything else untouched.
 */
abstract class ProxyBatchedStore[K, V] extends batch.BatchedStore[K, V] {
  /** The wrapped store that all members below delegate to. */
  def proxy: batch.BatchedStore[K, V]

  // Configuration is taken straight from the wrapped store.
  override def batcher = proxy.batcher
  override def ordering = proxy.ordering
  override def pruning = proxy.pruning
  override def select(b: List[BatchID]) = proxy.select(b)

  // Reads and writes are handed to the wrapped store unchanged.
  override def readLast(exclusiveUB: BatchID, mode: Mode) =
    proxy.readLast(exclusiveUB, mode)

  override def writeLast(batchID: BatchID, lastVals: TypedPipe[(K, V)])(implicit flowDef: FlowDef, mode: Mode): Unit = {
    proxy.writeLast(batchID, lastVals)(flowDef, mode)
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,15 @@ class MockMappable[T](val id: String)(implicit tconv: TupleConverter[T])
}

object TestStore {
def apply[K, V](store: String, inBatcher: Batcher, initStore: Iterable[(K, V)], lastTime: Long)
def apply[K, V](store: String, inBatcher: Batcher, initStore: Iterable[(K, V)], lastTime: Long, pruning: PrunedSpace[(K, V)] = PrunedSpace.neverPruned)
(implicit ord: Ordering[K], tset: TupleSetter[(K, V)], tconv: TupleConverter[(K, V)]) = {
val startBatch = inBatcher.batchOf(Timestamp(0)).prev
val endBatch = inBatcher.batchOf(Timestamp(lastTime)).next
new TestStore[K, V](store, inBatcher, startBatch, initStore, endBatch)
new TestStore[K, V](store, inBatcher, startBatch, initStore, endBatch, pruning)
}
}

class TestStore[K, V](store: String, inBatcher: Batcher, initBatch: BatchID, initStore: Iterable[(K, V)], lastBatch: BatchID)
class TestStore[K, V](store: String, inBatcher: Batcher, initBatch: BatchID, initStore: Iterable[(K, V)], lastBatch: BatchID, override val pruning: PrunedSpace[(K, V)])
(implicit ord: Ordering[K], tset: TupleSetter[(K, V)], tconv: TupleConverter[(K, V)])
extends batch.BatchedStore[K, V] {

Expand Down Expand Up @@ -333,6 +333,43 @@ object ScaldingLaws extends Specification {
compareMaps(original, Monoid.plus(initStore, inMemory), testStore) must be_==(true)
}


// Runs a single-step job against a store configured with a pruner and
// compares the outcome with the same computation done in memory.
"match scala single step pruned jobs" in {
val original = sample[List[Int]]
val fn = sample[(Int) => List[(Int, Int)]]
val initStore = sample[Map[Int, Int]]
// Keys that the pruner below will reject.
val prunedList = sample[Set[Int]]
// Expected result: the initial store merged with the computed values,
// minus every key contained in prunedList.
val inMemory = {
val computedMap = TestGraphs.singleStepInScala(original)(fn)
val totalMap = Monoid.plus(initStore, computedMap)
totalMap.filter(kv => !prunedList.contains(kv._1)).toMap
}

// Prunes purely by key; the write time is ignored.
val pruner = new PrunedSpace[(Int, Int)] {
def prune(item: (Int, Int), writeTime: Timestamp) = {
prunedList.contains(item._1)
}
}
// Add a time: each item's index in the list is used as its timestamp.
val inWithTime = original.zipWithIndex.map { case (item, time) => (time.toLong, item) }
val batcher = randomBatcher(inWithTime)
val testStore = TestStore[Int,Int]("test", batcher, initStore, inWithTime.size, pruner)
val (buffer, source) = testSource(inWithTime)

val summer = TestGraphs.singleStepJob[Scalding,(Long,Int),Int,Int](source, testStore)(t =>
fn(t._2))

val scald = Scalding("scalaCheckJob")
val intr = batchedCover(batcher, 0L, original.size.toLong)
val ws = new LoopState(intr)
// Test mode resolves sources from the store's buffers and the input buffer.
val mode: Mode = TestMode(t => (testStore.sourceToBuffer ++ buffer).get(t))

scald.run(ws, mode, scald.plan(summer))
// Now check the store against the pruned in-memory expectation.
// NOTE(review): compareMaps is defined outside this view; presumably it
// returns true when the store contents agree with inMemory. Confirm.

compareMaps(original, inMemory, testStore) must be_==(true)
}

"match scala for flatMapKeys jobs" in {
val original = sample[List[Int]]
val initStore = sample[Map[Int,Int]]
Expand Down