@@ -40,7 +40,7 @@ object TestStore {
  }
}

-class TestStore[K, V](store: String, inBatcher: Batcher, initBatch: BatchID, initStore: Iterable[(K, V)], lastBatch: BatchID, override val pruning: PrunedSpace[(K, V)])(implicit ord: Ordering[K], tset: TupleSetter[(K, V)], tconv: TupleConverter[(K, V)])
+class TestStore[K, V](store: String, inBatcher: Batcher, val initBatch: BatchID, initStore: Iterable[(K, V)], lastBatch: BatchID, override val pruning: PrunedSpace[(K, V)])(implicit ord: Ordering[K], tset: TupleSetter[(K, V)], tconv: TupleConverter[(K, V)])
  extends batch.BatchedStore[K, V] {

  var writtenBatches = Set[BatchID](initBatch)
@@ -0,0 +1,191 @@
/*
Copyright 2013 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package com.twitter.summingbird.scalding

import cascading.flow.{ Flow, FlowDef }

import com.twitter.algebird._
import com.twitter.algebird.monad._
import com.twitter.summingbird.batch._
import com.twitter.summingbird.option.{ Commutative, NonCommutative, Commutativity }
import com.twitter.scalding.{ Source => ScaldingSource, Test => TestMode, _ }

import org.scalacheck._
import org.scalacheck.Prop._
import org.scalacheck.Properties

/**
 * Tests for BatchedStore
 */
object BatchedStoreProperties extends Properties("BatchedStore's Properties") {

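  // Generates non-empty half-open intervals [l, u): the guard discards samples with l >= u.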
  implicit def intersectionArb[T: Arbitrary: Ordering]: Arbitrary[Intersection[InclusiveLower, ExclusiveUpper, T]] =
    Arbitrary {
      for {
        l <- Arbitrary.arbitrary[T]
        u <- Arbitrary.arbitrary[T]
        if implicitly[Ordering[T]].lt(l, u)
      } yield Intersection(InclusiveLower(l), ExclusiveUpper(u))
    }

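  // Timestamps are drawn from a small positive range; these properties don't need extreme values.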
  implicit val arbTimestamp: Arbitrary[Timestamp] = Arbitrary {
    Gen.choose(1L, 100000L)
      .map { Timestamp(_) }
  }

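  // A PipeFactory that accepts exactly the time interval it is asked for and produces an
  // empty pipe; these properties only inspect the returned interval, not the data.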
  implicit val arbitraryPipeFactory: Arbitrary[PipeFactory[Nothing]] = {
    Arbitrary {
      Gen.const {
        StateWithError[(Interval[Timestamp], Mode), List[FailureReason], FlowToPipe[Nothing]] {
          (timeMode: (Interval[Timestamp], Mode)) =>
            {
              val (time: Interval[Timestamp], mode: Mode) = timeMode
              val a: FlowToPipe[Nothing] = Reader { (fdM: (FlowDef, Mode)) => TypedPipe.empty }
              Right((timeMode, a))
            }
        }
      }
    }
  }

  implicit def timeExtractor[T <: (Long, Any)] = TestUtil.simpleTimeExtractor[T]

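  // Builds one event per millisecond from an arbitrary list, a random batcher over that
  // time range, and a TestStore preloaded with an arbitrary initial snapshot.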
  implicit val arbitraryInputWithTimeStampAndBatcher: Arbitrary[(List[(Long, Int)], Batcher, TestStore[Int, Int])] = Arbitrary {
    for {
      arbInt <- Arbitrary.arbitrary[List[Int]]
      in = arbInt.zipWithIndex.map { case (item: Int, time: Int) => (time.toLong, item) }
      arbMap <- Arbitrary.arbitrary[Map[Int, Int]]
      batcher = TestUtil.randomBatcher(in)
      lastTimeStamp = in.size
      testStore = TestStore[Int, Int]("test", batcher, arbMap, lastTimeStamp)
    } yield (in, batcher, testStore)
  }

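  // Jobs run in Scalding local mode; commutativity is sampled so both the commutative
  // and non-commutative merge paths get exercised.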
  implicit def arbitraryLocalMode: Arbitrary[Mode] = Arbitrary { Gen.const(Local(true)) }
  implicit def arbitraryCommutativity: Arbitrary[Commutativity] = Arbitrary {
    Gen.oneOf(Seq(Commutative, NonCommutative))
  }

property("readAfterLastBatch should return interval starting from the last batch written") = {
forAll {
(diskPipeFactory: PipeFactory[Nothing],
interval: Intersection[InclusiveLower, ExclusiveUpper, Timestamp],
inputWithTimeStampAndBatcherAndStore: (List[(Long, Int)], Batcher, TestStore[Int, Int]),
mode: Mode) =>
val (inputWithTimeStamp, batcher, testStore) = inputWithTimeStampAndBatcherAndStore
val result = testStore.readAfterLastBatch(diskPipeFactory)((interval, mode))

result match {
case Right(((Intersection(InclusiveLower(readIntervalLower), ExclusiveUpper(_)), _), _)) => {
//readInterval should start from the last written interval in the store
val start: Timestamp = batcher.earliestTimeOf(testStore.initBatch.next)
implicitly[Ordering[Timestamp]].equiv(readIntervalLower, start)
}
case Right(_) => false
case Left(_) => interval == Empty()
}
}
}

property("readAfterLastBatch should not extend the end of interval requested") = {
forAll {
(diskPipeFactory: PipeFactory[Nothing],
interval: Intersection[InclusiveLower, ExclusiveUpper, Timestamp],
inputWithTimeStampAndBatcherAndStore: (List[(Long, Int)], Batcher, TestStore[Int, Int]),
mode: Mode) =>
val (inputWithTimeStamp, batcher, testStore) = inputWithTimeStampAndBatcherAndStore
val result = testStore.readAfterLastBatch(diskPipeFactory)((interval, mode))

result match {
case Right(((Intersection(InclusiveLower(_), ExclusiveUpper(readIntervalUpper)), _), _)) => {
//readInterval should start from the last written interval in the store
implicitly[Ordering[Timestamp]].lteq(readIntervalUpper, interval.upper.upper)
}
case Right(_) => false
case Left(_) => interval == Empty()
}
}
}

property("the end of merged interval is never extended") = {
forAll {
(diskPipeFactory: PipeFactory[Nothing],
interval: Intersection[InclusiveLower, ExclusiveUpper, Timestamp],
inputWithTimeStampAndBatcherAndStore: (List[(Long, Int)], Batcher, TestStore[Int, Int]),
commutativity: Commutativity,
mode: Mode) =>
val (inputWithTimeStamp, batcher, testStore) = inputWithTimeStampAndBatcherAndStore
val mergeResult = testStore.merge(diskPipeFactory, implicitly[Semigroup[Int]], commutativity, 10)((interval, mode))
mergeResult.isRight ==> {
val Right(((Intersection(InclusiveLower(_), ExclusiveUpper(readIntervalUpper)), _), _)) = mergeResult
val requestedEndingTimestamp: Timestamp = interval.upper.upper
val readIntervalEndingTimestamp: Timestamp = readIntervalUpper
implicitly[Ordering[Timestamp]].lteq(readIntervalEndingTimestamp, requestedEndingTimestamp)
}
}
}

property("should not merge if the time interval on disk(from diskPipeFactory) is smaller than one batch") = {
//To test this property, it requires the length of the batcher is at least 2 millis, since we want
//to create data that fits a batch partially
def atLeast2MsBatcher(batcher: Batcher): Boolean = {
batcher match {
case b: MillisecondBatcher => b.durationMillis >= 2
case _ => true
}
}
forAll {
(interval: Intersection[InclusiveLower, ExclusiveUpper, Timestamp],
inputWithTimeStampAndBatcherAndStore: (List[(Long, Int)], Batcher, TestStore[Int, Int]),
commutativity: Commutativity,
mode: Mode) =>
val (inputWithTimeStamp, batcher, testStore) = inputWithTimeStampAndBatcherAndStore
(atLeast2MsBatcher(batcher)) ==> {
val nextBatchEnding = batcher.latestTimeOf(testStore.initBatch.next)

//this diskPipeFactory returns a time interval that ends before the ending of next batch, meaning there is not enough data for a new batch
val diskPipeFactory = StateWithError[(Interval[Timestamp], Mode), List[FailureReason], FlowToPipe[(Int, Int)]] {
(timeMode: (Interval[Timestamp], Mode)) =>
{
val (time: Interval[Timestamp], mode: Mode) = timeMode
val Intersection(InclusiveLower(startRequestedTime), ExclusiveUpper(_)) = time

//shrink the endTime so it does not cover a whole batch
val onDiskEndTime: Long = Gen.choose(startRequestedTime.milliSinceEpoch, nextBatchEnding.milliSinceEpoch).sample.get

val readTime: Interval[Timestamp] = if (startRequestedTime == nextBatchEnding)
Empty()
else
Intersection(InclusiveLower(startRequestedTime), ExclusiveUpper(nextBatchEnding))

val flowToPipe: FlowToPipe[(Int, Int)] = Reader { (fdM: (FlowDef, Mode)) => TypedPipe.from[(Timestamp, (Int, Int))](Seq((Timestamp(10), (2, 3)))) }
Right(((readTime, mode), flowToPipe))
}
}

val mergeResult = testStore.merge(diskPipeFactory, implicitly[Semigroup[Int]], commutativity, 10)((interval, mode))

mergeResult match {
case Left(l) => {
l.mkString.contains("readTimespan is not convering at least one batch").label("fail with right reason")
}
case Right(_) => false.label("should fail when readTimespan is not covering at least one batch")
}
}
}
}
}
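An aside on why the last property expects a failure: batchesCoveredBy only counts batches whose entire span fits inside the read interval, so an interval shorter than one batch covers nothing, and the atLeastOneBatch check (added to BatchedStore below) rejects it. The following is a minimal self-contained sketch of that arithmetic; the fixed batch width dur and nonnegative millisecond timestamps are simplifying assumptions for illustration, not the real Batcher API:

// Simplified model: batch b spans [b * dur, (b + 1) * dur) milliseconds.
// A batch is "covered" by [lower, upper) iff its whole span fits inside.
def batchesCoveredBy(lower: Long, upper: Long, dur: Long): Seq[Long] = {
  val first = (lower + dur - 1) / dur // first batch starting at or after lower
  val last = upper / dur // first batch whose end falls past upper
  first until last
}

batchesCoveredBy(0L, 10L, 10L) // Seq(0L): exactly one whole batch is covered
batchesCoveredBy(3L, 12L, 10L) // empty: a 9 ms window covers no whole batch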
@@ -143,16 +143,15 @@ trait BatchedStore[K, V] extends scalding.Store[K, V] { self =>
    // get the batches read from the readTimespan
    val batchIntr = batcher.batchesCoveredBy(readTimespan)

-   logger.info("readTimeSpan {}", readTimespan)
    val batches = BatchID.toIterable(batchIntr).toList
-   val finalBatch = batches.last // batches won't be empty.
+   val finalBatch = batches.last // batches won't be empty, ensured by the atLeastOneBatch method
    val filteredBatches = select(batches).sorted

    assert(filteredBatches.contains(finalBatch), "select must not remove the final batch.")

    import IteratorSums._ // get the groupedSum, partials function

-   logger.info("Previous written batch: {}, computing: {}", inBatch.asInstanceOf[Any], batches)
+   logger.debug("Previous written batch: {}, computing: {}", inBatch.asInstanceOf[Any], batches)

    def prepareOld(old: TypedPipe[(K, V)]): TypedPipe[(K, (BatchID, (Timestamp, V)))] =
      old.map { case (k, v) => (k, (inBatch, (Timestamp.Min, v))) }
@@ -284,8 +283,6 @@ trait BatchedStore[K, V] extends scalding.Store[K, V] { self =>
      // Now get the first timestamp that we need input data for.
      firstDeltaTimestamp = lastTimeWrittenToStore.next

-     firstDeltaBatch = lastBatch.next
-
      // Get the requested timeSpan.
      tsMode <- getState[FactoryInput]
      (timeSpan, mode) = tsMode
@@ -303,17 +300,13 @@ trait BatchedStore[K, V] extends scalding.Store[K, V] { self =>

      (readDeltaTimestamps, readFlow) = readTimeFlow

-     firstDeltaBatchInterval: Interval[Timestamp] = batcher.toInterval(firstDeltaBatch)
      // Make sure that the time we can read includes the time just after the last
      // snapshot. We can't roll the store forward without this.
-     _ <- fromEither[FactoryInput] {
-       logger.info("firstBatchInterval is {}", firstDeltaBatchInterval)
-       if (readDeltaTimestamps.intersect(firstDeltaBatchInterval) == firstDeltaBatchInterval) // readDeltaTimestamps should include the firstDeltaBatchInterval
-         Right(())
-       else
-         Left(List("Cannot load initial timestamp interval " + firstDeltaBatchInterval.toString + " of deltas " +
-           " at " + this.toString + " only " + readDeltaTimestamps.toString))
-     }
+     _ <- fromEither[FactoryInput](if (readDeltaTimestamps.contains(firstDeltaTimestamp))
+       Right(())
+     else
+       Left(List("Cannot load initial timestamp " + firstDeltaTimestamp.toString + " of deltas " +
+         " at " + this.toString + " only " + readDeltaTimestamps.toString)))

      // Record the timespan we actually read.
      _ <- putState((readDeltaTimestamps, mode))
@@ -339,6 +332,19 @@ trait BatchedStore[K, V] extends scalding.Store[K, V] { self =>
        deltaFlow2Pipe)
    }

+ /**
+  * This is for ensuring there is at least one batch covered by readTimespan. This is
+  * required by mergeBatched.
+  */
+ private def atLeastOneBatch(readTimespan: Interval[Timestamp]) =
+   fromEither[FactoryInput] {
+     if (batcher.batchesCoveredBy(readTimespan) == Empty()) {
+       Left(List("readTimespan is not covering at least one batch: " + readTimespan.toString))
+     } else {
+       Right(())
+     }
+   }

  /**
   * instances of this trait MAY NOT change the logic here. This always follows the rule
   * that we look for existing data (avoiding reading deltas in that case), then we fall
@@ -360,6 +366,7 @@ trait BatchedStore[K, V] extends scalding.Store[K, V] { self =>
      // get the actual timespan read by readAfterLastBatch
      tsModeRead <- getState[FactoryInput]
      (tsRead, _) = tsModeRead
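+     // fail fast: mergeBatched calls batches.last, which needs at least one covered batch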
+     _ <- atLeastOneBatch(tsRead)

      /**
       * Once we have read the last snapshot and the available batched blocks of delta, just merge