twitter · ianoc · Nov 27, 2014 · Nov 7, 2014 · Nov 7, 2014 · Nov 7, 2014
diff --git a/summingbird-core-java/src/main/java/com/twitter/summingbird/memory/javaapi/JMemory.java b/summingbird-core-java/src/main/java/com/twitter/summingbird/memory/javaapi/JMemory.java
@@ -26,6 +26,7 @@
  * @author Julien Le Dem
  *
  */
+
 public class JMemory {
 
   private final JobId jobId;
@@ -77,8 +78,8 @@ public static <T> Sink<Memory, Function1<T, Void>, T> sink(JSink<T> sink) {
    * @param service
    * @return the corresponding Service to use in JProducer.lookup
    */
-  public static <K,V> Service<Memory, Function1<K, Option<V>>, K, V> service(Function<K, Option<V>> service) {
-    return new Service<Memory, Function1<K, Option<V>>, K, V>(JProducerImpl.toScala(service));
+  public static <K,V> Service<Memory, JMemoryService<K, V>, K, V> service(Map<K, V> service) {
+    return new Service<Memory, JMemoryService<K, V>, K, V>(new JMemoryService<K, V>(service));
   }
 
 

diff --git a/...ngbird-core-java/src/main/java/com/twitter/summingbird/memory/javaapi/JMemoryService.java b/...ngbird-core-java/src/main/java/com/twitter/summingbird/memory/javaapi/JMemoryService.java
@@ -0,0 +1,34 @@
+package com.twitter.summingbird.memory.javaapi;
+
+import com.twitter.summingbird.memory.MemoryService;
+import java.util.Map;
+import scala.Option;
+
+/**
+ * Adapts a {@link java.util.Map} to {@link MemoryService} so a plain Java map can
+ * be used as a lookup service on the Memory platform.
+ *
+ * @param <K> key type of the backing map
+ * @param <V> value type of the backing map
+ */
+public class JMemoryService<K, V> implements MemoryService<K, V> {
+
+  // The map is kept by reference, not copied.
+  private final Map<K, V> serviceMap;
+
+  /**
+   * @param m map consulted by {@link #get}
+   */
+  public JMemoryService(Map<K, V> m) {
+    serviceMap = m;
+  }
+
+  /**
+   * @param key key to look up
+   * @return the mapped value wrapped in an Option; None when the map returns null
+   */
+  public Option<V> get(K key) {
+    // Option.apply turns a null result into None.
+    return Option.apply(serviceMap.get(key));
+  }
+}
diff --git a/summingbird-core-java/src/test/java/com/twitter/summingbird/memory/javaapi/TestJMemory.java b/summingbird-core-java/src/test/java/com/twitter/summingbird/memory/javaapi/TestJMemory.java
@@ -39,10 +39,18 @@ public class TestJMemory {
   private static final Integer[] LENGTH = { 3, 3, 5 };
   private static final String[] LESS_THAN_4 = { "one", "two" };
   private static final String[] FLATTENED = { "o", "e", "t", "o", "th", "ee" };
+  private static final HashMap<String, Integer> SERVICE;
+  static
+  {
+      SERVICE = new HashMap<String, Integer>();
+      SERVICE.put("one", 3);
+      SERVICE.put("two", 3);
+      SERVICE.put("three", 5);
+  }
 
   private static final JProducer<Memory, String> SOURCE = source(asList(INPUT));
 
-  private static final Service<Memory, Function1<String, Option<Integer>>, String, Integer> LENGTH_SERVICE = service(new Function<String, Option<Integer>>() {
+  private static final Service<Memory, JMemoryService<String, Integer>, String, Integer> LENGTH_SERVICE = service(new HashMap<String, Integer>(SERVICE) {
     public Option<Integer> apply(String p) {
       return new Some<Integer>(p.length());
     }

diff --git a/summingbird-core/src/main/scala/com/twitter/summingbird/Producer.scala b/summingbird-core/src/main/scala/com/twitter/summingbird/Producer.scala
@@ -80,6 +80,7 @@ object Producer {
       case OptionMappedProducer(producer, _) => List(producer)
       case FlatMappedProducer(producer, _) => List(producer)
       case KeyFlatMappedProducer(producer, _) => List(producer)
+      case ValueFlatMappedProducer(producer, _) => List(producer)
       case WrittenProducer(producer, _) => List(producer)
       case LeftJoinedProducer(producer, _) => List(producer)
       case Summer(producer, _, _) => List(producer)
@@ -99,6 +100,7 @@ object Producer {
     case OptionMappedProducer(_, _) => false
     case FlatMappedProducer(_, _) => false
     case KeyFlatMappedProducer(_, _) => false
+    case ValueFlatMappedProducer(_, _) => false
     case WrittenProducer(_, _) => false
     case LeftJoinedProducer(_, _) => false
     case Summer(_, _, _) => false
@@ -255,7 +257,7 @@ sealed trait KeyedProducer[P <: Platform[P], K, V] extends Producer[P, (K, V)] {
 
   /** Builds a new KeyedProvider by applying a partial function to values of elements of this one on which the function is defined.*/
   def collectValues[V2](pf: PartialFunction[V, V2]): KeyedProducer[P, K, V2] =
-    IdentityKeyedProducer(collect { case (k, v) if pf.isDefinedAt(v) => (k, pf(v)) })
+    flatMapValues { v => if (pf.isDefinedAt(v)) Iterator(pf(v)) else Iterator.empty }
 
   /**
    * Prefer this to filter or flatMap/flatMapKeys if you are filtering.
@@ -273,7 +275,7 @@ sealed trait KeyedProducer[P <: Platform[P], K, V] extends Producer[P, (K, V)] {
    * the partition.
    */
   def filterValues(pred: V => Boolean): KeyedProducer[P, K, V] =
-    IdentityKeyedProducer(filter { case (_, v) => pred(v) })
+    flatMapValues { v => if (pred(v)) Iterator(v) else Iterator.empty }
 
   /**
    * Prefer to call this method to flatMap if you are expanding only keys.
@@ -284,7 +286,7 @@ sealed trait KeyedProducer[P <: Platform[P], K, V] extends Producer[P, (K, V)] {
 
   /** Prefer this to a raw map as this may be optimized to avoid a key reshuffle */
   def flatMapValues[U](fn: V => TraversableOnce[U]): KeyedProducer[P, K, U] =
-    IdentityKeyedProducer(flatMap { case (k, v) => fn(v).map((k, _)) })
+    ValueFlatMappedProducer(this, fn)
 
   /** Return just the keys */
   def keys: Producer[P, K] = map(_._1)
@@ -315,7 +317,7 @@ sealed trait KeyedProducer[P <: Platform[P], K, V] extends Producer[P, (K, V)] {
 
   /** Prefer this to a raw map as this may be optimized to avoid a key reshuffle */
   def mapValues[U](fn: V => U): KeyedProducer[P, K, U] =
-    IdentityKeyedProducer(map { case (k, v) => (k, fn(v)) })
+    flatMapValues { v => Iterator(fn(v)) }
 
   /**
    * emits a KeyedProducer with a value that is the store value, just BEFORE a merge,
@@ -336,7 +338,10 @@ sealed trait KeyedProducer[P <: Platform[P], K, V] extends Producer[P, (K, V)] {
   def values: Producer[P, V] = map(_._2)
 }
 
-case class KeyFlatMappedProducer[P <: Platform[P], K, V, K2](producer: KeyedProducer[P, K, V], fn: K => TraversableOnce[K2]) extends KeyedProducer[P, K2, V]
+case class KeyFlatMappedProducer[P <: Platform[P], K, V, K2](producer: Producer[P, (K, V)], fn: K => TraversableOnce[K2]) extends KeyedProducer[P, K2, V]
+
+case class ValueFlatMappedProducer[P <: Platform[P], K, V, V2](producer: Producer[P, (K, V)],
+  fn: V => TraversableOnce[V2]) extends KeyedProducer[P, K, V2] // node built by flatMapValues: fn expands each value, the key type K is unchanged
 
 case class IdentityKeyedProducer[P <: Platform[P], K, V](producer: Producer[P, (K, V)]) extends KeyedProducer[P, K, V]
 

diff --git a/summingbird-core/src/main/scala/com/twitter/summingbird/memory/ConcurrentMemory.scala b/summingbird-core/src/main/scala/com/twitter/summingbird/memory/ConcurrentMemory.scala
@@ -245,6 +245,7 @@ class ConcurrentMemory(implicit jobID: JobId = JobId("default.concurrent.memory.
       .orElse(FlatMapFusion)
       .orElse(RemoveNames)
       .orElse(RemoveIdentityKeyed)
+      .orElse(ValueFlatMapToFlatMap)
 
     val deps = Dependants(optimize(prod, ourRule))
     val heads = deps.nodes.collect { case s @ Source(_) => s }

diff --git a/summingbird-core/src/main/scala/com/twitter/summingbird/memory/Memory.scala b/summingbird-core/src/main/scala/com/twitter/summingbird/memory/Memory.scala
@@ -19,18 +19,23 @@ package com.twitter.summingbird.memory
 import com.twitter.algebird.Monoid
 import com.twitter.summingbird._
 import com.twitter.summingbird.option.JobId
+import com.twitter.summingbird.planner.DagOptimizer
 import collection.mutable.{ Map => MutableMap }
 
 object Memory {
   implicit def toSource[T](traversable: TraversableOnce[T])(implicit mf: Manifest[T]): Producer[Memory, T] =
     Producer.source[Memory, T](traversable)
 }
 
+trait MemoryService[-K, +V] { // key -> optional value lookup; this is the Memory platform's Service type
+  def get(k: K): Option[V] // the value for k, if there is one
+}
+
 class Memory(implicit jobID: JobId = JobId("default.memory.jobId")) extends Platform[Memory] {
   type Source[T] = TraversableOnce[T]
   type Store[K, V] = MutableMap[K, V]
   type Sink[-T] = (T => Unit)
-  type Service[-K, +V] = (K => Option[V])
+  type Service[-K, +V] = MemoryService[K, V]
   type Plan[T] = Stream[T]
 
   private type Prod[T] = Producer[Memory, T]
@@ -83,7 +88,7 @@ class Memory(implicit jobID: JobId = JobId("default.memory.jobId")) extends Plat
           case LeftJoinedProducer(producer, service) =>
             val (s, m) = toStream(producer, jamfs)
             val joined = s.map {
-              case (k, v) => (k, (v, service(k)))
+              case (k, v) => (k, (v, service.get(k)))
             }
             (joined, m)
 
@@ -111,7 +116,12 @@ class Memory(implicit jobID: JobId = JobId("default.memory.jobId")) extends Plat
       MemoryStatProvider.registerCounters(jobID, registeredCounters)
       SummingbirdRuntimeStats.addPlatformStatProvider(MemoryStatProvider)
     }
-    toStream(prod, Map.empty)._1
+
+    val dagOptimizer = new DagOptimizer[Memory] {}
+    val memoryTail = dagOptimizer.optimize(prod, dagOptimizer.ValueFlatMapToFlatMap)
+    val memoryDag = memoryTail.asInstanceOf[TailProducer[Memory, T]]
+
+    toStream(memoryDag, Map.empty)._1
   }
 
   def run(iter: Stream[_]) {

diff --git a/summingbird-core/src/main/scala/com/twitter/summingbird/planner/DagOptimizer.scala b/summingbird-core/src/main/scala/com/twitter/summingbird/planner/DagOptimizer.scala
@@ -32,12 +32,18 @@ trait DagOptimizer[P <: Platform[P]] {
   protected def mkAlso[T, U]: (Prod[T], Prod[U]) => Prod[U] = {
     (left, right) => AlsoProducer(left.asInstanceOf[TailProducer[P, T]], right)
   }
+  protected def mkAlsoTail[T, U]: (Prod[T], Prod[U]) => Prod[U] = { // builds an AlsoTailProducer; both sides are cast to TailProducer
+    (left, right) => new AlsoTailProducer(left.asInstanceOf[TailProducer[P, T]], right.asInstanceOf[TailProducer[P, U]])
+  }
   protected def mkMerge[T]: (Prod[T], Prod[T]) => Prod[T] = {
     (left, right) => MergedProducer(left, right)
   }
   protected def mkNamed[T](name: String): (Prod[T] => Prod[T]) = {
     prod => NamedProducer(prod, name)
   }
+  protected def mkTPNamed[T](name: String): (Prod[T] => Prod[T]) = { // builds a TPNamedProducer called `name`; the input is cast to TailProducer
+    prod => new TPNamedProducer(prod.asInstanceOf[TailProducer[P, T]], name)
+  }
   protected def mkIdentKey[K, V]: (Prod[(K, V)] => Prod[(K, V)]) = {
     prod => IdentityKeyedProducer(prod)
   }
@@ -50,6 +56,9 @@ trait DagOptimizer[P <: Platform[P]] {
   protected def mkKeyFM[T, U, V](fn: T => TraversableOnce[U]): (Prod[(T, V)] => Prod[(U, V)]) = {
     prod => KeyFlatMappedProducer(prod, fn)
   }
+  protected def mkValueFM[K, U, V](fn: U => TraversableOnce[V]): (Prod[(K, U)] => Prod[(K, V)]) = { // builds a ValueFlatMappedProducer applying fn
+    prod => ValueFlatMappedProducer(prod, fn)
+  }
   protected def mkWritten[T, U >: T](sink: P#Sink[U]): (Prod[T] => Prod[T]) = {
     prod => WrittenProducer[P, T, U](prod, sink)
   }
@@ -90,6 +99,12 @@ trait DagOptimizer[P <: Platform[P]] {
       val lit = BinaryLit[R, T, T, N](l1, l2, mkAlso)
       (h2 + (a -> lit), lit)
     }
+    def alsoTail[R](a: AlsoTailProducer[P, R, T]): (M, L[T]) = { // literal for an AlsoTailProducer
+      val (h1, l1) = toLiteral(hm, a.ensure) // convert the ensure side first
+      val (h2, l2) = toLiteral(h1, a.result) // then the result side, starting from the map returned above
+      val lit = BinaryLit[R, T, T, N](l1, l2, mkAlsoTail)
+      (h2 + (a -> lit), lit) // return the map extended with a -> lit
+    }
     def merge(m: MergedProducer[P, T]): (M, L[T]) = {
       val (h1, l1) = toLiteral(hm, m.left)
       val (h2, l2) = toLiteral(h1, m.right)
@@ -101,6 +116,11 @@ trait DagOptimizer[P <: Platform[P]] {
       val lit = UnaryLit[T, T, N](l1, mkNamed(n.id))
       (h1 + (n -> lit), lit)
     }
+    def namedTP(n: TPNamedProducer[P, T]): (M, L[T]) = { // literal for a TPNamedProducer
+      val (h1, l1) = toLiteral(hm, n.producer) // convert the wrapped producer first
+      val lit = UnaryLit[T, T, N](l1, mkTPNamed(n.id)) // the same name (n.id) is reused when the node is rebuilt
+      (h1 + (n -> lit), lit) // return the map extended with n -> lit
+    }
     def ikp[K, V](ik: IdentityKeyedProducer[P, K, V]): (M, L[(K, V)]) = {
       val (h1, l1) = toLiteral(hm, ik.producer)
       val lit = UnaryLit[(K, V), (K, V), N](l1, mkIdentKey)
@@ -121,6 +141,11 @@ trait DagOptimizer[P <: Platform[P]] {
       val lit = UnaryLit[(K, V), (K2, V), N](l1, mkKeyFM(kf.fn))
       (h1 + (kf -> lit), lit)
     }
+    def vfm[K, V, V2](kf: ValueFlatMappedProducer[P, K, V, V2]): (M, L[(K, V2)]) = { // literal for a ValueFlatMappedProducer
+      val (h1, l1) = toLiteral(hm, kf.producer) // convert the upstream producer first
+      val lit = UnaryLit[(K, V), (K, V2), N](l1, mkValueFM(kf.fn)) // the same fn is reused when the node is rebuilt
+      (h1 + (kf -> lit), lit) // return the map extended with kf -> lit
+    }
     def writer[T1 <: T, U >: T1](w: WrittenProducer[P, T1, U]): (M, L[T]) = {
       val (h1, l1) = toLiteral(hm, w.producer)
       val lit = UnaryLit[T1, T, N](l1, mkWritten[T1, U](w.sink))
@@ -148,8 +173,10 @@ trait DagOptimizer[P <: Platform[P]] {
       case None =>
         prod match {
           case s @ Source(_) => source(s)
+          case a: AlsoTailProducer[_, _, _] => alsoTail(a.asInstanceOf[AlsoTailProducer[P, _, T]])
           case a @ AlsoProducer(_, _) => also(a)
           case m @ MergedProducer(l, r) => merge(m)
+          case n: TPNamedProducer[_, _] => namedTP(n.asInstanceOf[TPNamedProducer[P, T]])
           case n @ NamedProducer(producer, name) => named(n)
           case w @ WrittenProducer(producer, sink) => writer(w)
           case fm @ FlatMappedProducer(producer, fn) => flm(fm)
@@ -158,6 +185,7 @@ trait DagOptimizer[P <: Platform[P]] {
           // but I can't convince scala of this without the cast.
           case ik @ IdentityKeyedProducer(producer) => cast(ikp(ik))
           case kf @ KeyFlatMappedProducer(producer, fn) => cast(kfm(kf))
+          case vf @ ValueFlatMappedProducer(producer, fn) => cast(vfm(vf))
           case j @ LeftJoinedProducer(producer, srv) => cast(joined(j))
           case s @ Summer(producer, store, sg) => cast(summer(s))
         }
@@ -255,6 +283,18 @@ trait DagOptimizer[P <: Platform[P]] {
         cast(in.flatMap { case (k, v) => fn(k).map((_, v)) })
     }
   }
+  /** If you can't optimize ValueFlatMaps, use this rule.
+   * It rewrites ValueFlatMappedProducer(in, fn) into in.flatMap, applying fn to each value and pairing every result with the original key.
+   */
+  object ValueFlatMapToFlatMap extends PartialRule[Prod] {
+    def applyWhere[T](on: ExpressionDag[Prod]) = {
+      // TODO: we need to case class here to not lose the irreducible which may be named
+      case ValueFlatMappedProducer(in, fn) =>
+        // we know that (K, V) <: T due to the case match, but scala can't see it
+        def cast[K, V](p: Prod[(K, V)]): Prod[T] = p.asInstanceOf[Prod[T]]
+        cast(in.flatMap { case (k, v) => fn(v).map((k, _)) })
+    }
+  }
 
   /**
    * Combine flatMaps followed by optionMap into a single operation

diff --git a/summingbird-core/src/test/scala/com/twitter/summingbird/DependantsTests.scala b/summingbird-core/src/test/scala/com/twitter/summingbird/DependantsTests.scala
@@ -24,15 +24,15 @@ import org.scalacheck.Arbitrary._
 import org.scalacheck.Prop._
 import scala.util.Random
 
-import scala.collection.mutable.{ Map => MMap }
+import scala.collection.mutable.{ Map => MMap, HashMap => MHashMap }
 
 object DependantsTest extends Properties("Dependants") {
   import TestGraphGenerators._
+  import MemoryArbitraries._
   implicit def testStore: Memory#Store[Int, Int] = MMap[Int, Int]()
+  implicit def testService: Memory#Service[Int, Int] = new MHashMap[Int, Int]() with MemoryService[Int, Int]
   implicit def sink1: Memory#Sink[Int] = ((_) => Unit)
   implicit def sink2: Memory#Sink[(Int, Int)] = ((_) => Unit)
-  implicit val arbSource1: Arbitrary[Producer[Memory, Int]] = Arbitrary(Gen.listOfN(100, Arbitrary.arbitrary[Int]).map(Producer.source[Memory, Int](_)))
-  implicit val arbSource2: Arbitrary[KeyedProducer[Memory, Int, Int]] = Arbitrary(Gen.listOfN(100, Arbitrary.arbitrary[(Int, Int)]).map(Producer.source[Memory, (Int, Int)](_)))
 
   implicit def genProducer: Arbitrary[Producer[Memory, _]] = Arbitrary(oneOf(genProd1, genProd2, summed))
 
@@ -150,7 +150,7 @@ object DependantsTest extends Properties("Dependants") {
   }
 
   property("transitiveDependantsTillOutput finds outputs as a subset of dependants") =
-    forAll { (prod: Producer[Memory, Any]) =>
+    forAll { (prod: Producer[Memory, _]) =>
       val dependants = Dependants(prod)
       dependants.nodes.forall { n =>
         val output = dependants.transitiveDependantsTillOutput(n).collect {
@@ -162,7 +162,7 @@ object DependantsTest extends Properties("Dependants") {
     }
 
   property("transitiveDependantsTillOutput is a subset of writers dependencies") =
-    forAll { (prod: Producer[Memory, Any]) =>
+    forAll { (prod: Producer[Memory, _]) =>
       val dependants = Dependants(prod)
       dependants.nodes.forall { n =>
         val depTillWrite = dependants.transitiveDependantsTillOutput(n)
@@ -174,7 +174,7 @@ object DependantsTest extends Properties("Dependants") {
       }
     }
 
-  property("transitiveDependantsTillOutput finds no children of outputs") = forAll { (prod: Producer[Memory, Any]) =>
+  property("transitiveDependantsTillOutput finds no children of outputs") = forAll { (prod: Producer[Memory, _]) =>
     val dependants = Dependants(prod)
     dependants.nodes.forall { n =>
       val tillWrite = dependants.transitiveDependantsTillOutput(n)