From 4e9b5549664af4c09cceb789472a1b98985e1e9d Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Thu, 2 Feb 2017 17:15:40 -0800
Subject: [PATCH 01/12] first stacked running ok

---
 README.md | 10 +++++++-
 clstm.py  | 70 +++++++++++++++++++++++++++++++++++++++----------------
 main.py   | 46 ++++++++++++++++++++++++++++--------
 3 files changed, 95 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index 55a3550..8cfc511 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,10 @@
 
 # Inspired by papers:
 
+https://arxiv.org/abs/1506.04214 (Conv LSTM)
 http://www.jmlr.org/proceedings/papers/v2/sutskever07a/sutskever07a.pdf
 https://arxiv.org/abs/1411.4389
 https://arxiv.org/abs/1504.08023
-https://arxiv.org/abs/1506.04214 (like this paper with RNN but now with LSTM)
 https://arxiv.org/abs/1511.06380
 https://arxiv.org/abs/1511.05440
 https://arxiv.org/abs/1605.08104
@@ -110,6 +110,14 @@ smplayer out_all2_fast.mp4
 
 * Try more filters
 
+* Try L2 loss not only on the final image but also on the hidden states.  This should approximate adversarial networks, which keep the image and the hidden latent variable more smoothly connected (i.e. avoid a fractured manifold).
+
+* Try different hyperparameters
+
+* Try Stacked Conv/Deconv LSTMs (https://arxiv.org/pdf/1506.04214v2.pdf and https://arxiv.org/pdf/1605.07157v4.pdf)
+
+* Try skip connections (https://arxiv.org/pdf/1605.07157v4.pdf)
+
 * Try temporal convolution
 
 * Try other LSTM architectures (C-peek, bind forget-recall, GRU, etc.)
diff --git a/clstm.py b/clstm.py
index e3e2a41..1a23310 100644
--- a/clstm.py
+++ b/clstm.py
@@ -42,12 +42,13 @@ class clstm(CRNNCell):
 
 # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
 
-  def __init__(self, shape, filter, features, forget_bias=1.0, input_size=None,
+  def __init__(self, shape, filter, stride, features, forget_bias=1.0, input_size=None,
                state_is_tuple=False, activation=tf.nn.tanh):
     """Initialize the basic CLSTM cell.
     Args:
       shape: int tuple of the height and width of the cell
       filter: int tuple of the height and width of the filter
+      stride: stride to use if doing convolution or deconvolution
       features: int of the depth of the cell 
       forget_bias: float, the bias added to forget gates (see above).
       input_size: Deprecated.
@@ -60,6 +61,7 @@ def __init__(self, shape, filter, features, forget_bias=1.0, input_size=None,
       logging.warn("%s: Input_size parameter is deprecated.", self)
     self.shape = shape 
     self.filter = filter
+    self.stride = stride
     self.features = features 
     self._forget_bias = forget_bias
     self._state_is_tuple = state_is_tuple
@@ -74,7 +76,7 @@ def state_size(self):
   def output_size(self):
     return self._num_units
 
-  def __call__(self, inputs, state, scope=None):
+  def __call__(self, inputs, state, typec='Conv', scope=None):
     """Long short-term memory cell (LSTM)."""
     # inputs: batchsize x clstmshape x clstmshape x clstmfeatures
     with tf.variable_scope(scope or type(self).__name__):
@@ -88,12 +90,12 @@ def __call__(self, inputs, state, scope=None):
 
       doclstm=1
       if doclstm==1:
-        concat = _convolve_linear([inputs, h], self.filter, self.features * 4, True)
+        concat = _convolve_linear([inputs, h], self.filter, self.stride, self.features * 4, typec, True)
         # http://colah.github.io/posts/2015-08-Understanding-LSTMs/
         # i = input_gate, j = new_input, f = forget_gate, o = output_gate (each with clstmfeatures features)
         i, j, f, o = tf.split(3, 4, concat)
       else:
-        # work in-progress
+        # TODO: work in-progress
         incat = tf.concat(3,args)
         # general W.x + b separately for each i,j,f,o
         #i = tf.matmul(incat,weightsi) + biasesi
@@ -103,9 +105,10 @@ def __call__(self, inputs, state, scope=None):
         
       # concat: batchsize x clstmshape x clstmshape x (clstmfeatures*4)
 
-
-      new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) *
-               self._activation(j))
+      # Hadamard (element-by-element) products (*)
+      # If stride!=1, then c will be different size than i,j,f,o, so next operation won't work.
+      new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) * self._activation(j))
+      # If stride!=1, then o has a different size than new_h needs to have (c and h must be the same size, both for packing/splitting them into one state and for feeding h back recurrently).
       new_h = self._activation(new_c) * tf.nn.sigmoid(o)
 
       if self._state_is_tuple:
@@ -114,11 +117,12 @@ def __call__(self, inputs, state, scope=None):
         new_state = tf.concat(3, [new_c, new_h])
       return new_h, new_state
 
-def _convolve_linear(args, filter, features, bias, bias_start=0.0, scope=None):
+def _convolve_linear(args, filter, stride, features, typec, bias, bias_start=0.0, scope=None):
   """convolution:
   Args:
     args: 4D Tensor or list of 4D, batch x n, Tensors.
     filter: int tuple of filter with height and width.
+    stride: stride for convolution
     features: int, as number of features.
     bias_start: starting value to initialize bias; 0 by default.
     scope: VariableScope for created subgraph; defaults to "Linear".
@@ -144,23 +148,49 @@ def _convolve_linear(args, filter, features, bias, bias_start=0.0, scope=None):
 
   dtype = [a.dtype for a in args][0]
 
-  # Computation
-  with tf.variable_scope(scope or "Conv"):
-    # setup weights as kernel x kernel x (input features = clstmfeatures*2) x (new features=clstmfeatures*4)
-    mat = tf.get_variable(
-        "Mat", [filter[0], filter[1], total_arg_size_depth, features], dtype=dtype)
-    if len(args) == 1:
-      res = tf.nn.conv2d(args[0], mat, strides=[1, 1, 1, 1], padding='SAME')
+  # concat
+  if len(args) == 1:
+    inputs = args[0]
+  else:
+    inputs=tf.concat(3, args)
+
+  # Conv
+  if typec=='Conv':
+    with tf.variable_scope(scope or "Conv"):
+      # setup weights as kernel x kernel x (input features = clstmfeatures*2) x (new features=clstmfeatures*4)
+      weights = tf.get_variable( "Weights", [filter[0], filter[1], total_arg_size_depth, features], dtype=dtype)
+      res = tf.nn.conv2d(inputs, weights, strides=[1, stride, stride, 1], padding='SAME')
+
+    # BIAS
+    if bias:
+      bias_term = tf.get_variable(
+        "Bias", [features],
+        dtype=dtype,
+        initializer=tf.constant_initializer(
+            bias_start, dtype=dtype))
     else:
+      bias_term = 0*res
+
+  # deConv
+  if typec=='deConv':
+    with tf.variable_scope(scope or "deConv"):
+      # setup weights as kernel x kernel x (new features=clstmfeatures*4) x (input features = clstmfeatures*2).
+      # i.e., 2nd arg to transpose version is [height, width, output_channels, in_channels], where last 2 are switched compared to normal conv2d
+      deweights = tf.get_variable( "deWeights", [filter[0], filter[1], features, total_arg_size_depth], dtype=dtype)
+      output_shape = tf.pack([tf.shape(inputs)[0], tf.shape(inputs)[1]*stride, tf.shape(inputs)[2]*stride, features]) 
       # first argument is batchsize x clstmshape x clstmshape x (2*clstmfeatures)
-      res = tf.nn.conv2d(tf.concat(3, args), mat, strides=[1, 1, 1, 1], padding='SAME')
       # res: batchsize x clstmshape x clstmshape x (clstmfeatures*4)
-    if not bias:
-      return res
-    bias_term = tf.get_variable(
-        "Bias", [features],
+      res = tf.nn.conv2d_transpose(inputs, deweights, output_shape, strides=[1, stride, stride, 1], padding='SAME')
+
+    # BIAS
+    if bias:
+      bias_term = tf.get_variable(
+        "deBias", [features],
         dtype=dtype,
         initializer=tf.constant_initializer(
             bias_start, dtype=dtype))
+    else:
+      bias_term = 0*res
+
   return res + bias_term
 
diff --git a/main.py b/main.py
index 82605ae..2bdad6d 100644
--- a/main.py
+++ b/main.py
@@ -87,6 +87,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
     clstminput=sizexy/cnnstrideproduct # must be evenly divisible
     clstmshape=[clstminput,clstminput]
     clstmkernel=[3,3]
+    clstmstride=1 # currently needs to be 1 unless implement tf.pad() or tf.nn.fractional_avg_pool()
     clstmfeatures=cnnfeatures[3] # same as features of last cnn layer fed into clstm
     #
     dcnnkernels=[1,3,3,3] # reasonably the reverse order of cnnkernels
@@ -112,12 +113,25 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
     #
     #
     x_pred = []
+
+    ####################
+    # Setup CLSTM
     with tf.variable_scope('clstm', initializer = tf.random_uniform_initializer(-.01, 0.1)):
       # input shape, kernel filter size, number of features
-      cell = clstm.clstm(clstmshape, clstmkernel, clstmfeatures)
+      convcell = clstm.clstm(clstmshape, clstmkernel, clstmstride, clstmfeatures)
       # state: batchsize x clstmshape x clstmshape x clstmfeatures
-      new_state = cell.set_zero_state(FLAGS.minibatch_size, tf.float32) 
+      new_state = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32) 
 
+      # Setup deCLSTM
+    with tf.variable_scope('declstm', initializer = tf.random_uniform_initializer(-.01, 0.1)):
+      # input shape, kernel filter size, number of features
+      deconvcell = clstm.clstm(clstmshape, clstmkernel, clstmstride, clstmfeatures)
+      # state: batchsize x clstmshape x clstmshape x clstmfeatures
+      denew_state = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32) 
+
+      
+
+    ########################
     # Create CNN-LSTM-dCNN for an input of input_seq_length-1 frames in n time for an output of input_seq_length-1 frames in n+1 time
     for i in xrange(FLAGS.input_seq_length-1):
 
@@ -137,13 +151,16 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
       # cnn4:
       cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
 
-      # lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state)
+      # Convolutional lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state)
       y_0 = cnn4 #y_0 should be same shape as first argument in clstm.clstm() above.
-      y_1, new_state = cell(y_0, new_state)
+      y_1, new_state = convcell(y_0, new_state, 'Conv')
+
+      # deConvolutional LSTM layer
+      y_2, denew_state = deconvcell(y_1, denew_state, 'deConv')
 
       # DECODE
       # cnn5
-      cnn5 = ld.dcnn2d_layer(y_1, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5")
+      cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5")
       # cnn6
       cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_6")
       # cnn7
@@ -168,8 +185,14 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
     # Create network to generate predicted video
     predictframes=50
 
+    ##############
+    # Setup CLSTM (initialize to zero, but same convcell as in other network)
     x_pred_long = []
-    new_state_pred = cell.set_zero_state(FLAGS.minibatch_size, tf.float32) 
+    new_state_pred = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
+    new_destate_pred = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
+
+    #######
+    # Setup long prediction network
     for i in xrange(predictframes):
 
       # ENCODE
@@ -185,13 +208,16 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
       # cnn4
       cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
 
-      # lstm layer
+      # Convolutional lstm layer
       y_0 = cnn4
-      y_1, new_state_pred = cell(y_0, new_state_pred)
+      y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv')
+
+      # deConvolutional lstm layer
+      y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv')
 
       # DECODE
       # cnn5
-      cnn5 = ld.dcnn2d_layer(y_1, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5")
+      cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5")
       # cnn6
       cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_6")
       # cnn7
@@ -310,7 +336,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
         normalnorm=np.sum(dat[0,0])
         print("normalnorm=%d" % (normalnorm))
-        print("L2 percent loss=%g" % 100.0*(np.sqrt(float(lossm))/float(normalnorm)))
+        print("L2 percent loss=%g \%" % 100.0*(np.sqrt(float(lossm))/float(normalnorm)))
       else:
         # track progress
         sys.stdout.write('.')

From 623a24f8c47de8399acf3eb0a5b0cfec80fdfc0a Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Thu, 2 Feb 2017 17:40:06 -0800
Subject: [PATCH 02/12] minor

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 2bdad6d..5fcd92c 100644
--- a/main.py
+++ b/main.py
@@ -336,7 +336,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
         normalnorm=np.sum(dat[0,0])
         print("normalnorm=%d" % (normalnorm))
-        print("L2 percent loss=%g \%" % 100.0*(np.sqrt(float(lossm))/float(normalnorm)))
+        print("L2 percent loss=%g" % (100.0*(np.sqrt(float(lossm))/float(normalnorm))))
       else:
         # track progress
         sys.stdout.write('.')

From a85fc902ce24f0c1a712e554fa5d762c2c9cd8d6 Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Fri, 3 Feb 2017 10:41:35 -0800
Subject: [PATCH 03/12] better L2 loss estimate

---
 README.md | 14 ++++++++++++++
 main.py   |  8 +++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 8cfc511..dc4588f 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,15 @@ smplayer out_all2_fast.mp4
 
 * Training Curve in Tensorflow (norm order 40): ![Alt text](https://github.com/pseudotensor/temporal_autoencoder/blob/master/lossexamples/loss_wheel.jpg "Training loss curve for wheel prediction vs. model.")
 
+Notes for wheel case:
+
+* Training on longer frame sequences works better for predicting further into the future
+
+* The loss seems to need to cover at least one full rotation for the model to predict well multiple frames into the future
+
+* The central part of the wheel diffuses even when the prediction is otherwise good.  Likely cause: lack of resolution
+
+
 
 # Parameters:
 
@@ -102,6 +111,7 @@ smplayer out_all2_fast.mp4
 2) In balls.py:
 
 * SIZE: size of ball's bounding box in pixels
+* omega: angular frequency of rotation for modeltype=1 (wheel type)
 
 
 # Ideas and Future Work:
@@ -114,6 +124,10 @@ smplayer out_all2_fast.mp4
 
 * Try different hyperparameters
 
+* Try multi-scale for space
+
+* Try multi-scale for time (to capture periods over long times)
+
 * Try Stacked Conv/Deconv LSTMs (https://arxiv.org/pdf/1506.04214v2.pdf and https://arxiv.org/pdf/1605.07157v4.pdf)
 
 * Try skip connections (https://arxiv.org/pdf/1605.07157v4.pdf)
diff --git a/main.py b/main.py
index 5fcd92c..4aad671 100644
--- a/main.py
+++ b/main.py
@@ -27,9 +27,9 @@
                             """directory to store checkpoints""")
 tf.app.flags.DEFINE_integer('sizexy', 32,
                             """size x and y dimensions for model, training, and prediction""")
-tf.app.flags.DEFINE_integer('input_seq_length', 10,
+tf.app.flags.DEFINE_integer('input_seq_length', 50,
                             """size of hidden layer""")
-tf.app.flags.DEFINE_integer('predict_frame_start', 5,
+tf.app.flags.DEFINE_integer('predict_frame_start', 25,
                             """ frame number, in zero-base counting, to start using prediction as output or next input""")
 tf.app.flags.DEFINE_integer('max_minibatches', 1000000,
                             """maximum number of mini-batches""")
@@ -236,6 +236,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
     # Setup loss Computation
     # Loss computes L2 for original sequence vs. predicted sequence over input_seq_length - (seq.start+1) frames
     # Compare x^{n+1} to xpred^n (that is supposed to be approximation to x^{n+1})
+    # x: batchsize, time steps, sizexy, sizexy, sizez
     loss = tf.nn.l2_loss(x[:,FLAGS.predict_frame_start+1:,:,:,:] - x_pred[:,:,:,:,:])
     #tf.scalar_summary('loss', loss)
     tf.summary.scalar('loss', loss)
@@ -334,7 +335,8 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
         print("step=%d nstep=%d" % (step,nstep))
         print("L2 loss=%g" % (lossm))
 
-        normalnorm=np.sum(dat[0,0])
+        #normalnorm=np.sum(dat[0,0])
+        normalnorm=np.sum(dat[0,FLAGS.predict_frame_start+1:,:,:,:])
         print("normalnorm=%d" % (normalnorm))
         print("L2 percent loss=%g" % (100.0*(np.sqrt(float(lossm))/float(normalnorm))))
       else:

From b11006ef9f88a8e806dcf00f3f796c71416c5a43 Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Fri, 3 Feb 2017 14:51:15 -0800
Subject: [PATCH 04/12] Not doing timeconv yet, still cleaning up stacked
 actually so graph more sensible.

---
 clstm.py  |   8 +-
 layers.py |  18 ++--
 main.py   | 313 ++++++++++++++++++++++++++++--------------------------
 3 files changed, 176 insertions(+), 163 deletions(-)

diff --git a/clstm.py b/clstm.py
index 1a23310..a04e8c0 100644
--- a/clstm.py
+++ b/clstm.py
@@ -5,7 +5,7 @@ class CRNNCell(object):
   """CRNN cell.
   """
 
-  def __call__(self, inputs, state, scope=None):
+  def __call__(self, inputs, state, typec='Conv', scope=None):
     """Run this RNN cell on inputs, starting from the inputted state.
     """
     raise NotImplementedError("Abstract method")
@@ -90,7 +90,7 @@ def __call__(self, inputs, state, typec='Conv', scope=None):
 
       doclstm=1
       if doclstm==1:
-        concat = _convolve_linear([inputs, h], self.filter, self.stride, self.features * 4, typec, True)
+        concat = _convolve_linear([inputs, h], self.filter, self.stride, self.features * 4, typec, True, scope=scope)
         # http://colah.github.io/posts/2015-08-Understanding-LSTMs/
         # i = input_gate, j = new_input, f = forget_gate, o = output_gate (each with clstmfeatures features)
         i, j, f, o = tf.split(3, 4, concat)
@@ -156,7 +156,7 @@ def _convolve_linear(args, filter, stride, features, typec, bias, bias_start=0.0
 
   # Conv
   if typec=='Conv':
-    with tf.variable_scope(scope or "Conv"):
+    with tf.variable_scope(scope):# or "Conv"):
       # setup weights as kernel x kernel x (input features = clstmfeatures*2) x (new features=clstmfeatures*4)
       weights = tf.get_variable( "Weights", [filter[0], filter[1], total_arg_size_depth, features], dtype=dtype)
       res = tf.nn.conv2d(inputs, weights, strides=[1, stride, stride, 1], padding='SAME')
@@ -173,7 +173,7 @@ def _convolve_linear(args, filter, stride, features, typec, bias, bias_start=0.0
 
   # deConv
   if typec=='deConv':
-    with tf.variable_scope(scope or "deConv"):
+    with tf.variable_scope(scope):# or "deConv"):
       # setup weights as kernel x kernel x (new features=clstmfeatures*4) x (input features = clstmfeatures*2).
       # i.e., 2nd arg to transpose version is [height, width, output_channels, in_channels], where last 2 are switched compared to normal conv2d
       deweights = tf.get_variable( "deWeights", [filter[0], filter[1], features, total_arg_size_depth], dtype=dtype)
diff --git a/layers.py b/layers.py
index 11d65c9..6608007 100644
--- a/layers.py
+++ b/layers.py
@@ -36,6 +36,9 @@ def _activation_summary(x):
   tf.histogram_summary(tensor_name + '/activations', x)
   tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
 
+  # used by cifar10 and inception in tensorflow for multi-GPU systems that have no P2P.
+  # But Titan X's have DMA P2P, so change to /gpu:0
+  #https://github.com/tensorflow/tensorflow/issues/4881
 def _variable_on_cpu(name, shape, initializer):
   """Helper to create a Variable stored on CPU memory.
 
@@ -47,7 +50,8 @@ def _variable_on_cpu(name, shape, initializer):
   Returns:
     Variable Tensor
   """
-  with tf.device('/cpu:0'):
+#  with tf.device('/cpu:0'):
+  with tf.device('/gpu:0'):
     var = tf.get_variable(name, shape, initializer=initializer)
   return var
 
@@ -77,6 +81,7 @@ def _variable_with_weight_decay(name, shape, stddev, wd):
   return var
 
 def cnn2d_layer(inputs, kernel, stride, features, idx, linear = False):
+  # below scope means this layer is shared for all calls unless idx is different.
   with tf.variable_scope('{0}_cnn'.format(idx)) as scope:
     input_channels = inputs.get_shape()[3] # rgb
 
@@ -91,11 +96,12 @@ def cnn2d_layer(inputs, kernel, stride, features, idx, linear = False):
     return cnn_rect
 
 def dcnn2d_layer(inputs, kernel, stride, features, idx, linear = False):
-  with tf.variable_scope('{0}_trans_cnn'.format(idx)) as scope:
+  # below scope means this layer is shared for all calls unless idx is different.
+  with tf.variable_scope('{0}_dcnn'.format(idx)) as scope:
     input_channels = inputs.get_shape()[3] # rgb
     
-    weights = _variable_with_weight_decay('weights', shape=[kernel,kernel,features,input_channels], stddev=0.01, wd=FLAGS.weight_decay)
-    biases = _variable_on_cpu('biases',[features],tf.constant_initializer(0.01))
+    weights = _variable_with_weight_decay('deweights', shape=[kernel,kernel,features,input_channels], stddev=0.01, wd=FLAGS.weight_decay)
+    biases = _variable_on_cpu('debiases',[features],tf.constant_initializer(0.01))
     batch_size = tf.shape(inputs)[0]
     output_shape = tf.pack([tf.shape(inputs)[0], tf.shape(inputs)[1]*stride, tf.shape(inputs)[2]*stride, features]) 
     dcnn = tf.nn.conv2d_transpose(inputs, weights, output_shape, strides=[1,stride,stride,1], padding='SAME')
@@ -116,8 +122,8 @@ def fc_layer(inputs, hiddens, idx, flat = False, linear = False):
       dim = input_shape[1]
       inputs_processed = inputs
     
-    weights = _variable_with_weight_decay('weights', shape=[dim,hiddens],stddev=FLAGS.weights_init, wd=FLAGS.weight_decay)
-    biases = _variable_on_cpu('biases', [hiddens], tf.constant_initializer(FLAGS.weights_init))
+    weights = _variable_with_weight_decay('fcweights', shape=[dim,hiddens],stddev=FLAGS.weights_init, wd=FLAGS.weight_decay)
+    biases = _variable_on_cpu('fcbiases', [hiddens], tf.constant_initializer(FLAGS.weights_init))
     if linear:
       return tf.add(tf.matmul(inputs_processed,weights),biases,name=str(idx)+'_fc')
   
diff --git a/main.py b/main.py
index 4aad671..7e91746 100644
--- a/main.py
+++ b/main.py
@@ -53,8 +53,9 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
     sizexy=FLAGS.sizexy
     # Number of rgb or depth estimation at t=0, but no convolution in this direction
     sizez=3
-    # x: minibatches x input_seq_length of frames x sizex x sizey x sizez(rgb)
-    x = tf.placeholder(tf.float32, [None, FLAGS.input_seq_length, sizexy, sizexy, sizez])
+    with tf.name_scope('input'):
+      # x: minibatches x input_seq_length of frames x sizex x sizey x sizez(rgb)
+      x = tf.placeholder(tf.float32, [None, FLAGS.input_seq_length, sizexy, sizexy, sizez])
 
     # Setup dropout
     hold_prob = tf.placeholder("float")
@@ -153,20 +154,20 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
       # Convolutional lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state)
       y_0 = cnn4 #y_0 should be same shape as first argument in clstm.clstm() above.
-      y_1, new_state = convcell(y_0, new_state, 'Conv')
+      y_1, new_state = convcell(y_0, new_state, 'Conv', 'clstm')
 
       # deConvolutional LSTM layer
-      y_2, denew_state = deconvcell(y_1, denew_state, 'deConv')
+      y_2, denew_state = deconvcell(y_1, denew_state, 'deConv', 'declstm')
 
       # DECODE
       # cnn5
-      cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5")
+      cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
       # cnn6
-      cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_6")
+      cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
       # cnn7
-      cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_7")
+      cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
       # x_1 (linear act)
-      x_1 = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_8", True)
+      x_1 = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
       if i >= FLAGS.predict_frame_start:
         # add predictive layer
         x_pred.append(x_1)
@@ -210,20 +211,20 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
       # Convolutional lstm layer
       y_0 = cnn4
-      y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv')
+      y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv', 'clstm')
 
       # deConvolutional lstm layer
-      y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv')
+      y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv', 'declstm')
 
       # DECODE
       # cnn5
-      cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5")
+      cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
       # cnn6
-      cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_6")
+      cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
       # cnn7
-      cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_7")
+      cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
       # x_1_pred (linear act)
-      x_1_pred = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_8", True)
+      x_1_pred = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
       if i >= FLAGS.predict_frame_start:
         x_pred_long.append(x_1_pred)
 
@@ -238,11 +239,15 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
     # Compare x^{n+1} to xpred^n (that is supposed to be approximation to x^{n+1})
     # x: batchsize, time steps, sizexy, sizexy, sizez
     loss = tf.nn.l2_loss(x[:,FLAGS.predict_frame_start+1:,:,:,:] - x_pred[:,:,:,:,:])
-    #tf.scalar_summary('loss', loss)
     tf.summary.scalar('loss', loss)
+    normalnorm=tf.nn.l2_loss(x[:,FLAGS.predict_frame_start+1:,:,:,:])
+    tf.summary.scalar('normalnorm', normalnorm)
+    ploss = tf.sqrt(10.0*loss/normalnorm)
+    tf.summary.scalar('ploss', ploss)
 
     # Set training method
-    train_operation = tf.train.AdamOptimizer(FLAGS.adamvar).minimize(loss)
+    with tf.name_scope('train'):
+      train_operation = tf.train.AdamOptimizer(FLAGS.adamvar).minimize(loss)
     
     # List of all Variables
     variables = tf.global_variables()
@@ -257,149 +262,151 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
     # Summary op
     #summary_op = tf.merge_all_summaries()
     summary_op = tf.summary.merge_all()
- 
-    # Initialize variables
-    init = tf.global_variables_initializer()
 
-    # Start session
-    sess = tf.Session()
 
-    # Initialize Network
-    if continuetrain==0:
-      print("Initialize network")
-      sess.run(init)
-    else:
-      print("load network")
-      # http://stackoverflow.com/questions/33759623/tensorflow-how-to-restore-a-previously-saved-model-python
-      #
-      # * means all if need specific format then *.csv
-      list_of_files = glob.glob(FLAGS.ckpt_dir + '/model.ckpt-*.meta')
-      if(len(list_of_files)==0):
+    with tf.Session() as sess:
+      # Initialize variables
+      init = tf.global_variables_initializer()
+
+      # Start session
+      sess = tf.Session()
+
+      # Initialize Network
+      if continuetrain==0:
         print("Initialize network")
         sess.run(init)
       else:
-        latest_file = max(list_of_files, key=os.path.getctime)
-        print("latest_file=%s" % (latest_file))
+        print("load network")
+        # http://stackoverflow.com/questions/33759623/tensorflow-how-to-restore-a-previously-saved-model-python
         #
-        checkpoint_path = latest_file
-        saver = tf.train.import_meta_graph(checkpoint_path)
-        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
-        all_vars = tf.get_collection('vars')
-        m = re.search('ckpt-([0-9]+).meta', latest_file)
-        nstep = int(m.group(1))
-        print("done loading network: nstep=%d" % (nstep))
-      
-    # Setup summary
-    summary_writer = tf.summary.FileWriter(FLAGS.ckpt_dir, sess.graph)
-
-    # Set number of model frames
-    #modelframes=FLAGS.input_seq_length+predictframes
-    modelframes=predictframes
-
-    # Set how often dump video to disk
-    howoftenvid=1000
-    # Set how often reports error to summary
-    howoftensummary=2000
-    # Set how often to write checkpoint file
-    howoftenckpt=2000
-
-    ###############
-    # Training Loop
-    startstep=nstep
-    for step in xrange(startstep,FLAGS.max_minibatches):
-      nstep=step
-
-      # Generate mini-batch
-      dat = md.generate_model_sample(FLAGS.minibatch_size, FLAGS.input_seq_length, FLAGS.sizexy, num_balls, modeltype)
-      
-      # Get model data for comparing to prediction if generating video
-      if nstep%howoftenvid == 0:
-        datmodel = md.generate_model_sample(1, modelframes, FLAGS.sizexy, num_balls, modeltype)
-        # Overwrite so consistent with ground truth for video output
-        dat[0,0:FLAGS.input_seq_length] = datmodel[0,0:FLAGS.input_seq_length]
-      
-      # Train on mini-batch
-      # Compute error in prediction vs. model and compute time of mini-batch task
-      t = time.time()
-      _, lossm = sess.run([train_operation, loss],feed_dict={x:dat, hold_prob:FLAGS.hold_prob})
-      elapsed = time.time() - t
-      assert not np.isnan(lossm), 'Model reached lossm = NaN'
-
-
-      # Store model and print-out loss
-      if nstep%howoftensummary == 0 and nstep != 0:
-        summary_str = sess.run(summary_op, feed_dict={x:dat, hold_prob:FLAGS.hold_prob})
-        summary_writer.add_summary(summary_str, nstep) 
-        print("")
-        print("time per batch is " + str(elapsed) + " seconds")
-        print("step=%d nstep=%d" % (step,nstep))
-        print("L2 loss=%g" % (lossm))
-
-        #normalnorm=np.sum(dat[0,0])
-        normalnorm=np.sum(dat[0,FLAGS.predict_frame_start+1:,:,:,:])
-        print("normalnorm=%d" % (normalnorm))
-        print("L2 percent loss=%g" % (100.0*(np.sqrt(float(lossm))/float(normalnorm))))
-      else:
-        # track progress
-        sys.stdout.write('.')
-        sys.stdout.flush()
-            
-
-      # Save checkpoint
-      if nstep%howoftenckpt == 0:
-        print("Saving checkpoint")
-        checkpoint_path = os.path.join(FLAGS.ckpt_dir, 'model.ckpt')
-        saver.save(sess, checkpoint_path, global_step=nstep)  
-        print("checkpoint saved to " + FLAGS.ckpt_dir)
-
-      # Output video of model and prediction for single video in mini-batch at this step
-      if nstep%howoftenvid == 0:
-
-        # Write model video (includes given and ground truth frames)
-        video_path = os.path.join(FLAGS.video_dir, '')
-
-        #http://stackoverflow.com/questions/10605163/opencv-videowriter-under-osx-producing-no-output
-        cc = cv2.cv.CV_FOURCC('m', 'p', '4', 'v') 
-        fps=4
-        sizevx=100
-        sizevy=100
-        sizevid=(sizevx, sizevy)
-
-        print("")
-        print("Writing model video")
-        video = cv2.VideoWriter()
-        success = video.open(video_path + "model_" + str(nstep) + ".mov", cc, fps, sizevid, True)
-        image = datmodel[0]
-        print(image.shape)
-        for i in xrange(modelframes):
-          x_1_r = np.uint8(np.minimum(1, np.maximum(image[i,:,:,:], 0)) * 255)
-          new_im = cv2.resize(x_1_r, (sizevx,sizevy))
-          video.write(new_im)
-        video.release()
-
-        # Write given + predicted video
-        print("Writing predicted video")
-        video = cv2.VideoWriter()
-        success = video.open(video_path + "clstm_" + str(nstep) + ".mov", cc, fps, sizevid, True)
-
-        # Preappend starting sequence
-        image = datmodel[0]
-        print(image.shape)
-        for i in xrange(FLAGS.predict_frame_start):
-          x_1_r = np.uint8(np.minimum(1,np.maximum(image[i,:,:,:], 0)) * 255)
-          new_im = cv2.resize(x_1_r, (sizevx,sizevy))
-          video.write(new_im)
-
-        # Append predicted video
-        dat_gif = dat
-        image = sess.run([x_pred_long],feed_dict={x:dat_gif, hold_prob:FLAGS.hold_prob})
-        image = image[0][0]
-        print(image.shape)
-        for i in xrange(modelframes - FLAGS.predict_frame_start):
-          x_1_r = np.uint8(np.minimum(1,np.maximum(image[i,:,:,:], 0)) * 255)
-          new_im = cv2.resize(x_1_r, (sizevx,sizevy))
-          video.write(new_im)
-        video.release()
+        # * means all if need specific format then *.csv
+        list_of_files = glob.glob(FLAGS.ckpt_dir + '/model.ckpt-*.meta')
+        if(len(list_of_files)==0):
+          print("Initialize network")
+          sess.run(init)
+        else:
+          latest_file = max(list_of_files, key=os.path.getctime)
+          print("latest_file=%s" % (latest_file))
+          #
+          checkpoint_path = latest_file
+          saver = tf.train.import_meta_graph(checkpoint_path)
+          saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
+          all_vars = tf.get_collection('vars')
+          m = re.search('ckpt-([0-9]+).meta', latest_file)
+          nstep = int(m.group(1))
+          print("done loading network: nstep=%d" % (nstep))
+
+      # Setup summary
+      summary_writer = tf.summary.FileWriter(FLAGS.ckpt_dir, sess.graph)
+
+      # Set number of model frames
+      #modelframes=FLAGS.input_seq_length+predictframes
+      modelframes=predictframes
+
+      # Set how often dump video to disk
+      howoftenvid=1000
+      # Set how often reports error to summary
+      howoftensummary=100
+      # Set how often to write checkpoint file
+      howoftenckpt=2000
+
+      ###############
+      # Training Loop
+      startstep=nstep
+      for step in xrange(startstep,FLAGS.max_minibatches):
+        nstep=step
+
+        # Generate mini-batch
+        dat = md.generate_model_sample(FLAGS.minibatch_size, FLAGS.input_seq_length, FLAGS.sizexy, num_balls, modeltype)
+
+        # Get model data for comparing to prediction if generating video
+        if nstep%howoftenvid == 0:
+          datmodel = md.generate_model_sample(1, modelframes, FLAGS.sizexy, num_balls, modeltype)
+          # Overwrite so consistent with ground truth for video output
+          dat[0,0:FLAGS.input_seq_length] = datmodel[0,0:FLAGS.input_seq_length]
+
+        # Train on mini-batch
+        # Compute error in prediction vs. model and compute time of mini-batch task
+        t = time.time()
+        _, lossm = sess.run([train_operation, loss],feed_dict={x:dat, hold_prob:FLAGS.hold_prob})
+        elapsed = time.time() - t
+        assert not np.isnan(lossm), 'Model reached lossm = NaN'
+
+
+        # Store model and print-out loss
+        if nstep%howoftensummary == 0:
+          summary_str = sess.run(summary_op, feed_dict={x:dat, hold_prob:FLAGS.hold_prob})
+          summary_writer.add_summary(summary_str, nstep) 
+          print("")
+          print("time per batch is " + str(elapsed) + " seconds")
+          print("step=%d nstep=%d" % (step,nstep))
+          print("L2 loss=%g" % (lossm))
+
+          #normalnorm=np.sum(dat[0,0])
+          normalnorm=np.sum(dat[0,FLAGS.predict_frame_start+1:,:,:,:])
+          print("normalnorm=%d" % (normalnorm))
+          print("L2 percent loss=%g" % (100.0*(np.sqrt(float(lossm))/float(normalnorm))))
+        else:
+          # track progress
+          sys.stdout.write('.')
+          sys.stdout.flush()
+
+
+        # Save checkpoint
+        if nstep%howoftenckpt == 0:
+          print("Saving checkpoint")
+          checkpoint_path = os.path.join(FLAGS.ckpt_dir, 'model.ckpt')
+          saver.save(sess, checkpoint_path, global_step=nstep)  
+          print("checkpoint saved to " + FLAGS.ckpt_dir)
+
+        # Output video of model and prediction for single video in mini-batch at this step
+        if nstep%howoftenvid == 0:
+
+          # Write model video (includes given and ground truth frames)
+          video_path = os.path.join(FLAGS.video_dir, '')
+
+          #http://stackoverflow.com/questions/10605163/opencv-videowriter-under-osx-producing-no-output
+          cc = cv2.cv.CV_FOURCC('m', 'p', '4', 'v') 
+          fps=4
+          sizevx=100
+          sizevy=100
+          sizevid=(sizevx, sizevy)
+
+          print("")
+          print("Writing model video")
+          video = cv2.VideoWriter()
+          success = video.open(video_path + "model_" + str(nstep) + ".mov", cc, fps, sizevid, True)
+          image = datmodel[0]
+          print(image.shape)
+          for i in xrange(modelframes):
+            x_1_r = np.uint8(np.minimum(1, np.maximum(image[i,:,:,:], 0)) * 255)
+            new_im = cv2.resize(x_1_r, (sizevx,sizevy))
+            video.write(new_im)
+          video.release()
+
+          # Write given + predicted video
+          print("Writing predicted video")
+          video = cv2.VideoWriter()
+          success = video.open(video_path + "clstm_" + str(nstep) + ".mov", cc, fps, sizevid, True)
+
+          # Preappend starting sequence
+          image = datmodel[0]
+          print(image.shape)
+          for i in xrange(FLAGS.predict_frame_start):
+            x_1_r = np.uint8(np.minimum(1,np.maximum(image[i,:,:,:], 0)) * 255)
+            new_im = cv2.resize(x_1_r, (sizevx,sizevy))
+            video.write(new_im)
+
+          # Append predicted video
+          dat_gif = dat
+          image = sess.run([x_pred_long],feed_dict={x:dat_gif, hold_prob:FLAGS.hold_prob})
+          image = image[0][0]
+          print(image.shape)
+          for i in xrange(modelframes - FLAGS.predict_frame_start):
+            x_1_r = np.uint8(np.minimum(1,np.maximum(image[i,:,:,:], 0)) * 255)
+            new_im = cv2.resize(x_1_r, (sizevx,sizevy))
+            video.write(new_im)
+          video.release()
 
 
 def main(argv=None):

From bed55b0cabe154bebaf13edfc0a75c8753b5b25a Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Fri, 3 Feb 2017 22:05:09 -0800
Subject: [PATCH 05/12] added peek

---
 clstm.py | 32 +++++++++++++++++++++++++++++---
 main.py  |  9 +++++----
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/clstm.py b/clstm.py
index a04e8c0..a79e467 100644
--- a/clstm.py
+++ b/clstm.py
@@ -5,7 +5,7 @@ class CRNNCell(object):
   """CRNN cell.
   """
 
-  def __call__(self, inputs, state, typec='Conv', scope=None):
+  def __call__(self, inputs, state, typec='Conv', dopeek=0, scope=None):
     """Run this RNN cell on inputs, starting from the inputted state.
     """
     raise NotImplementedError("Abstract method")
@@ -76,8 +76,13 @@ def state_size(self):
   def output_size(self):
     return self._num_units
 
-  def __call__(self, inputs, state, typec='Conv', scope=None):
+  def __call__(self, inputs, state, typec='Conv', dopeek=0, scope=None):
     """Long short-term memory cell (LSTM)."""
+
+    # whether to use peek on c
+    #dopeek=0
+
+
     # inputs: batchsize x clstmshape x clstmshape x clstmfeatures
     with tf.variable_scope(scope or type(self).__name__):
       # Parameters of gates are concatenated into one multiply for efficiency.
@@ -88,6 +93,7 @@ def __call__(self, inputs, state, typec='Conv', scope=None):
         c, h = tf.split(3, 2, state)
       # [inputs,h] is: 2 x batchsize x clstmshape x clstmshape x clstmfeatures
 
+
       doclstm=1
       if doclstm==1:
         concat = _convolve_linear([inputs, h], self.filter, self.stride, self.features * 4, typec, True, scope=scope)
@@ -96,12 +102,25 @@ def __call__(self, inputs, state, typec='Conv', scope=None):
         i, j, f, o = tf.split(3, 4, concat)
       else:
         # TODO: work in-progress
-        incat = tf.concat(3,args)
+        incat = tf.concat(3,[inputs, h])
         # general W.x + b separately for each i,j,f,o
         #i = tf.matmul(incat,weightsi) + biasesi
         #j = tf.matmul(incat,weightsj) + biasesj
         #f = tf.matmul(incat,weightsf) + biasesf
         #o = tf.matmul(incat,weightso) + biaseso
+
+
+      #https://github.com/tensorflow/tensorflow/issues/834
+      # https://arxiv.org/abs/1308.0850
+      # https://arxiv.org/pdf/1506.04214v2.pdf
+      if dopeek==1:
+        # setup weights same size as c, since element-wise multiplication
+        weights_ci = tf.get_variable( "Weights_ci", c.get_shape(), dtype=c.dtype)
+        i = i + c * weights_ci
+
+        weights_cf = tf.get_variable( "Weights_cf", c.get_shape(), dtype=c.dtype)
+        f = f + c * weights_cf
+
         
       # concat: batchsize x clstmshape x clstmshape x (clstmfeatures*4)
 
@@ -109,6 +128,13 @@ def __call__(self, inputs, state, typec='Conv', scope=None):
       # If stride!=1, then c will be different size than i,j,f,o, so next operation won't work.
       new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) * self._activation(j))
       # If stride!=1, then o different dimension than new_h needs to be. (because c and h need to be same size if packing/splitting them as well as recurrently needs to be same size)
+
+      if dopeek==1:
+        weights_co = tf.get_variable( "Weights_co", c.get_shape(), dtype=c.dtype)
+        o = o + new_c * weights_co
+        
+
+      
       new_h = self._activation(new_c) * tf.nn.sigmoid(o)
 
       if self._state_is_tuple:
diff --git a/main.py b/main.py
index 7e91746..54ff14b 100644
--- a/main.py
+++ b/main.py
@@ -85,6 +85,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
         testsize=testsize/cnnstrides[i]
     #
         
+    dopeek=1 # whether to peek as cell state when constructing gates
     clstminput=sizexy/cnnstrideproduct # must be evenly divisible
     clstmshape=[clstminput,clstminput]
     clstmkernel=[3,3]
@@ -154,10 +155,10 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
       # Convolutional lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state)
       y_0 = cnn4 #y_0 should be same shape as first argument in clstm.clstm() above.
-      y_1, new_state = convcell(y_0, new_state, 'Conv', 'clstm')
+      y_1, new_state = convcell(y_0, new_state, 'Conv', dopeek, 'clstm')
 
       # deConvolutional LSTM layer
-      y_2, denew_state = deconvcell(y_1, denew_state, 'deConv', 'declstm')
+      y_2, denew_state = deconvcell(y_1, denew_state, 'deConv', dopeek, 'declstm')
 
       # DECODE
       # cnn5
@@ -211,10 +212,10 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
       # Convolutional lstm layer
       y_0 = cnn4
-      y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv', 'clstm')
+      y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv', dopeek, 'clstm')
 
       # deConvolutional lstm layer
-      y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv', 'declstm')
+      y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv', dopeek, 'declstm')
 
       # DECODE
       # cnn5

From 1166ad32d8b251fc37c54c78b5225bde481b64d5 Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Sat, 4 Feb 2017 19:00:54 -0800
Subject: [PATCH 06/12] initial import

---
 README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/README.md b/README.md
index dc4588f..8df1075 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,17 @@ https://arxiv.org/abs/1502.04681
 https://arxiv.org/abs/1605.07157
 http://www.ri.cmu.edu/pub_files/2014/3/egpaper_final.pdf
 
+# Papers on temporal convolution
+
+http://machinelearning.wustl.edu/mlpapers/paper_files/icml2010_JiXYY10.pdf
+https://arxiv.org/pdf/1506.01911.pdf
+http://link.springer.com/chapter/10.1007/978-3-642-25446-8_4#page-1
+https://ai2-s2-pdfs.s3.amazonaws.com/12b6/551a0f9f5aa62f7d37f03ebc66631e529c4b.pdf
+http://vision.stanford.edu/pdf/karpathy14.pdf
+https://arxiv.org/abs/1412.0767
+https://arxiv.org/abs/1511.06432
+
+
 # Uses parts of (or inspired by) the following repos:
 
 https://github.com/tensorflow/models/blob/master/real_nvp/real_nvp_utils.py

From 3cfc56566931ad592eef7a9b7ddbb32f3346819c Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Tue, 7 Feb 2017 00:54:27 -0800
Subject: [PATCH 07/12] compute number of parameters, separate checkpoint from
 loss output

---
 README.md |  9 ++++++---
 main.py   | 28 ++++++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 8df1075..497883f 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ https://ai2-s2-pdfs.s3.amazonaws.com/12b6/551a0f9f5aa62f7d37f03ebc66631e529c4b.p
 http://vision.stanford.edu/pdf/karpathy14.pdf
 https://arxiv.org/abs/1412.0767
 https://arxiv.org/abs/1511.06432
-
+And do 3D convolutions in tensorflow: https://www.tensorflow.org/api_docs/python/nn/convolution
 
 # Uses parts of (or inspired by) the following repos:
 
@@ -135,7 +135,7 @@ Notes for wheel case:
 
 * Try different hyperparameters
 
-* Try multi-scale for space
+* Try multi-scale for space (http://vision.stanford.edu/pdf/karpathy14.pdf)
 
 * Try multi-scale for time (to capture periods over long times)
 
@@ -143,7 +143,9 @@ Notes for wheel case:
 
 * Try skip connections (https://arxiv.org/pdf/1605.07157v4.pdf)
 
-* Try temporal convolution
+* Try feed-backward connections in time (like implicit stepping vs. explicit forward Euler stepping).  Pass the cell state backwards, along the diagonal in space-time. https://arxiv.org/pdf/1506.01911.pdf
+
+* Try temporal convolutions.  Can factorize with spatial,temporal,spatial, etc. to reduce parameters
 
 * Try other LSTM architectures (C-peek, bind forget-recall, GRU, etc.)
 
@@ -156,6 +158,7 @@ http://blog.aylien.com/introduction-generative-adversarial-networks-code-tensorf
 http://blog.aylien.com/introduction-generative-adversarial-networks-code-tensorflow/ (pytorch)
 http://blog.aylien.com/introduction-generative-adversarial-networks-code-tensorflow/
 https://arxiv.org/pdf/1511.05644v2.pdf
+https://arxiv.org/pdf/1612.00005v1.pdf
 
 * Try more depth in time
 
diff --git a/main.py b/main.py
index 54ff14b..702c55b 100644
--- a/main.py
+++ b/main.py
@@ -43,6 +43,23 @@
 
 
 
+def total_parameters():
+  total_parameters = 0
+  for variable in tf.trainable_variables():
+      # shape is an array of tf.Dimension
+      shape = variable.get_shape()
+      #print(shape)
+      #print(len(shape))
+      variable_parametes = 1
+      for dim in shape:
+          print(dim)
+          variable_parametes *= dim.value
+      #print(variable_parametes)
+      total_parameters += variable_parametes
+  print("total_parameters=%d" % (total_parameters))
+  
+
+
 # Function to train autoencoder network
 def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
@@ -311,6 +328,9 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
       # Set how often to write checkpoint file
       howoftenckpt=2000
 
+      # count and output total number of model/graph parameters
+      total_parameters()
+
       ###############
       # Training Loop
       startstep=nstep
@@ -334,7 +354,12 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
         assert not np.isnan(lossm), 'Model reached lossm = NaN'
 
 
-        # Store model and print-out loss
+        # Store model
+        if nstep%howoftensummary == 0 and nstep!=0:
+          summary_str = sess.run(summary_op, feed_dict={x:dat, hold_prob:FLAGS.hold_prob})
+          summary_writer.add_summary(summary_str, nstep)
+          
+        # Print-out loss
         if nstep%howoftensummary == 0:
           summary_str = sess.run(summary_op, feed_dict={x:dat, hold_prob:FLAGS.hold_prob})
           summary_writer.add_summary(summary_str, nstep) 
@@ -444,4 +469,3 @@ def main(argv=None):
 if __name__ == '__main__':
   tf.app.run()
 
-

From f7ee1e5325797173cc26e132f152f969839f9975 Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Sat, 25 Feb 2017 19:56:50 -0800
Subject: [PATCH 08/12] tensorflow 1.0 upgrade script

---
 README.md |  1 +
 clstm.py  | 12 ++++++------
 layers.py |  8 ++++----
 main.py   |  4 ++--
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 497883f..b0f7a9a 100644
--- a/README.md
+++ b/README.md
@@ -159,6 +159,7 @@ http://blog.aylien.com/introduction-generative-adversarial-networks-code-tensorf
 http://blog.aylien.com/introduction-generative-adversarial-networks-code-tensorflow/
 https://arxiv.org/pdf/1511.05644v2.pdf
 https://arxiv.org/pdf/1612.00005v1.pdf
+https://arxiv.org/pdf/1701.06547.pdf
 
 * Try more depth in time
 
diff --git a/clstm.py b/clstm.py
index a79e467..bfffd08 100644
--- a/clstm.py
+++ b/clstm.py
@@ -90,7 +90,7 @@ def __call__(self, inputs, state, typec='Conv', dopeek=0, scope=None):
         c, h = state
       else:
         # c and h are each batchsize x clstmshape x clstmshape x clstmfeatures
-        c, h = tf.split(3, 2, state)
+        c, h = tf.split(axis=3, num_or_size_splits=2, value=state)
       # [inputs,h] is: 2 x batchsize x clstmshape x clstmshape x clstmfeatures
 
 
@@ -99,10 +99,10 @@ def __call__(self, inputs, state, typec='Conv', dopeek=0, scope=None):
         concat = _convolve_linear([inputs, h], self.filter, self.stride, self.features * 4, typec, True, scope=scope)
         # http://colah.github.io/posts/2015-08-Understanding-LSTMs/
         # i = input_gate, j = new_input, f = forget_gate, o = output_gate (each with clstmfeatures features)
-        i, j, f, o = tf.split(3, 4, concat)
+        i, j, f, o = tf.split(axis=3, num_or_size_splits=4, value=concat)
       else:
         # TODO: work in-progress
-        incat = tf.concat(3,[inputs, h])
+        incat = tf.concat(axis=3,values=[inputs, h])
         # general W.x + b separately for each i,j,f,o
         #i = tf.matmul(incat,weightsi) + biasesi
         #j = tf.matmul(incat,weightsj) + biasesj
@@ -140,7 +140,7 @@ def __call__(self, inputs, state, typec='Conv', dopeek=0, scope=None):
       if self._state_is_tuple:
         new_state = LSTMStateTuple(new_c, new_h)
       else:
-        new_state = tf.concat(3, [new_c, new_h])
+        new_state = tf.concat(axis=3, values=[new_c, new_h])
       return new_h, new_state
 
 def _convolve_linear(args, filter, stride, features, typec, bias, bias_start=0.0, scope=None):
@@ -178,7 +178,7 @@ def _convolve_linear(args, filter, stride, features, typec, bias, bias_start=0.0
   if len(args) == 1:
     inputs = args[0]
   else:
-    inputs=tf.concat(3, args)
+    inputs=tf.concat(axis=3, values=args)
 
   # Conv
   if typec=='Conv':
@@ -203,7 +203,7 @@ def _convolve_linear(args, filter, stride, features, typec, bias, bias_start=0.0
       # setup weights as kernel x kernel x (new features=clstmfeatures*4) x (input features = clstmfeatures*2).
       # i.e., 2nd arg to transpose version is [height, width, output_channels, in_channels], where last 2 are switched compared to normal conv2d
       deweights = tf.get_variable( "deWeights", [filter[0], filter[1], features, total_arg_size_depth], dtype=dtype)
-      output_shape = tf.pack([tf.shape(inputs)[0], tf.shape(inputs)[1]*stride, tf.shape(inputs)[2]*stride, features]) 
+      output_shape = tf.stack([tf.shape(inputs)[0], tf.shape(inputs)[1]*stride, tf.shape(inputs)[2]*stride, features]) 
       # first argument is batchsize x clstmshape x clstmshape x (2*clstmfeatures)
       # res: batchsize x clstmshape x clstmshape x (clstmfeatures*4)
       res = tf.nn.conv2d_transpose(inputs, deweights, output_shape, strides=[1, stride, stride, 1], padding='SAME')
diff --git a/layers.py b/layers.py
index 6608007..f274551 100644
--- a/layers.py
+++ b/layers.py
@@ -33,8 +33,8 @@ def _activation_summary(x):
     nothing
   """
   tensor_name = x.op.name
-  tf.histogram_summary(tensor_name + '/activations', x)
-  tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
+  tf.summary.histogram(tensor_name + '/activations', x)
+  tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
 
   # used by cifar10 and inception in tensorflow for multi-GPU systems that have no P2P.
   # But Titan X's have DMA P2P, so change to /gpu:0
@@ -75,7 +75,7 @@ def _variable_with_weight_decay(name, shape, stddev, wd):
   var = _variable_on_cpu(name, shape,
                          tf.truncated_normal_initializer(stddev=stddev))
   if wd:
-    weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
+    weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
     weight_decay.set_shape([])
     tf.add_to_collection('losses', weight_decay)
   return var
@@ -103,7 +103,7 @@ def dcnn2d_layer(inputs, kernel, stride, features, idx, linear = False):
     weights = _variable_with_weight_decay('deweights', shape=[kernel,kernel,features,input_channels], stddev=0.01, wd=FLAGS.weight_decay)
     biases = _variable_on_cpu('debiases',[features],tf.constant_initializer(0.01))
     batch_size = tf.shape(inputs)[0]
-    output_shape = tf.pack([tf.shape(inputs)[0], tf.shape(inputs)[1]*stride, tf.shape(inputs)[2]*stride, features]) 
+    output_shape = tf.stack([tf.shape(inputs)[0], tf.shape(inputs)[1]*stride, tf.shape(inputs)[2]*stride, features]) 
     dcnn = tf.nn.conv2d_transpose(inputs, weights, output_shape, strides=[1,stride,stride,1], padding='SAME')
     dcnn_biased = tf.nn.bias_add(dcnn, biases)
     if linear:
diff --git a/main.py b/main.py
index 702c55b..f1741fd 100644
--- a/main.py
+++ b/main.py
@@ -195,7 +195,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
     # Pack-up predictive layer's results
     # e.g. for input_seq_length=10 loop 0..9, had put into x_pred i=5,6,7,8,9 (i.e. 5 frame prediction)
-    x_pred = tf.pack(x_pred)
+    x_pred = tf.stack(x_pred)
     # reshape so in order of minibatch x frame x sizex x sizey x rgb
     x_pred = tf.transpose(x_pred, [1,0,2,3,4])
     
@@ -247,7 +247,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
         x_pred_long.append(x_1_pred)
 
     # Pack-up predicted layer's results
-    x_pred_long = tf.pack(x_pred_long)
+    x_pred_long = tf.stack(x_pred_long)
     x_pred_long = tf.transpose(x_pred_long, [1,0,2,3,4])
 
 

From ba863a4890de1f4a84d9e4d9271f40f8f5e89be7 Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Thu, 2 Mar 2017 12:00:27 -0800
Subject: [PATCH 09/12] work around for bug in tf1.0.0

---
 README.md |   7 +-
 layers.py |   5 +-
 main.py   | 205 +++++++++++++++++++++++++++---------------------------
 3 files changed, 111 insertions(+), 106 deletions(-)

diff --git a/README.md b/README.md
index b0f7a9a..dd75564 100644
--- a/README.md
+++ b/README.md
@@ -64,13 +64,16 @@ cs231n.stanford.edu/reports2016/223_Report.pdf
 
 # Program Requirements:
 
-* Tensorflow and related packages like python
-* OpenCV
+* Tensorflow(1.0)
+* Python(2.7)
+* OpenCV(2)
 
 # Post-Processing requirements
 
 * avconv, mencoder, MP4Box,smplayer
 
+sudo apt-get install imagemagick libav-tools mencoder gpac
+
 
 # How to run:
 
diff --git a/layers.py b/layers.py
index f274551..42755ab 100644
--- a/layers.py
+++ b/layers.py
@@ -50,8 +50,8 @@ def _variable_on_cpu(name, shape, initializer):
   Returns:
     Variable Tensor
   """
-#  with tf.device('/cpu:0'):
-  with tf.device('/gpu:0'):
+  with tf.device('/cpu:0'):
+#  with tf.device('/gpu:0'):
     var = tf.get_variable(name, shape, initializer=initializer)
   return var
 
@@ -102,6 +102,7 @@ def dcnn2d_layer(inputs, kernel, stride, features, idx, linear = False):
     
     weights = _variable_with_weight_decay('deweights', shape=[kernel,kernel,features,input_channels], stddev=0.01, wd=FLAGS.weight_decay)
     biases = _variable_on_cpu('debiases',[features],tf.constant_initializer(0.01))
+
     batch_size = tf.shape(inputs)[0]
     output_shape = tf.stack([tf.shape(inputs)[0], tf.shape(inputs)[1]*stride, tf.shape(inputs)[2]*stride, features]) 
     dcnn = tf.nn.conv2d_transpose(inputs, weights, output_shape, strides=[1,stride,stride,1], padding='SAME')
diff --git a/main.py b/main.py
index f1741fd..23d41bd 100644
--- a/main.py
+++ b/main.py
@@ -52,7 +52,7 @@ def total_parameters():
       #print(len(shape))
       variable_parametes = 1
       for dim in shape:
-          print(dim)
+          #print(dim)
           variable_parametes *= dim.value
       #print(variable_parametes)
       total_parameters += variable_parametes
@@ -149,106 +149,106 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
       denew_state = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32) 
 
       
-
-    ########################
-    # Create CNN-LSTM-dCNN for an input of input_seq_length-1 frames in n time for an output of input_seq_length-1 frames in n+1 time
-    for i in xrange(FLAGS.input_seq_length-1):
-
-      # ENCODE
-      # CNN: (name, 2D square kernel filter size, stride for spatial domain, number of feature maps, name) using ELUs
-      # cnn1:
-      if i < FLAGS.predict_frame_start:
-        # only dropout on training layers
-        cnn1 = ld.cnn2d_layer(x_dropout[:,i,:,:,:], cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
-      else:
-        # direct input of prior output for predictive layers
-        cnn1 = ld.cnn2d_layer(x_1, cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
-      # cnn2:
-      cnn2 = ld.cnn2d_layer(cnn1, cnnkernels[1], cnnstrides[1], cnnfeatures[1], "cnn_2")
-      # cnn3:
-      cnn3 = ld.cnn2d_layer(cnn2, cnnkernels[2], cnnstrides[2], cnnfeatures[2], "cnn_3")
-      # cnn4:
-      cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
-
-      # Convolutional lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state)
-      y_0 = cnn4 #y_0 should be same shape as first argument in clstm.clstm() above.
-      y_1, new_state = convcell(y_0, new_state, 'Conv', dopeek, 'clstm')
-
-      # deConvolutional LSTM layer
-      y_2, denew_state = deconvcell(y_1, denew_state, 'deConv', dopeek, 'declstm')
-
-      # DECODE
-      # cnn5
-      cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
-      # cnn6
-      cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
-      # cnn7
-      cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
-      # x_1 (linear act)
-      x_1 = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
-      if i >= FLAGS.predict_frame_start:
-        # add predictive layer
-        x_pred.append(x_1)
-      # set reuse to true after first go
-      if i == 0:
-        tf.get_variable_scope().reuse_variables()
-
-    # Pack-up predictive layer's results
-    # e.g. for input_seq_length=10 loop 0..9, had put into x_pred i=5,6,7,8,9 (i.e. 5 frame prediction)
-    x_pred = tf.stack(x_pred)
-    # reshape so in order of minibatch x frame x sizex x sizey x rgb
-    x_pred = tf.transpose(x_pred, [1,0,2,3,4])
-    
-
-    #######################################################
-    # Create network to generate predicted video
-    predictframes=50
-
-    ##############
-    # Setup CLSTM (initialize to zero, but same convcell as in other network)
-    x_pred_long = []
-    new_state_pred = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
-    new_destate_pred = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
-
-    #######
-    # Setup long prediction network
-    for i in xrange(predictframes):
-
-      # ENCODE
-      # cnn1
-      if i < FLAGS.predict_frame_start: # use known sequence for this many frames
-        cnn1 = ld.cnn2d_layer(x[:,i,:,:,:], cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
-      else: # use generated sequence for rest of frames
-        cnn1 = ld.cnn2d_layer(x_1_pred, cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
-      # cnn2
-      cnn2 = ld.cnn2d_layer(cnn1, cnnkernels[1], cnnstrides[1], cnnfeatures[1], "cnn_2")
-      # cnn3
-      cnn3 = ld.cnn2d_layer(cnn2, cnnkernels[2], cnnstrides[2], cnnfeatures[2], "cnn_3")
-      # cnn4
-      cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
-
-      # Convolutional lstm layer
-      y_0 = cnn4
-      y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv', dopeek, 'clstm')
-
-      # deConvolutional lstm layer
-      y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv', dopeek, 'declstm')
-
-      # DECODE
-      # cnn5
-      cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
-      # cnn6
-      cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
-      # cnn7
-      cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
-      # x_1_pred (linear act)
-      x_1_pred = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
-      if i >= FLAGS.predict_frame_start:
-        x_pred_long.append(x_1_pred)
-
-    # Pack-up predicted layer's results
-    x_pred_long = tf.stack(x_pred_long)
-    x_pred_long = tf.transpose(x_pred_long, [1,0,2,3,4])
+    with tf.variable_scope('graph'):
+      ########################
+      # Create CNN-LSTM-dCNN for an input of input_seq_length-1 frames in n time for an output of input_seq_length-1 frames in n+1 time
+      for i in xrange(FLAGS.input_seq_length-1):
+
+        # ENCODE
+        # CNN: (name, 2D square kernel filter size, stride for spatial domain, number of feature maps, name) using ELUs
+        # cnn1:
+        if i < FLAGS.predict_frame_start:
+          # only dropout on training layers
+          cnn1 = ld.cnn2d_layer(x_dropout[:,i,:,:,:], cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
+        else:
+          # direct input of prior output for predictive layers
+          cnn1 = ld.cnn2d_layer(x_1, cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
+        # cnn2:
+        cnn2 = ld.cnn2d_layer(cnn1, cnnkernels[1], cnnstrides[1], cnnfeatures[1], "cnn_2")
+        # cnn3:
+        cnn3 = ld.cnn2d_layer(cnn2, cnnkernels[2], cnnstrides[2], cnnfeatures[2], "cnn_3")
+        # cnn4:
+        cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
+
+        # Convolutional lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state)
+        y_0 = cnn4 #y_0 should be same shape as first argument in clstm.clstm() above.
+        y_1, new_state = convcell(y_0, new_state, 'Conv', dopeek, 'clstm')
+
+        # deConvolutional LSTM layer
+        y_2, denew_state = deconvcell(y_1, denew_state, 'deConv', dopeek, 'declstm')
+
+        # DECODE
+        # cnn5
+        cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
+        # cnn6
+        cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
+        # cnn7
+        cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
+        # x_1 (linear act)
+        x_1 = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
+        if i >= FLAGS.predict_frame_start:
+          # add predictive layer
+          x_pred.append(x_1)
+        # set reuse to true after first go
+        if i == 0:
+          tf.get_variable_scope().reuse_variables()
+
+      # Pack-up predictive layer's results
+      # e.g. for input_seq_length=10 loop 0..9, had put into x_pred i=5,6,7,8,9 (i.e. 5 frame prediction)
+      x_pred = tf.stack(x_pred)
+      # reshape so in order of minibatch x frame x sizex x sizey x rgb
+      x_pred = tf.transpose(x_pred, [1,0,2,3,4])
+
+
+      #######################################################
+      # Create network to generate predicted video
+      predictframes=50
+
+      ##############
+      # Setup CLSTM (initialize to zero, but same convcell as in other network)
+      x_pred_long = []
+      new_state_pred = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
+      new_destate_pred = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
+
+      #######
+      # Setup long prediction network
+      for i in xrange(predictframes):
+
+        # ENCODE
+        # cnn1
+        if i < FLAGS.predict_frame_start: # use known sequence for this many frames
+          cnn1 = ld.cnn2d_layer(x[:,i,:,:,:], cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
+        else: # use generated sequence for rest of frames
+          cnn1 = ld.cnn2d_layer(x_1_pred, cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
+        # cnn2
+        cnn2 = ld.cnn2d_layer(cnn1, cnnkernels[1], cnnstrides[1], cnnfeatures[1], "cnn_2")
+        # cnn3
+        cnn3 = ld.cnn2d_layer(cnn2, cnnkernels[2], cnnstrides[2], cnnfeatures[2], "cnn_3")
+        # cnn4
+        cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
+
+        # Convolutional lstm layer
+        y_0 = cnn4
+        y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv', dopeek, 'clstm')
+
+        # deConvolutional lstm layer
+        y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv', dopeek, 'declstm')
+
+        # DECODE
+        # cnn5
+        cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
+        # cnn6
+        cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
+        # cnn7
+        cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
+        # x_1_pred (linear act)
+        x_1_pred = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
+        if i >= FLAGS.predict_frame_start:
+          x_pred_long.append(x_1_pred)
+
+      # Pack-up predicted layer's results
+      x_pred_long = tf.stack(x_pred_long)
+      x_pred_long = tf.transpose(x_pred_long, [1,0,2,3,4])
 
 
     #######################################################
@@ -263,10 +263,11 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
     ploss = tf.sqrt(10.0*loss/normalnorm)
     tf.summary.scalar('ploss', ploss)
 
+    
     # Set training method
     with tf.name_scope('train'):
       train_operation = tf.train.AdamOptimizer(FLAGS.adamvar).minimize(loss)
-    
+     
     # List of all Variables
     variables = tf.global_variables()
 

From 648cf0b6e796298e2ac8b527c7d3a7952433a8fa Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Thu, 9 Mar 2017 17:28:37 -0800
Subject: [PATCH 10/12] multiple gpus

---
 README.md |  11 +-
 main.py   | 631 ++++++++++++++++++++++++++++++++++--------------------
 2 files changed, 402 insertions(+), 240 deletions(-)

diff --git a/README.md b/README.md
index dd75564..b0be50c 100644
--- a/README.md
+++ b/README.md
@@ -77,14 +77,15 @@ sudo apt-get install Imagemagick avconv libav-tools mencoder MP4Box
 
 # How to run:
 
-python main.py
+python main.py --num_gpus=<number of gpus>
 
-Post-processing: making model vs. predicted video:
+where <number of gpus> can be any value from 1 up to the number of GPUs installed on the node.
+
+# Post-processing: making model vs. predicted video:
 
 sh mergemov.sh
 
 smplayer out_all.mp4
-or
 smplayer out_all2_fast.mp4
 
 # Some training results:
@@ -117,10 +118,6 @@ Notes for wheel case:
 1) In main.py:
 
 * Choose global flags
-* In main():
-  * Choose to use checkpoints (if exist) or not: continuetrain
-  * type of model: modeltype
-  * number of balls: num_balls
 
 2) In balls.py:
 
diff --git a/main.py b/main.py
index 23d41bd..20c8097 100644
--- a/main.py
+++ b/main.py
@@ -27,10 +27,14 @@
                             """directory to store checkpoints""")
 tf.app.flags.DEFINE_integer('sizexy', 32,
                             """size x and y dimensions for model, training, and prediction""")
+tf.app.flags.DEFINE_integer('sizez', 3,
+                            """size z for rgb or any other such information""")
 tf.app.flags.DEFINE_integer('input_seq_length', 50,
                             """size of hidden layer""")
 tf.app.flags.DEFINE_integer('predict_frame_start', 25,
                             """ frame number, in zero-base counting, to start using prediction as output or next input""")
+tf.app.flags.DEFINE_integer('predictframes', 50,
+                            """number of frames to predict""")
 tf.app.flags.DEFINE_integer('max_minibatches', 1000000,
                             """maximum number of mini-batches""")
 tf.app.flags.DEFINE_float('hold_prob', .8,
@@ -39,8 +43,20 @@
                             """adamvar for dropout""")
 tf.app.flags.DEFINE_integer('minibatch_size', 16,
                             """mini-batch size""")
-
-
+tf.app.flags.DEFINE_integer('num_balls', 1,
+                            """How many balls to model.""")
+# Choose which model to work on
+# 0 = classic bouncing balls
+# 1 = rotating "ball"
+tf.app.flags.DEFINE_integer('modeltype', 1,
+                            """Type of model.""")
+tf.app.flags.DEFINE_integer('num_gpus', 1,
+                            """How many GPUs to use.""")
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+                            """Whether to log device placement.""")
+
+tf.app.flags.DEFINE_integer('continuetrain', 1,
+                            """Whether to continue to train (1, default) or not (0).""")
 
 
 def total_parameters():
@@ -59,218 +75,347 @@ def total_parameters():
   print("total_parameters=%d" % (total_parameters))
   
 
+def tower_loss(x,x_dropout,scope):
+  """Calculate the total loss on a single tower running the model.
+
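+  Args:
+    x: input frame sequence; x_dropout: version of x fed to the encoder (presumably x with dropout applied by the caller)
+    scope: unique prefix string identifying the tower, e.g. 'tower0'
+  Returns:
+     Tuple of tensors (loss, normalnorm, ploss, x_pred, x_pred_long)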
+  """
+
+  #######################################################
+  # Create network to train
+  #
+  # Setup inputs
+  # size of balls in x-y directions each (same)
+  sizexy=FLAGS.sizexy
+  # Number of rgb or depth estimation at t=0, but no convolution in this direction
+  sizez=FLAGS.sizez
+
+
+  cnnkernels=[3,3,3,1]
+  cnnstrides=[2,1,2,1]
+  cnnstrideproduct=np.product(cnnstrides)
+  cnnfeatures=[8,8,8,4]
+  #
+  # check strides are acceptable
+  testsize=sizexy
+  for i in xrange(len(cnnstrides)):
+    if testsize % cnnstrides[i] !=0:
+      print("sizexy must be evenly divisible by each stride, in order to keep input to cnn or dcnn an integer number of pixels")
+      exit
+    else:
+      testsize=testsize/cnnstrides[i]
+  #
+
+  dopeek=1 # whether to peek as cell state when constructing gates
+  clstminput=sizexy/cnnstrideproduct # must be evenly divisible
+  clstmshape=[clstminput,clstminput]
+  clstmkernel=[3,3]
+  clstmstride=1 # currently needs to be 1 unless implement tf.pad() or tf.nn.fractional_avg_pool()
+  clstmfeatures=cnnfeatures[3] # same as features of last cnn layer fed into clstm
+  #
+  dcnnkernels=[1,3,3,3] # reasonably the reverse order of cnnkernels
+  dcnnstrides=[1,2,1,2] # reasonably the reverse order of cnnstrides
+  dcnnstrideproduct=np.product(dcnnstrides)
+  # last dcnn feature is rgb again
+  dcnnfeatures=[8,8,8,sizez] # reasonably the reverse order of cnnfeatures, except last cnnfeatures and last dcnnfeatures (note, features are for produced object, while kernels and strides operate on current object, hence apparent shift)
+  #
+  # check d-strides are acceptable
+  testsize=sizexy
+  for i in xrange(len(dcnnstrides)):
+    if testsize % dcnnstrides[i] !=0:
+      print("sizexy must be evenly divisible by each d-stride, in order to keep input to cnn or dcnn an integer number of pixels")
+      exit
+    else:
+      testsize=testsize/dcnnstrides[i]
+  #
+  # ensure strides cumulate to same total product so input and output same size, because we feed output back as input
+  if dcnnstrideproduct!=cnnstrideproduct:
+    print("cnn and dcnn strides must match for creating input size and output same size");
+    exit
+  #
+  #
+  #
+
+
+
+
+  ####################
+  # Setup CLSTM
+  with tf.variable_scope('clstm', initializer = tf.random_uniform_initializer(-.01, 0.1)):
+    # input shape, kernel filter size, number of features
+    convcell = clstm.clstm(clstmshape, clstmkernel, clstmstride, clstmfeatures)
+    # state: batchsize x clstmshape x clstmshape x clstmfeatures
+    new_state = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32) 
+
+    # Setup deCLSTM
+  with tf.variable_scope('declstm', initializer = tf.random_uniform_initializer(-.01, 0.1)):
+    # input shape, kernel filter size, number of features
+    deconvcell = clstm.clstm(clstmshape, clstmkernel, clstmstride, clstmfeatures)
+    # state: batchsize x clstmshape x clstmshape x clstmfeatures
+    denew_state = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32) 
+
+
+  ########################
+  # Create CNN-LSTM-dCNN for an input of input_seq_length-1 frames in n time for an output of input_seq_length-1 frames in n+1 time
+  x_pred = []
+  for i in xrange(FLAGS.input_seq_length-1):
+
+    # ENCODE
+    # CNN: (name, 2D square kernel filter size, stride for spatial domain, number of feature maps, name) using ELUs
+    # cnn1:
+    if i < FLAGS.predict_frame_start:
+      # only dropout on training layers
+      cnn1 = ld.cnn2d_layer(x_dropout[:,i,:,:,:], cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
+    else:
+      # direct input of prior output for predictive layers
+      cnn1 = ld.cnn2d_layer(x_1, cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
+    # cnn2:
+    cnn2 = ld.cnn2d_layer(cnn1, cnnkernels[1], cnnstrides[1], cnnfeatures[1], "cnn_2")
+    # cnn3:
+    cnn3 = ld.cnn2d_layer(cnn2, cnnkernels[2], cnnstrides[2], cnnfeatures[2], "cnn_3")
+    # cnn4:
+    cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
+
+    # Convolutional lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state)
+    y_0 = cnn4 #y_0 should be same shape as first argument in clstm.clstm() above.
+    y_1, new_state = convcell(y_0, new_state, 'Conv', dopeek, 'clstm')
+
+    # deConvolutional LSTM layer
+    y_2, denew_state = deconvcell(y_1, denew_state, 'deConv', dopeek, 'declstm')
+
+    # DECODE
+    # cnn5
+    cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
+    # cnn6
+    cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
+    # cnn7
+    cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
+    # x_1 (linear act)
+    x_1 = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
+    if i >= FLAGS.predict_frame_start:
+      # add predictive layer
+      x_pred.append(x_1)
+    # set reuse to true after first go
+    if i == 0:
+      tf.get_variable_scope().reuse_variables()
+
+  # Pack-up predictive layer's results
+  # e.g. for input_seq_length=10 and predict_frame_start=5, loop runs i=0..8 and x_pred holds i=5,6,7,8 (i.e. 4 frame prediction)
+  x_pred = tf.stack(x_pred)
+  # reshape so in order of minibatch x frame x sizex x sizey x rgb
+  x_pred = tf.transpose(x_pred, [1,0,2,3,4])
+
+
+  #######################################################
+  # Create network to generate predicted video (TODO: could keep on only 1 gpu or on cpu)
+  predictframes=FLAGS.predictframes
+
+  ##############
+  # Setup CLSTM (initialize to zero, but same convcell as in other network)
+  x_pred_long = []
+  new_state_pred = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
+  new_destate_pred = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
+
+  #######
+  # Setup long prediction network
+  for i in xrange(predictframes):
+
+    # ENCODE
+    # cnn1
+    if i < FLAGS.predict_frame_start: # use known sequence for this many frames
+      cnn1 = ld.cnn2d_layer(x[:,i,:,:,:], cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
+    else: # use generated sequence for rest of frames
+      cnn1 = ld.cnn2d_layer(x_1_pred, cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
+    # cnn2
+    cnn2 = ld.cnn2d_layer(cnn1, cnnkernels[1], cnnstrides[1], cnnfeatures[1], "cnn_2")
+    # cnn3
+    cnn3 = ld.cnn2d_layer(cnn2, cnnkernels[2], cnnstrides[2], cnnfeatures[2], "cnn_3")
+    # cnn4
+    cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
+
+    # Convolutional lstm layer
+    y_0 = cnn4
+    y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv', dopeek, 'clstm')
+
+    # deConvolutional lstm layer
+    y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv', dopeek, 'declstm')
+
+    # DECODE
+    # cnn5
+    cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
+    # cnn6
+    cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
+    # cnn7
+    cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
+    # x_1_pred (linear act)
+    x_1_pred = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
+    if i >= FLAGS.predict_frame_start:
+      x_pred_long.append(x_1_pred)
+
+  # Pack-up predicted layer's results
+  x_pred_long = tf.stack(x_pred_long)
+  x_pred_long = tf.transpose(x_pred_long, [1,0,2,3,4])
+
+
+  #######################################################
+  # Setup loss Computation
+  # Loss computes L2 for original sequence vs. predicted sequence over input_seq_length - (seq.start+1) frames
+  # Compare x^{n+1} to xpred^n (that is supposed to be approximation to x^{n+1})
+  # x: batchsize, time steps, sizexy, sizexy, sizez
+  loss = tf.nn.l2_loss(x[:,FLAGS.predict_frame_start+1:,:,:,:] - x_pred[:,:,:,:,:])
+  tf.summary.scalar('loss', loss)
+  normalnorm=tf.nn.l2_loss(x[:,FLAGS.predict_frame_start+1:,:,:,:])
+  tf.summary.scalar('normalnorm', normalnorm)
+  ploss = tf.sqrt(10.0*loss/normalnorm)
+  tf.summary.scalar('ploss', ploss)
+
+  return loss,normalnorm,ploss,x_pred,x_pred_long
+  
+
+
+def average_gradients(tower_grads):
+  """Calculate the average gradient for each shared variable across all towers.
+
+  Note that this function provides a synchronization point across all towers.
+
+  Args:
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over towers. The inner list is over the (gradient, variable) pairs
+      computed by that tower; every tower must list variables in the same order.
+  Returns:
+     List of pairs of (gradient, variable) where the gradient has been averaged
+     across all towers.
+  """
+  average_grads = []
+  for grad_and_vars in zip(*tower_grads):
+    # Note that each grad_and_vars looks like the following:
+    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+    grads = []
+    for g, _ in grad_and_vars:
+      # Add 0 dimension to the gradients to represent the tower.
+      expanded_g = tf.expand_dims(g, 0)
+
+      # Append on a 'tower' dimension which we will average over below.
+      grads.append(expanded_g)
+
+    # Average over the 'tower' dimension.
+    grad = tf.concat(grads, 0)
+    grad = tf.reduce_mean(grad, 0)
+
+    # Keep in mind that the Variables are redundant because they are shared
+    # across towers. So .. we will just return the first tower's pointer to
+    # the Variable.
+    v = grad_and_vars[0][1]
+    grad_and_var = (grad, v)
+    average_grads.append(grad_and_var)
+  return average_grads
+
+
 
 # Function to train autoencoder network
 def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
-  with tf.Graph().as_default():
-    
-    # Setup inputs
-    # size of balls in x-y directions each (same)
-    sizexy=FLAGS.sizexy
-    # Number of rgb or depth estimation at t=0, but no convolution in this direction
-    sizez=3
-    with tf.name_scope('input'):
-      # x: minibatches x input_seq_length of frames x sizex x sizey x sizez(rgb)
-      x = tf.placeholder(tf.float32, [None, FLAGS.input_seq_length, sizexy, sizexy, sizez])
-
-    # Setup dropout
-    hold_prob = tf.placeholder("float")
-    x_dropout = tf.nn.dropout(x, hold_prob)
-
-    # Some checks
-    if FLAGS.input_seq_length-1<=FLAGS.predict_frame_start:
-      print("prediction frame starting point (zero starting point) beyond input size - 1, so no prediction used as next input or even used as any output to compute loss")
-      exit
+
+  # Some checks
+  if FLAGS.input_seq_length-1<=FLAGS.predict_frame_start:
+    print("prediction frame starting point (zero starting point) beyond input size - 1, so no prediction used as next input or even used as any output to compute loss")
+    exit
+
+
+  # Setup graph and train
+  with tf.Graph().as_default(), tf.device('/cpu:0'):
+
+    # Create a variable to count the number of train() calls. This equals the
+    # number of batches processed * FLAGS.num_gpus.
+    global_step = tf.get_variable(
+        'global_step', [],
+        initializer=tf.constant_initializer(0), trainable=False)
+
+
+    # Set training method for all towers
+    opt = tf.train.AdamOptimizer(FLAGS.adamvar)
     
 
-    #######################################################
-    # Create network to train
-    #
-    cnnkernels=[3,3,3,1]
-    cnnstrides=[2,1,2,1]
-    cnnstrideproduct=np.product(cnnstrides)
-    cnnfeatures=[8,8,8,4]
-    #
-    # check strides are acceptable
-    testsize=sizexy
-    for i in xrange(len(cnnstrides)):
-      if testsize % cnnstrides[i] !=0:
-        print("sizexy must be evenly divisible by each stride, in order to keep input to cnn or dcnn an integer number of pixels")
-        exit
-      else:
-        testsize=testsize/cnnstrides[i]
-    #
-        
-    dopeek=1 # whether to peek as cell state when constructing gates
-    clstminput=sizexy/cnnstrideproduct # must be evenly divisible
-    clstmshape=[clstminput,clstminput]
-    clstmkernel=[3,3]
-    clstmstride=1 # currently needs to be 1 unless implement tf.pad() or tf.nn.fractional_avg_pool()
-    clstmfeatures=cnnfeatures[3] # same as features of last cnn layer fed into clstm
-    #
-    dcnnkernels=[1,3,3,3] # reasonably the reverse order of cnnkernels
-    dcnnstrides=[1,2,1,2] # reasonably the reverse order of cnnstrides
-    dcnnstrideproduct=np.product(dcnnstrides)
-    # last dcnn feature is rgb again
-    dcnnfeatures=[8,8,8,sizez] # reasonably the reverse order of cnnfeatures, except last cnnfeatures and last dcnnfeatures (note, features are for produced object, while kernels and strides operate on current object, hence apparent shift)
-    #
-    # check d-strides are acceptable
-    testsize=sizexy
-    for i in xrange(len(dcnnstrides)):
-      if testsize % dcnnstrides[i] !=0:
-        print("sizexy must be evenly divisible by each d-stride, in order to keep input to cnn or dcnn an integer number of pixels")
-        exit
-      else:
-        testsize=testsize/dcnnstrides[i]
-    #
-    # ensure strides cumulate to same total product so input and output same size, because we feed output back as input
-    if dcnnstrideproduct!=cnnstrideproduct:
-      print("cnn and dcnn strides must match for creating input size and output same size");
-      exit
-    #
-    #
-    #
-    x_pred = []
-
-    ####################
-    # Setup CLSTM
-    with tf.variable_scope('clstm', initializer = tf.random_uniform_initializer(-.01, 0.1)):
-      # input shape, kernel filter size, number of features
-      convcell = clstm.clstm(clstmshape, clstmkernel, clstmstride, clstmfeatures)
-      # state: batchsize x clstmshape x clstmshape x clstmfeatures
-      new_state = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32) 
-
-      # Setup deCLSTM
-    with tf.variable_scope('declstm', initializer = tf.random_uniform_initializer(-.01, 0.1)):
-      # input shape, kernel filter size, number of features
-      deconvcell = clstm.clstm(clstmshape, clstmkernel, clstmstride, clstmfeatures)
-      # state: batchsize x clstmshape x clstmshape x clstmfeatures
-      denew_state = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32) 
+    # Setup independent Graph model for each gpu
+    tower_grads = []
+    tower_vars = []
+    tower_x_pred = []
+    tower_x_pred_long = []
+    with tf.variable_scope(tf.get_variable_scope()): # variable scope
 
-      
-    with tf.variable_scope('graph'):
-      ########################
-      # Create CNN-LSTM-dCNN for an input of input_seq_length-1 frames in n time for an output of input_seq_length-1 frames in n+1 time
-      for i in xrange(FLAGS.input_seq_length-1):
-
-        # ENCODE
-        # CNN: (name, 2D square kernel filter size, stride for spatial domain, number of feature maps, name) using ELUs
-        # cnn1:
-        if i < FLAGS.predict_frame_start:
-          # only dropout on training layers
-          cnn1 = ld.cnn2d_layer(x_dropout[:,i,:,:,:], cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
-        else:
-          # direct input of prior output for predictive layers
-          cnn1 = ld.cnn2d_layer(x_1, cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
-        # cnn2:
-        cnn2 = ld.cnn2d_layer(cnn1, cnnkernels[1], cnnstrides[1], cnnfeatures[1], "cnn_2")
-        # cnn3:
-        cnn3 = ld.cnn2d_layer(cnn2, cnnkernels[2], cnnstrides[2], cnnfeatures[2], "cnn_3")
-        # cnn4:
-        cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
-
-        # Convolutional lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state)
-        y_0 = cnn4 #y_0 should be same shape as first argument in clstm.clstm() above.
-        y_1, new_state = convcell(y_0, new_state, 'Conv', dopeek, 'clstm')
-
-        # deConvolutional LSTM layer
-        y_2, denew_state = deconvcell(y_1, denew_state, 'deConv', dopeek, 'declstm')
-
-        # DECODE
-        # cnn5
-        cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
-        # cnn6
-        cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
-        # cnn7
-        cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
-        # x_1 (linear act)
-        x_1 = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
-        if i >= FLAGS.predict_frame_start:
-          # add predictive layer
-          x_pred.append(x_1)
-        # set reuse to true after first go
-        if i == 0:
-          tf.get_variable_scope().reuse_variables()
-
-      # Pack-up predictive layer's results
-      # e.g. for input_seq_length=10 loop 0..9, had put into x_pred i=5,6,7,8,9 (i.e. 5 frame prediction)
-      x_pred = tf.stack(x_pred)
-      # reshape so in order of minibatch x frame x sizex x sizey x rgb
-      x_pred = tf.transpose(x_pred, [1,0,2,3,4])
-
-
-      #######################################################
-      # Create network to generate predicted video
-      predictframes=50
-
-      ##############
-      # Setup CLSTM (initialize to zero, but same convcell as in other network)
-      x_pred_long = []
-      new_state_pred = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
-      new_destate_pred = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32)
-
-      #######
-      # Setup long prediction network
-      for i in xrange(predictframes):
-
-        # ENCODE
-        # cnn1
-        if i < FLAGS.predict_frame_start: # use known sequence for this many frames
-          cnn1 = ld.cnn2d_layer(x[:,i,:,:,:], cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
-        else: # use generated sequence for rest of frames
-          cnn1 = ld.cnn2d_layer(x_1_pred, cnnkernels[0], cnnstrides[0], cnnfeatures[0], "cnn_1")
-        # cnn2
-        cnn2 = ld.cnn2d_layer(cnn1, cnnkernels[1], cnnstrides[1], cnnfeatures[1], "cnn_2")
-        # cnn3
-        cnn3 = ld.cnn2d_layer(cnn2, cnnkernels[2], cnnstrides[2], cnnfeatures[2], "cnn_3")
-        # cnn4
-        cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4")
-
-        # Convolutional lstm layer
-        y_0 = cnn4
-        y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv', dopeek, 'clstm')
-
-        # deConvolutional lstm layer
-        y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv', dopeek, 'declstm')
-
-        # DECODE
-        # cnn5
-        cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_4")
-        # cnn6
-        cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_3")
-        # cnn7
-        cnn7 = ld.dcnn2d_layer(cnn6, dcnnkernels[2], dcnnstrides[2], dcnnfeatures[2], "dcnn_2")
-        # x_1_pred (linear act)
-        x_1_pred = ld.dcnn2d_layer(cnn7, dcnnkernels[3], dcnnstrides[3], dcnnfeatures[3], "dcnn_1", True)
-        if i >= FLAGS.predict_frame_start:
-          x_pred_long.append(x_1_pred)
-
-      # Pack-up predicted layer's results
-      x_pred_long = tf.stack(x_pred_long)
-      x_pred_long = tf.transpose(x_pred_long, [1,0,2,3,4])
-
-
-    #######################################################
-    # Setup loss Computation
-    # Loss computes L2 for original sequence vs. predicted sequence over input_seq_length - (seq.start+1) frames
-    # Compare x^{n+1} to xpred^n (that is supposed to be approximation to x^{n+1})
-    # x: batchsize, time steps, sizexy, sizexy, sizez
-    loss = tf.nn.l2_loss(x[:,FLAGS.predict_frame_start+1:,:,:,:] - x_pred[:,:,:,:,:])
-    tf.summary.scalar('loss', loss)
-    normalnorm=tf.nn.l2_loss(x[:,FLAGS.predict_frame_start+1:,:,:,:])
-    tf.summary.scalar('normalnorm', normalnorm)
-    ploss = tf.sqrt(10.0*loss/normalnorm)
-    tf.summary.scalar('ploss', ploss)
 
+      # setup graph input x and x_dropout
+      # x: gpus x minibatch size x input_seq_length of frames x sizex x sizey x sizez(rgb)
+      x = tf.placeholder(tf.float32, [FLAGS.num_gpus, FLAGS.minibatch_size, FLAGS.input_seq_length, FLAGS.sizexy, FLAGS.sizexy, FLAGS.sizez])
     
-    # Set training method
-    with tf.name_scope('train'):
-      train_operation = tf.train.AdamOptimizer(FLAGS.adamvar).minimize(loss)
-     
+      # Setup dropout
+      hold_prob = tf.placeholder("float")
+      x_dropout = tf.nn.dropout(x, FLAGS.hold_prob)
+
+      # Go over gpus
+      for i in xrange(FLAGS.num_gpus):
+        with tf.device('/gpu:%d' % i):
+          with tf.name_scope('%s%d' % ("tower", i)) as scope: # only op scope
+
+            # Calculate the loss for one tower. This function
+            # constructs the entire model but shares the variables across
+            # all towers.
+            with tf.variable_scope('graph'):
+              towerloss,normalnorm,ploss,towerxpred,towerxpredlong = tower_loss(x[i],x_dropout[i],scope)
+              tower_vars.append(towerloss)
+
+
+            # Collect vars for all towers.
+            tower_x_pred.append(towerxpred)
+            tower_x_pred_long.append(towerxpredlong)
+
+            # Reuse variables for the next tower (share variables across towers -- one on each gpu)
+            tf.get_variable_scope().reuse_variables()
+
+            # Retain the summaries from the final tower.
+            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
+            
+            # Calculate the gradients for the batch of data on this tower.
+            grads = opt.compute_gradients(towerloss)
+
+            # Keep track of the gradients across all towers.
+            tower_grads.append(grads)
+
+    # We must calculate the mean of each gradient. Note that this is the
+    # synchronization point across all towers.
+    grads = average_gradients(tower_grads)
+
+    # Add histograms for gradients.
+    for grad, var in grads:
+      if grad is not None:
+        summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))
+
+    # Apply the gradients to adjust the shared variables.
+    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+            
+    # Add histograms for trainable variables.
+    print("trainable vars")
+    for var in tf.trainable_variables():
+      print(var)
+      summaries.append(tf.summary.histogram(var.op.name, var))
+
+    # Track the moving averages of all trainable variables.
+    MOVING_AVERAGE_DECAY=0.9999
+    variable_averages = tf.train.ExponentialMovingAverage(
+        MOVING_AVERAGE_DECAY, global_step)
+    # synchronous variable averaging
+    variables_averages_op = variable_averages.apply(tf.trainable_variables())
+
+    # Group all updates into a single train op.
+    train_op = tf.group(apply_gradient_op, variables_averages_op)
+      
     # List of all Variables
     variables = tf.global_variables()
-
     # Create saver for checkpoints and summary
     saver = tf.train.Saver(variables)
 
@@ -288,8 +433,15 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
       init = tf.global_variables_initializer()
 
       # Start session
-      sess = tf.Session()
+      sess = tf.Session(config=tf.ConfigProto(
+        allow_soft_placement=True,
+        log_device_placement=FLAGS.log_device_placement))
 
+
+      # Start the queue runners.
+      tf.train.start_queue_runners(sess=sess)
+
+      
       # Initialize Network
       if continuetrain==0:
         print("Initialize network")
@@ -320,7 +472,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
       # Set number of model frames
       #modelframes=FLAGS.input_seq_length+predictframes
-      modelframes=predictframes
+      modelframes=FLAGS.predictframes
 
       # Set how often dump video to disk
       howoftenvid=1000
@@ -338,41 +490,60 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
       for step in xrange(startstep,FLAGS.max_minibatches):
         nstep=step
 
-        # Generate mini-batch
-        dat = md.generate_model_sample(FLAGS.minibatch_size, FLAGS.input_seq_length, FLAGS.sizexy, num_balls, modeltype)
-
-        # Get model data for comparing to prediction if generating video
-        if nstep%howoftenvid == 0:
-          datmodel = md.generate_model_sample(1, modelframes, FLAGS.sizexy, num_balls, modeltype)
-          # Overwrite so consistent with ground truth for video output
-          dat[0,0:FLAGS.input_seq_length] = datmodel[0,0:FLAGS.input_seq_length]
+        tower_dat = []
+        tower_datmodel = []
+        with tf.variable_scope(tf.get_variable_scope()): # variable scope
+          for i in xrange(FLAGS.num_gpus):
+            # Generate mini-batch
+            dat = md.generate_model_sample(FLAGS.minibatch_size, FLAGS.input_seq_length, FLAGS.sizexy, FLAGS.num_balls, FLAGS.modeltype)
+  
+            # Get model data for comparing to prediction if generating video
+            datmodel = md.generate_model_sample(1, modelframes, FLAGS.sizexy, FLAGS.num_balls, FLAGS.modeltype)
+            # Overwrite so consistent with ground truth for video output
+            dat[0,0:FLAGS.input_seq_length] = datmodel[0,0:FLAGS.input_seq_length]
+
+            # Collect dat for all towers.
+            tower_dat.append(dat)
+            tower_datmodel.append(datmodel)
+            
+        # pack-up input data
+        tower_dat = np.asarray(tower_dat)
+        tower_datmodel = np.asarray(tower_datmodel)
 
+        
         # Train on mini-batch
         # Compute error in prediction vs. model and compute time of mini-batch task
         t = time.time()
-        _, lossm = sess.run([train_operation, loss],feed_dict={x:dat, hold_prob:FLAGS.hold_prob})
+        
+        #_, lossm = sess.run(train_op,feed_dict={x:tower_dat})
+        #print("sess.run on step=%d" % (step));sys.stdout.flush()
+        #print("shape of tower_dat")
+        #print(np.shape(tower_dat));sys.stdout.flush()
+        #print("shape of x")
+        #print(x.get_shape());sys.stdout.flush()
+ 
+        _, lossm = sess.run([train_op,towerloss],feed_dict={x:tower_dat})
         elapsed = time.time() - t
         assert not np.isnan(lossm), 'Model reached lossm = NaN'
 
 
         # Store model
         if nstep%howoftensummary == 0 and nstep!=0:
-          summary_str = sess.run(summary_op, feed_dict={x:dat, hold_prob:FLAGS.hold_prob})
+          summary_str = sess.run(summary_op, feed_dict={x:tower_dat})
           summary_writer.add_summary(summary_str, nstep)
           
         # Print-out loss
         if nstep%howoftensummary == 0:
-          summary_str = sess.run(summary_op, feed_dict={x:dat, hold_prob:FLAGS.hold_prob})
+          summary_str = sess.run(summary_op, feed_dict={x:tower_dat})
           summary_writer.add_summary(summary_str, nstep) 
           print("")
           print("time per batch is " + str(elapsed) + " seconds")
           print("step=%d nstep=%d" % (step,nstep))
           print("L2 loss=%g" % (lossm))
 
-          #normalnorm=np.sum(dat[0,0])
-          normalnorm=np.sum(dat[0,FLAGS.predict_frame_start+1:,:,:,:])
-          print("normalnorm=%d" % (normalnorm))
-          print("L2 percent loss=%g" % (100.0*(np.sqrt(float(lossm))/float(normalnorm))))
+          localnormalnorm=np.sum(tower_dat[0][0,FLAGS.predict_frame_start+1:,:,:,:]) # pull from 0th tower
+          print("localnormalnorm=%d" % (localnormalnorm))
+          print("L2 percent loss=%g" % (100.0*(np.sqrt(float(lossm))/float(localnormalnorm))))
         else:
           # track progress
           sys.stdout.write('.')
@@ -403,7 +574,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
           print("Writing model video")
           video = cv2.VideoWriter()
           success = video.open(video_path + "model_" + str(nstep) + ".mov", cc, fps, sizevid, True)
-          image = datmodel[0]
+          image = tower_datmodel[0][0] # pull from 0th tower
           print(image.shape)
           for i in xrange(modelframes):
             x_1_r = np.uint8(np.minimum(1, np.maximum(image[i,:,:,:], 0)) * 255)
@@ -417,7 +588,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
           success = video.open(video_path + "clstm_" + str(nstep) + ".mov", cc, fps, sizevid, True)
 
           # Preappend starting sequence
-          image = datmodel[0]
+          image = tower_datmodel[0][0] # pull from 0th tower
           print(image.shape)
           for i in xrange(FLAGS.predict_frame_start):
             x_1_r = np.uint8(np.minimum(1,np.maximum(image[i,:,:,:], 0)) * 255)
@@ -425,9 +596,8 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
             video.write(new_im)
 
           # Append predicted video
-          dat_gif = dat
-          image = sess.run([x_pred_long],feed_dict={x:dat_gif, hold_prob:FLAGS.hold_prob})
-          image = image[0][0]
+          image = sess.run([tower_x_pred_long],feed_dict={x:tower_dat})
+          image = image[0][0][0]  # pull from 0th tower
           print(image.shape)
           for i in xrange(modelframes - FLAGS.predict_frame_start):
             x_1_r = np.uint8(np.minimum(1,np.maximum(image[i,:,:,:], 0)) * 255)
@@ -438,16 +608,11 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
 
 def main(argv=None):
   #
-  # Choose to continue training (1) or not (0)
-  continuetrain=1
+  continuetrain=FLAGS.continuetrain
   #
   #
-  # Choose which model to work on
-  # 0 = classic bouncing balls
-  # 1 = rotating "ball"
-  modeltype=1
-  # Number of balls
-  num_balls=1
+  modeltype=FLAGS.modeltype
+  num_balls=FLAGS.num_balls
   #
   # Setup checkpoint directory
   if tf.gfile.Exists(FLAGS.ckpt_dir):

From c009f0ac3081a5b83130d810fef2773bb83b1449 Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Thu, 9 Mar 2017 21:15:22 -0800
Subject: [PATCH 11/12] control balls

---
 main.py   | 24 ++++++++++++++++++------
 models.py |  6 ++++++
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/main.py b/main.py
index 20c8097..f8c46f8 100644
--- a/main.py
+++ b/main.py
@@ -43,7 +43,7 @@
                             """adamvar for dropout""")
 tf.app.flags.DEFINE_integer('minibatch_size', 16,
                             """mini-batch size""")
-tf.app.flags.DEFINE_integer('num_balls', 1,
+tf.app.flags.DEFINE_integer('init_num_balls', 1,
                             """How many balls to model.""")
 # Choose which model to work on
 # 0 = classic bouncing balls
@@ -317,7 +317,7 @@ def average_gradients(tower_grads):
 
 
 # Function to train autoencoder network
-def autoencode(continuetrain=0,modeltype=0,num_balls=2):
+def autoencode(continuetrain=0,modeltype=0,init_num_balls=2):
 
 
   # Some checks
@@ -487,18 +487,30 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2):
       ###############
       # Training Loop
       startstep=nstep
+      num_balls = FLAGS.init_num_balls
       for step in xrange(startstep,FLAGS.max_minibatches):
         nstep=step
 
+        #########################
+        # model-dependent code
+        if step%100==0 and step>0:
+          num_balls=num_balls+1
+          # limit so doesn't go beyond point where can't fit balls and reaches good_config=False always in models.py
+          if num_balls>5:
+            num_balls=5
+          print("num_balls=%d" % (num_balls))
+
+          
+        # create input data
         tower_dat = []
         tower_datmodel = []
         with tf.variable_scope(tf.get_variable_scope()): # variable scope
           for i in xrange(FLAGS.num_gpus):
             # Generate mini-batch
-            dat = md.generate_model_sample(FLAGS.minibatch_size, FLAGS.input_seq_length, FLAGS.sizexy, FLAGS.num_balls, FLAGS.modeltype)
+            dat = md.generate_model_sample(FLAGS.minibatch_size, FLAGS.input_seq_length, FLAGS.sizexy, num_balls, FLAGS.modeltype)
   
             # Get model data for comparing to prediction if generating video
-            datmodel = md.generate_model_sample(1, modelframes, FLAGS.sizexy, FLAGS.num_balls, FLAGS.modeltype)
+            datmodel = md.generate_model_sample(1, modelframes, FLAGS.sizexy, num_balls, FLAGS.modeltype)
             # Overwrite so consistent with ground truth for video output
             dat[0,0:FLAGS.input_seq_length] = datmodel[0,0:FLAGS.input_seq_length]
 
@@ -612,7 +624,7 @@ def main(argv=None):
   #
   #
   modeltype=FLAGS.modeltype
-  num_balls=FLAGS.num_balls
+  init_num_balls=FLAGS.init_num_balls
   #
   # Setup checkpoint directory
   if tf.gfile.Exists(FLAGS.ckpt_dir):
@@ -630,7 +642,7 @@ def main(argv=None):
     tf.gfile.MakeDirs(FLAGS.video_dir)
 
   # Start training autoencoder
-  autoencode(continuetrain=continuetrain,modeltype=modeltype,num_balls=num_balls)
+  autoencode(continuetrain=continuetrain,modeltype=modeltype,init_num_balls=init_num_balls)
 
 if __name__ == '__main__':
   tf.app.run()
diff --git a/models.py b/models.py
index cd570b9..9fbc37b 100644
--- a/models.py
+++ b/models.py
@@ -41,7 +41,13 @@ def model_n(T=64, TY=0, n=2, r=None, m=None):
 
     good_config=False
 
+    goodconfigattempt=0
+    maxgoodconfigattempts=10
     while not good_config:
+        goodconfigattempt=goodconfigattempt+1
+        if goodconfigattempt>maxgoodconfigattempts:
+            break
+        
         x = 2+rand(n,2)*8
         good_config=True
         for i in range(n):

From c7624cf648b68a274bfb7b573789460cb9aeb8a0 Mon Sep 17 00:00:00 2001
From: pseudotensor <pseudotensor@gmail.com>
Date: Fri, 10 Mar 2017 19:49:27 -0800
Subject: [PATCH 12/12] minor ball count

---
 main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index f8c46f8..b20b939 100644
--- a/main.py
+++ b/main.py
@@ -493,11 +493,11 @@ def autoencode(continuetrain=0,modeltype=0,init_num_balls=2):
 
         #########################
         # model-dependent code
-        if step%100==0 and step>0:
+        if step%howoftenvid==0 and step>0:
           num_balls=num_balls+1
           # limit so doesn't go beyond point where can't fit balls and reaches good_config=False always in models.py
-          if num_balls>5:
-            num_balls=5
+          if num_balls>3:
+            num_balls=3
           print("num_balls=%d" % (num_balls))