diff --git a/README.md b/README.md index 55a3550..dc4588f 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,10 @@ # Inspired by papers: +https://arxiv.org/abs/1506.04214 (Conv LSTM) http://www.jmlr.org/proceedings/papers/v2/sutskever07a/sutskever07a.pdf https://arxiv.org/abs/1411.4389 https://arxiv.org/abs/1504.08023 -https://arxiv.org/abs/1506.04214 (like this paper with RNN but now with LSTM) https://arxiv.org/abs/1511.06380 https://arxiv.org/abs/1511.05440 https://arxiv.org/abs/1605.08104 @@ -88,6 +88,15 @@ smplayer out_all2_fast.mp4 * Training Curve in Tensorflow (norm order 40): ![Alt text](https://github.com/pseudotensor/temporal_autoencoder/blob/master/lossexamples/loss_wheel.jpg "Training loss curve for wheel prediction vs. model.") +Notes for wheel case: + +* Training on longer frame sequences works better for predicting longer sequences + +* The loss seems to need to cover at least one full rotation for the model to predict well multiple frames into the future + +* The central part of the wheel diffuses even when the prediction otherwise does well, presumably due to lack of resolution + + # Parameters: @@ -102,6 +111,7 @@ smplayer out_all2_fast.mp4 2) In balls.py: * SIZE: size of ball's bounding box in pixels +* omega: angular frequency of rotation for modeltype=1 (wheel type) # Ideas and Future Work: @@ -110,6 +120,18 @@ smplayer out_all2_fast.mp4 * Try more filters +* Try L2 loss not only on the final image, but also on hidden states. This should approximate adversarial networks, which keep the image and hidden latent variables more smoothly connected (i.e. avoid a fractured manifold). + +* Try different hyperparameters + +* Try multi-scale for space + +* Try multi-scale for time (to capture periods over long times) + +* Try Stacked Conv/Deconv LSTMs (https://arxiv.org/pdf/1506.04214v2.pdf and https://arxiv.org/pdf/1605.07157v4.pdf) + +* Try skip connections (https://arxiv.org/pdf/1605.07157v4.pdf) + * Try temporal convolution * Try other LSTM architectures (C-peek, bind forget-recall, GRU, etc.) 
diff --git a/clstm.py b/clstm.py index e3e2a41..1a23310 100644 --- a/clstm.py +++ b/clstm.py @@ -42,12 +42,13 @@ class clstm(CRNNCell): # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py - def __init__(self, shape, filter, features, forget_bias=1.0, input_size=None, + def __init__(self, shape, filter, stride, features, forget_bias=1.0, input_size=None, state_is_tuple=False, activation=tf.nn.tanh): """Initialize the basic CLSTM cell. Args: shape: int tuple of the height and width of the cell filter: int tuple of the height and width of the filter + stride: stride to use if doing convolution or deconvolution features: int of the depth of the cell forget_bias: float, the bias added to forget gates (see above). input_size: Deprecated. @@ -60,6 +61,7 @@ def __init__(self, shape, filter, features, forget_bias=1.0, input_size=None, logging.warn("%s: Input_size parameter is deprecated.", self) self.shape = shape self.filter = filter + self.stride = stride self.features = features self._forget_bias = forget_bias self._state_is_tuple = state_is_tuple @@ -74,7 +76,7 @@ def state_size(self): def output_size(self): return self._num_units - def __call__(self, inputs, state, scope=None): + def __call__(self, inputs, state, typec='Conv', scope=None): """Long short-term memory cell (LSTM).""" # inputs: batchsize x clstmshape x clstmshape x clstmfeatures with tf.variable_scope(scope or type(self).__name__): @@ -88,12 +90,12 @@ def __call__(self, inputs, state, scope=None): doclstm=1 if doclstm==1: - concat = _convolve_linear([inputs, h], self.filter, self.features * 4, True) + concat = _convolve_linear([inputs, h], self.filter, self.stride, self.features * 4, typec, True) # http://colah.github.io/posts/2015-08-Understanding-LSTMs/ # i = input_gate, j = new_input, f = forget_gate, o = output_gate (each with clstmfeatures features) i, j, f, o = tf.split(3, 4, concat) else: - # work in-progress + # TODO: work in-progress incat 
= tf.concat(3,args) # general W.x + b separately for each i,j,f,o #i = tf.matmul(incat,weightsi) + biasesi @@ -103,9 +105,10 @@ def __call__(self, inputs, state, scope=None): # concat: batchsize x clstmshape x clstmshape x (clstmfeatures*4) - - new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) * - self._activation(j)) + # Hadamard (element-by-element) products (*) + # If stride!=1, then c will be different size than i,j,f,o, so next operation won't work. + new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) * self._activation(j)) + # If stride!=1, then o different dimension than new_h needs to be. (because c and h need to be same size if packing/splitting them as well as recurrently needs to be same size) new_h = self._activation(new_c) * tf.nn.sigmoid(o) if self._state_is_tuple: @@ -114,11 +117,12 @@ def __call__(self, inputs, state, scope=None): new_state = tf.concat(3, [new_c, new_h]) return new_h, new_state -def _convolve_linear(args, filter, features, bias, bias_start=0.0, scope=None): +def _convolve_linear(args, filter, stride, features, typec, bias, bias_start=0.0, scope=None): """convolution: Args: args: 4D Tensor or list of 4D, batch x n, Tensors. filter: int tuple of filter with height and width. + stride: stride for convolution features: int, as number of features. bias_start: starting value to initialize bias; 0 by default. scope: VariableScope for created subgraph; defaults to "Linear". 
@@ -144,23 +148,49 @@ def _convolve_linear(args, filter, features, bias, bias_start=0.0, scope=None): dtype = [a.dtype for a in args][0] - # Computation - with tf.variable_scope(scope or "Conv"): - # setup weights as kernel x kernel x (input features = clstmfeatures*2) x (new features=clstmfeatures*4) - mat = tf.get_variable( - "Mat", [filter[0], filter[1], total_arg_size_depth, features], dtype=dtype) - if len(args) == 1: - res = tf.nn.conv2d(args[0], mat, strides=[1, 1, 1, 1], padding='SAME') + # concat + if len(args) == 1: + inputs = args[0] + else: + inputs=tf.concat(3, args) + + # Conv + if typec=='Conv': + with tf.variable_scope(scope or "Conv"): + # setup weights as kernel x kernel x (input features = clstmfeatures*2) x (new features=clstmfeatures*4) + weights = tf.get_variable( "Weights", [filter[0], filter[1], total_arg_size_depth, features], dtype=dtype) + res = tf.nn.conv2d(inputs, weights, strides=[1, stride, stride, 1], padding='SAME') + + # BIAS + if bias: + bias_term = tf.get_variable( + "Bias", [features], + dtype=dtype, + initializer=tf.constant_initializer( + bias_start, dtype=dtype)) else: + bias_term = 0*res + + # deConv + if typec=='deConv': + with tf.variable_scope(scope or "deConv"): + # setup weights as kernel x kernel x (new features=clstmfeatures*4) x (input features = clstmfeatures*2). 
+ # i.e., 2nd arg to transpose version is [height, width, output_channels, in_channels], where last 2 are switched compared to normal conv2d + deweights = tf.get_variable( "deWeights", [filter[0], filter[1], features, total_arg_size_depth], dtype=dtype) + output_shape = tf.pack([tf.shape(inputs)[0], tf.shape(inputs)[1]*stride, tf.shape(inputs)[2]*stride, features]) # first argument is batchsize x clstmshape x clstmshape x (2*clstmfeatures) - res = tf.nn.conv2d(tf.concat(3, args), mat, strides=[1, 1, 1, 1], padding='SAME') # res: batchsize x clstmshape x clstmshape x (clstmfeatures*4) - if not bias: - return res - bias_term = tf.get_variable( - "Bias", [features], + res = tf.nn.conv2d_transpose(inputs, deweights, output_shape, strides=[1, stride, stride, 1], padding='SAME') + + # BIAS + if bias: + bias_term = tf.get_variable( + "deBias", [features], dtype=dtype, initializer=tf.constant_initializer( bias_start, dtype=dtype)) + else: + bias_term = 0*res + return res + bias_term diff --git a/main.py b/main.py index 82605ae..4aad671 100644 --- a/main.py +++ b/main.py @@ -27,9 +27,9 @@ """directory to store checkpoints""") tf.app.flags.DEFINE_integer('sizexy', 32, """size x and y dimensions for model, training, and prediction""") -tf.app.flags.DEFINE_integer('input_seq_length', 10, +tf.app.flags.DEFINE_integer('input_seq_length', 50, """size of hidden layer""") -tf.app.flags.DEFINE_integer('predict_frame_start', 5, +tf.app.flags.DEFINE_integer('predict_frame_start', 25, """ frame number, in zero-base counting, to start using prediction as output or next input""") tf.app.flags.DEFINE_integer('max_minibatches', 1000000, """maximum number of mini-batches""") @@ -87,6 +87,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2): clstminput=sizexy/cnnstrideproduct # must be evenly divisible clstmshape=[clstminput,clstminput] clstmkernel=[3,3] + clstmstride=1 # currently needs to be 1 unless implement tf.pad() or tf.nn.fractional_avg_pool() clstmfeatures=cnnfeatures[3] # 
same as features of last cnn layer fed into clstm # dcnnkernels=[1,3,3,3] # reasonably the reverse order of cnnkernels @@ -112,12 +113,25 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2): # # x_pred = [] + + #################### + # Setup CLSTM with tf.variable_scope('clstm', initializer = tf.random_uniform_initializer(-.01, 0.1)): # input shape, kernel filter size, number of features - cell = clstm.clstm(clstmshape, clstmkernel, clstmfeatures) + convcell = clstm.clstm(clstmshape, clstmkernel, clstmstride, clstmfeatures) # state: batchsize x clstmshape x clstmshape x clstmfeatures - new_state = cell.set_zero_state(FLAGS.minibatch_size, tf.float32) + new_state = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32) + # Setup deCLSTM + with tf.variable_scope('declstm', initializer = tf.random_uniform_initializer(-.01, 0.1)): + # input shape, kernel filter size, number of features + deconvcell = clstm.clstm(clstmshape, clstmkernel, clstmstride, clstmfeatures) + # state: batchsize x clstmshape x clstmshape x clstmfeatures + denew_state = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32) + + + + ######################## # Create CNN-LSTM-dCNN for an input of input_seq_length-1 frames in n time for an output of input_seq_length-1 frames in n+1 time for i in xrange(FLAGS.input_seq_length-1): @@ -137,13 +151,16 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2): # cnn4: cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4") - # lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state) + # Convolutional lstm layer (input y_0 and hidden state, output prediction y_1 and new hidden state new_state) y_0 = cnn4 #y_0 should be same shape as first argument in clstm.clstm() above. 
- y_1, new_state = cell(y_0, new_state) + y_1, new_state = convcell(y_0, new_state, 'Conv') + + # deConvolutional LSTM layer + y_2, denew_state = deconvcell(y_1, denew_state, 'deConv') # DECODE # cnn5 - cnn5 = ld.dcnn2d_layer(y_1, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5") + cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5") # cnn6 cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_6") # cnn7 @@ -168,8 +185,14 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2): # Create network to generate predicted video predictframes=50 + ############## + # Setup CLSTM (initialize to zero, but same convcell as in other network) x_pred_long = [] - new_state_pred = cell.set_zero_state(FLAGS.minibatch_size, tf.float32) + new_state_pred = convcell.set_zero_state(FLAGS.minibatch_size, tf.float32) + new_destate_pred = deconvcell.set_zero_state(FLAGS.minibatch_size, tf.float32) + + ####### + # Setup long prediction network for i in xrange(predictframes): # ENCODE @@ -185,13 +208,16 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2): # cnn4 cnn4 = ld.cnn2d_layer(cnn3, cnnkernels[3], cnnstrides[3], cnnfeatures[3], "cnn_4") - # lstm layer + # Convolutional lstm layer y_0 = cnn4 - y_1, new_state_pred = cell(y_0, new_state_pred) + y_1, new_state_pred = convcell(y_0, new_state_pred, 'Conv') + + # deConvolutional lstm layer + y_2, new_destate_pred = deconvcell(y_1, new_destate_pred, 'deConv') # DECODE # cnn5 - cnn5 = ld.dcnn2d_layer(y_1, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5") + cnn5 = ld.dcnn2d_layer(y_2, dcnnkernels[0], dcnnstrides[0], dcnnfeatures[0], "dcnn_5") # cnn6 cnn6 = ld.dcnn2d_layer(cnn5, dcnnkernels[1], dcnnstrides[1], dcnnfeatures[1], "dcnn_6") # cnn7 @@ -210,6 +236,7 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2): # Setup loss Computation # Loss computes L2 for original sequence vs. 
predicted sequence over input_seq_length - (seq.start+1) frames # Compare x^{n+1} to xpred^n (that is supposed to be approximation to x^{n+1}) + # x: batchsize, time steps, sizexy, sizexy, sizez loss = tf.nn.l2_loss(x[:,FLAGS.predict_frame_start+1:,:,:,:] - x_pred[:,:,:,:,:]) #tf.scalar_summary('loss', loss) tf.summary.scalar('loss', loss) @@ -308,9 +335,10 @@ def autoencode(continuetrain=0,modeltype=0,num_balls=2): print("step=%d nstep=%d" % (step,nstep)) print("L2 loss=%g" % (lossm)) - normalnorm=np.sum(dat[0,0]) + #normalnorm=np.sum(dat[0,0]) + normalnorm=np.sum(dat[0,FLAGS.predict_frame_start+1:,:,:,:]) print("normalnorm=%d" % (normalnorm)) - print("L2 percent loss=%g" % 100.0*(np.sqrt(float(lossm))/float(normalnorm))) + print("L2 percent loss=%g" % (100.0*(np.sqrt(float(lossm))/float(normalnorm)))) else: # track progress sys.stdout.write('.')