# layers.py

### Custom Lasagne Layers for Introspective Adversarial Networks
# A Brock, 2016
#
# Layers that are not my own creation should be appropriately attributed here
# MADE wrapped from the implementation by M. Germain et al: https://github.com/mgermain/MADE
# Gaussian Sample layer from Tencia Lee's Recipe: https://github.com/Lasagne/Recipes/blob/master/examples/variational_autoencoder/variational_autoencoder.py
# Minibatch Discrimination layer from OpenAI's Improved GAN Techniques: https://github.com/openai/improved-gan
# Deconv Layer adapted from Radford's DCGAN: https://github.com/Newmu/dcgan_code

from __future__ import division
import numpy as np
import theano
import theano.tensor as T
import lasagne
import lasagne.layers

from lasagne.layers import SliceLayer as SL
from lasagne.layers import batch_norm as BN
from lasagne.layers import ElemwiseSumLayer as ESL
from lasagne.layers import NonlinearityLayer as NL
from lasagne.layers import DenseLayer as DL
from lasagne.init import Normal as initmethod
from lasagne.nonlinearities import elu
from lasagne.nonlinearities import rectify as relu
from lasagne.nonlinearities import LeakyRectify as lrelu

from lasagne.layers import TransposedConv2DLayer as TC2D
from lasagne.layers import ConcatLayer as CL


from math import sqrt
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from mask_generator import MaskGenerator

# BatchReNorm Layer using cuDNN's BatchNorm
# This layer implements BatchReNorm (https://arxiv.org/abs/1702.03275), 
# which modifies BatchNorm to include running-average statistics in addition to 
# per-batch statistics in a well-principled manner. The RMAX and DMAX parameters 
# are clip parameters which should be fscalars that you'll need to manage in the training
# loop, as they follow an annealing schedule (an example of which is given in the paper).
# I've been adjusting this schedule based on the total number of iterations relative
# to the number given in the paper, so for a ~50,000 iteration training run, I anneal
# RMAX between 1k and 5k iterations rather than 5k and 25k. 

# NOTE: Ideally you should not have to manage RMAX and DMAX separately, so
# if someone wants to write a default_update similar to the one used for
# running_average and running_inv_std, that would be excellent.
class BatchReNormDNNLayer(lasagne.layers.BatchNormLayer):
    """Batch Renormalization layer built on cuDNN's batch-norm training op.

    Works like a batch-norm layer, except that whenever the running
    statistics are being updated, the cuDNN-normalized output is additionally
    scaled by a clipped factor ``r`` and shifted by a clipped offset ``d``:

        r = clip(running_inv_std / batch_inv_std, 1/RMAX, RMAX)
        d = clip((batch_mean - running_mean) * running_inv_std, -DMAX, DMAX)
        output = normalized * r + d

    Parameters
    ----------
    incoming
        Input layer (or shape), forwarded to ``BatchNormLayer``.
    RMAX, DMAX
        Clip bounds for ``r`` and ``d``. They are stored as given and used
        directly in the graph; the layer never updates them itself.
    axes, epsilon, alpha, beta, gamma, mean, inv_std, **kwargs
        Forwarded unchanged to ``lasagne.layers.BatchNormLayer``.

    Raises
    ------
    ValueError
        If the resolved normalization axes are neither ``(0,)`` nor all axes
        but the second one.
    """

    def __init__(self, incoming, RMAX,DMAX,axes='auto', epsilon=1e-4, alpha=0.1,
                 beta=lasagne.init.Constant(0), gamma=lasagne.init.Constant(1),
                 mean=lasagne.init.Constant(0), inv_std=lasagne.init.Constant(1), **kwargs):
        super(BatchReNormDNNLayer, self).__init__(
                incoming, axes, epsilon, alpha, beta, gamma, mean, inv_std,
                **kwargs)
        all_but_second_axis = (0,) + tuple(range(2, len(self.input_shape)))

        self.RMAX,self.DMAX = RMAX,DMAX

        # Only the two axis layouts handled by the 'per-activation' and
        # 'spatial' modes used in get_output_for are accepted.
        if self.axes not in ((0,), all_but_second_axis):
            raise ValueError("BatchReNormDNNLayer only supports normalization "
                             "across the first axis, or across all but the "
                             "second axis, got axes=%r" % (axes,))

    def get_output_for(self, input, deterministic=False,
                       batch_norm_use_averages=None,
                       batch_norm_update_averages=None, **kwargs):
        """Return the (re)normalized output expression for ``input``.

        ``batch_norm_use_averages`` defaults to ``deterministic`` and
        ``batch_norm_update_averages`` defaults to ``not deterministic``.
        The r/d correction is only applied when the averages are updated.
        """
        # Decide whether to use the stored averages or mini-batch statistics
        if batch_norm_use_averages is None:
            batch_norm_use_averages = deterministic
        use_averages = batch_norm_use_averages

        # Decide whether to update the stored averages
        if batch_norm_update_averages is None:
            batch_norm_update_averages = not deterministic
        update_averages = batch_norm_update_averages

        # prepare dimshuffle pattern inserting broadcastable axes as needed
        param_axes = iter(range(input.ndim - len(self.axes)))
        pattern = ['x' if input_axis in self.axes
                   else next(param_axes)
                   for input_axis in range(input.ndim)]
        # and prepare the converse pattern removing those broadcastable axes
        unpattern = [d for d in range(input.ndim) if d not in self.axes]

        # call cuDNN if needed, obtaining normalized outputs and statistics
        if not use_averages or update_averages:
            # cuDNN requires beta/gamma tensors; create them if needed
            shape = tuple(s for (d, s) in enumerate(input.shape)
                          if d not in self.axes)
            gamma = self.gamma or theano.tensor.ones(shape)
            beta = self.beta or theano.tensor.zeros(shape)
            mode = 'per-activation' if self.axes == (0,) else 'spatial'

            (normalized,
             input_mean,
             input_inv_std) = theano.sandbox.cuda.dnn.dnn_batch_normalization_train(
                    input, gamma.dimshuffle(pattern), beta.dimshuffle(pattern),
                    mode, self.epsilon)

        # normalize with stored averages, if needed
        if use_averages:
            mean = self.mean.dimshuffle(pattern)
            inv_std = self.inv_std.dimshuffle(pattern)
            gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
            beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
            normalized = (input - mean) * (gamma * inv_std) + beta

        # update stored averages, if needed
        if update_averages:
            # Trick: To update the stored statistics, we create memory-aliased
            # clones of the stored statistics:
            running_mean = theano.clone(self.mean, share_inputs=False)
            running_inv_std = theano.clone(self.inv_std, share_inputs=False)
            # set a default update for them:
            running_mean.default_update = ((1 - self.alpha) * running_mean +
                                           self.alpha * input_mean.dimshuffle(unpattern))
            running_inv_std.default_update = ((1 - self.alpha) *
                                              running_inv_std +
                                              self.alpha * input_inv_std.dimshuffle(unpattern))
            # The clones enter the graph through the renorm correction terms
            # below, which is also what lets their default_update be collected.
            r = T.clip(running_inv_std.dimshuffle(pattern)/input_inv_std,1/self.RMAX,self.RMAX)
            d = T.clip( (input_mean-running_mean.dimshuffle(pattern))*running_inv_std.dimshuffle(pattern),-self.DMAX,self.DMAX)
            # NOTE(review): `normalized` already includes gamma and beta here,
            # so r and d are applied after the affine transform -- confirm
            # this ordering is intended.
            normalized = normalized * r + d

        return normalized


# More Efficient MDCL layer
# When seeking to construct an MDC block, drop this into a Conv2D layer instead; it's faster.
# You can also easily re-parameterize this to a full-rank MDC block by dropping in the
# line that creates baseW into the for loop such that a new W is sampled each time.
def mdclW(num_filters,num_channels,filter_size,winit,name,scales):
    """Build a single multiscale dilated filter expression from one base filter.

    A shared base filter of shape (num_filters, num_channels, filter_size,
    filter_size) is written into a zero tensor once per dilation scale, each
    copy multiplied by its own per-output-filter coefficient vector.

    Parameters
    ----------
    num_filters, num_channels : int
        Leading two dimensions of the filter.
    filter_size : int
        Side length of the undilated base filter.
    winit : initializer with a ``sample(shape)`` method
        Used to draw the base filter values.
    name : str
        Prefix for the names of the shared variables that are created.
    scales : sequence of int
        Dilation factors. ``scales[-1]`` sets the overall filter size, so it
        is expected to be the largest scale.

    Returns
    -------
    Theano expression of shape (num_filters, num_channels, size, size).
    """
    # Coefficient Initializer
    sinit = lasagne.init.Constant(1.0/(1+len(scales)))
    max_scale = scales[-1]
    # Total filter size
    size = filter_size + (filter_size-1)*(max_scale-1)
    # Multiscale Dilated Filter
    W = T.zeros((num_filters,num_channels,size,size))
    # Undilated Base Filter
    baseW = theano.shared(lasagne.utils.floatX(winit.sample((num_filters,num_channels,filter_size,filter_size))),name=name+'.W')
    # Walk the scales backwards so that the smallest dilation is written last
    # and therefore ends up on top where the taps overlap.
    for scale in scales[::-1]:
        coeff = theano.shared(lasagne.utils.floatX(sinit.sample(num_filters)), name+'.coeff_'+str(scale))
        # NOTE(review): this offset arithmetic selects exactly filter_size
        # taps only for filter_size == 3 -- confirm before using other sizes.
        start = max_scale-scale
        stop = size-max_scale+scale
        W = T.set_subtensor(W[:,:,start:stop:scale,start:stop:scale],
                            baseW*coeff.dimshuffle(0,'x','x','x'))
    return W

# Subpixel Upsample Layer from (https://arxiv.org/abs/1609.05158)
# This layer uses a set of r^2 set_subtensor calls to reorganize the tensor in a subpixel-layer upscaling style
# as done in the ESPCN (Magic Pony) paper for super-resolution.
# r is the upscale factor.
# c is the number of output channels.
class SubpixelLayer(lasagne.layers.Layer):
    """Subpixel (pixel-shuffle) upsampling implemented with set_subtensor.

    Rearranges an input of shape (batch, c*r*r, H, W) into an output of shape
    (batch, c, r*H, r*W).

    Parameters
    ----------
    incoming : the input layer
    r : int, upscale factor
    c : int, number of output channels
    """
    def __init__(self, incoming,r,c, **kwargs):
        super(SubpixelLayer, self).__init__(incoming, **kwargs)
        self.r=r # Upscale factor
        self.c=c # number of output channels

    def get_output_shape_for(self, input_shape):
        # Spatial sizes that are unknown at build time (None) stay unknown.
        rows = None if input_shape[2] is None else self.r*input_shape[2]
        cols = None if input_shape[3] is None else self.r*input_shape[3]
        return (input_shape[0],self.c,rows,cols)

    def get_output_for(self, input, deterministic=False, **kwargs):
        # Size the buffer from the symbolic input shape so the layer also
        # works when the spatial dimensions are not fixed at build time.
        out = T.zeros((input.shape[0],self.c,self.r*input.shape[2],self.r*input.shape[3]))
        # Each (x, y) offset in an r-by-r output cell is filled from every
        # (r*r)-th input feature map starting at index r*x+y.
        for x in range(self.r):
            for y in range(self.r):
                out=T.set_subtensor(out[:,:,x::self.r,y::self.r],input[:,self.r*x+y::self.r*self.r,:,:])
        return out
# Subpixel Upsample Layer using reshapes as in https://github.com/Tetrachrome/subpixel. This implementation appears to be 10x slower than
# the set_subtensor implementation, presumably because of the extra reshapes after the splits.         
class SubpixelLayer2(lasagne.layers.Layer):
    """Subpixel upsampling built from reshape/split/concatenate operations.

    Same interface as SubpixelLayer: ``r`` is the upscale factor and ``c``
    the number of output channels.
    """
    def __init__(self, incoming,r,c, **kwargs):
        super(SubpixelLayer2, self).__init__(incoming, **kwargs)
        self.r, self.c = r, c

    def get_output_shape_for(self, input_shape):
        out_rows = self.r*input_shape[2]
        out_cols = self.r*input_shape[3]
        return (input_shape[0],self.c,out_rows,out_cols)

    def get_output_for(self, input, deterministic=False, **kwargs):
        def shuffle_group(group,r):
            # Rearrange one group of r*r feature maps into a single
            # (rows*r) x (cols*r) map.
            batch = group.shape[0]
            rows = self.output_shape[2]//r
            cols = self.output_shape[3]//r
            grid = T.reshape(group, (batch,r,r,rows,cols))
            grid = T.transpose(grid, (0, 3,4,1,2))  # batch, rows, cols, r, r
            # Peel off one row at a time and stack the pieces along axis 2.
            row_parts = T.split(x=grid,splits_size=[1]*rows,n_splits=rows,axis=1)
            row_parts = [T.reshape(part,(batch,cols,r,r)) for part in row_parts]
            stacked = T.concatenate(row_parts,axis=2)  # batch, cols, rows*r, r
            # Then do the same for the columns.
            col_parts = T.split(x=stacked,splits_size=[1]*cols,n_splits=cols,axis=1)
            col_parts = [T.reshape(part,(batch,rows*r,r)) for part in col_parts]
            image = T.concatenate(col_parts,axis=2)  # batch, rows*r, cols*r
            return image.dimshuffle(0,'x',1,2)

        # One group of input feature maps per output channel.
        group_size = input.shape[1]//self.c
        groups = T.split(x=input,splits_size=[group_size]*self.c,n_splits=self.c,axis=1)
        shuffled = [shuffle_group(group,self.r) for group in groups]
        return T.concatenate(shuffled,axis=1)

# Multiscale Dilated Convolution Block
# This function (not a layer in and of itself, though you could make it one) returns the elementwise sum of a set of conv2d and dilatedconv2d layers.
# Each layer uses the same basic filter W, operating at a different dilation factor (or taken as the mean of W for the 1x1 conv).
# The channel-wise output of each layer is weighted by a set of coefficients, which are initialized to 1 / (1 + the number of dilation scales),
# meaning that we're starting by taking an elementwise mean. These should be learnable parameters.

# NOTES: - I'm considering changing the variable names to be more descriptive, and look less like ridiculous academic code. It's on the to-do list.
#        - I keep the bias and nonlinearity out of the default definition for this layer, as I expect it to be batchnormed and nonlinearized in the model config.
def MDCL(incoming,num_filters,scales,name,dnn=True):
    """Build a Multiscale Dilated Convolution block on top of ``incoming``.

    One shared 3x3 filter W feeds an undilated convolution plus one branch
    per entry in ``scales``; each branch multiplies W by its own
    per-output-filter coefficient vector and the branch outputs are summed.

    Parameters
    ----------
    incoming : the input layer
    num_filters : int, number of output filters of every branch
    scales : sequence of int
        Dilation factors; 0 selects a 1x1 convolution using the spatial mean
        of W instead of a dilated convolution.
    name : str, prefix for layer and shared-variable names
    dnn : bool
        If True use the cuDNN convolution layer, otherwise fall back to
        ``lasagne.layers.Conv2DLayer``.

    Returns
    -------
    An ElemwiseSumLayer over all branches. No bias or nonlinearity is applied.
    """
    if dnn:
        from lasagne.layers.dnn import Conv2DDNNLayer as C2D
    else:
        # Without this fallback C2D would be unbound when dnn is False.
        from lasagne.layers import Conv2DLayer as C2D
    # W initialization method--this should also work as Orthogonal('relu'), but I have yet to validate that as thoroughly.
    winit = initmethod(0.02)

    # Initialization method for the coefficients
    sinit = lasagne.init.Constant(1.0/(1+len(scales)))

    def coeff(suffix):
        # One coefficient per output filter, as a new shared variable.
        return theano.shared(lasagne.utils.floatX(sinit.sample(num_filters)), name+suffix)

    # Number of incoming channels
    ni = lasagne.layers.get_output_shape(incoming)[1]

    # Weight parameter--the primary parameter for this block
    W = theano.shared(lasagne.utils.floatX(winit.sample((num_filters,ni,3,3))),name=name+'W')

    # Primary Convolution Layer--No Dilation
    n = C2D(incoming = incoming,
            num_filters = num_filters,
            filter_size = [3,3],
            stride = [1,1],
            pad = (1,1),
            W = W*coeff('_coeff_base').dimshuffle(0,'x','x','x'), # broadcast the per-filter scalars
            b = None,
            nonlinearity = None,
            name = name+'base')

    # Remaining branches, one per requested scale.
    nd = []
    for scale in scales:
        if scale==0:
            # 0 is used as a keyword for the 1x1 conv built from the mean of W.
            nd.append(C2D(incoming = incoming,
                          num_filters = num_filters,
                          filter_size = [1,1],
                          stride = [1,1],
                          pad = (0,0),
                          W = T.mean(W,axis=[2,3]).dimshuffle(0,1,'x','x')*coeff('_coeff_1x1').dimshuffle(0,'x','x','x'),
                          b = None,
                          nonlinearity = None,
                          name = name+str(scale)))
        else:
            # Note the dimshuffles in this layer--these are critical as the current DilatedConv2D implementation uses a backward pass.
            nd.append(lasagne.layers.DilatedConv2DLayer(
                          incoming = lasagne.layers.PadLayer(incoming = incoming, width=(scale,scale)),
                          num_filters = num_filters,
                          filter_size = [3,3],
                          dilation=(scale,scale),
                          W = W.dimshuffle(1,0,2,3)*coeff('_coeff_'+str(scale)).dimshuffle('x',0,'x','x'),
                          b = None,
                          nonlinearity = None,
                          name = name+str(scale)))
    return ESL(nd+[n])

# MDC-based Upsample Layer.
# This is a prototype I don't make use of extensively. It's operational but it doesn't seem to improve results yet.
def USL(incoming,num_filters,scales,name,dnn=True):
    """Build an MDC-based 2x upsampling block on top of ``incoming``.

    Every convolutional branch runs on a 2x ``Upscale2DLayer`` of the input
    and shares one 3x3 filter W, multiplied by a per-branch coefficient
    vector. A stride-2 ``DeconvLayer`` on the raw input is added as one more
    branch, and all branch outputs are summed.

    Parameters
    ----------
    incoming : the input layer
    num_filters : int, number of output filters of every branch
    scales : sequence of int
        Dilation factors; 0 selects the 1x1 mean-of-W convolution.
    name : str, prefix for layer and shared-variable names
    dnn : bool
        If True use the cuDNN convolution layer, otherwise fall back to
        ``lasagne.layers.Conv2DLayer``.

    Returns
    -------
    An ElemwiseSumLayer over all branches. No bias or nonlinearity is applied.
    """
    if dnn:
        from lasagne.layers.dnn import Conv2DDNNLayer as C2D
    else:
        # Without this fallback C2D would be unbound when dnn is False.
        from lasagne.layers import Conv2DLayer as C2D

    # W initialization method--this should also work as Orthogonal('relu'), but I have yet to validate that as thoroughly.
    winit = initmethod(0.02)

    # Initialization method for the coefficients
    sinit = lasagne.init.Constant(1.0/(1+len(scales)))

    def coeff(suffix):
        # One coefficient per output filter, as a new shared variable.
        return theano.shared(lasagne.utils.floatX(sinit.sample(num_filters)), name+suffix)

    def upscaled():
        # Upscale2DLayer is not imported at module level, so reach it through
        # the lasagne.layers namespace. A new layer is built for each branch.
        return lasagne.layers.Upscale2DLayer(incoming,2)

    # Number of incoming channels
    ni = lasagne.layers.get_output_shape(incoming)[1]

    # Weight parameter--the primary parameter for this block
    W = theano.shared(lasagne.utils.floatX(winit.sample((num_filters,ni,3,3))),name=name+'W')

    # Primary Convolution Layer--No Dilation
    n = C2D(incoming = upscaled(),
            num_filters = num_filters,
            filter_size = [3,3],
            stride = [1,1],
            pad = (1,1),
            W = W*coeff('_coeff_base').dimshuffle(0,'x','x','x'),
            b = None,
            nonlinearity = None,
            name = name+'base')

    # Remaining layers
    nd = []
    for scale in scales:
        if scale==0:
            nd.append(C2D(incoming = upscaled(),
                          num_filters = num_filters,
                          filter_size = [1,1],
                          stride = [1,1],
                          pad = (0,0),
                          W = T.mean(W,axis=[2,3]).dimshuffle(0,1,'x','x')*coeff('_coeff_1x1').dimshuffle(0,'x','x','x'),
                          b = None,
                          nonlinearity = None,
                          name = name+'1x1'))
        else:
            nd.append(lasagne.layers.DilatedConv2DLayer(
                          incoming = lasagne.layers.PadLayer(incoming = upscaled(), width=(scale,scale)),
                          num_filters = num_filters,
                          filter_size = [3,3],
                          dilation=(scale,scale),
                          W = W.dimshuffle(1,0,2,3)*coeff('_coeff_'+str(scale)).dimshuffle('x',0,'x','x'),
                          b = None,
                          nonlinearity = None,
                          name = name+str(scale)))

    # A single deconv layer on the un-upscaled input is also summed in here.
    # Like I said, it's a prototype!
    nd.append(DeconvLayer(incoming = incoming,
                          num_filters = num_filters,
                          filter_size = [3,3],
                          stride = [2,2],
                          crop = (1,1),
                          W = W.dimshuffle(1,0,2,3)*coeff('_coeff_deconv').dimshuffle('x',0,'x','x'),
                          b = None,
                          nonlinearity = None,
                          name = name+'deconv'))

    return ESL(nd+[n])

#MDC-based Downsample Layer.
# This is a prototype I don't make use of extensively. It's operational and it seems like it works alright, but it's restrictively expensive
# and I am not PARALLELICUS, god of GPUs, so I don't have the memory to spare for it.   
# Note that this layer does not currently support having a 0 scale like the others do, and just has a 1x1-stride2 conv by default.
def DSL(incoming,num_filters,scales,name,dnn=True):
    """Build an MDC-based 2x downsampling block on top of ``incoming``.

    Branches, all sharing one 3x3 filter W multiplied by a per-branch
    coefficient vector:
      * a stride-2 3x3 convolution on the input,
      * for each entry of ``scales``, a stride-2 average pool with that pool
        size followed by a stride-1 3x3 convolution,
      * a stride-2 1x1 convolution using the spatial mean of W.
    The branch outputs are summed.

    Parameters
    ----------
    incoming : the input layer
    num_filters : int, number of output filters of every branch
    scales : sequence of int, pool sizes of the pooled branches
    name : str, prefix for layer and shared-variable names
    dnn : bool
        If True use the cuDNN convolution and pooling layers, otherwise fall
        back to ``lasagne.layers.Conv2DLayer`` / ``Pool2DLayer``.

    Returns
    -------
    An ElemwiseSumLayer over all branches. No bias or nonlinearity is applied.
    """
    # P2D was referenced without ever being bound; bind it to the pooling
    # layer that matches the chosen convolution backend.
    if dnn:
        from lasagne.layers.dnn import Conv2DDNNLayer as C2D
        from lasagne.layers.dnn import Pool2DDNNLayer as P2D
    else:
        from lasagne.layers import Conv2DLayer as C2D
        from lasagne.layers import Pool2DLayer as P2D
    # W initialization method--this should also work as Orthogonal('relu'), but I have yet to validate that as thoroughly.
    winit = initmethod(0.02)

    # Initialization method for the coefficients
    sinit = lasagne.init.Constant(1.0/(1+len(scales)))

    def coeff(suffix):
        # One coefficient per output filter, as a new shared variable.
        return theano.shared(lasagne.utils.floatX(sinit.sample(num_filters)), name+suffix)

    # Number of incoming channels
    ni = lasagne.layers.get_output_shape(incoming)[1]

    # Weight parameter--the primary parameter for this block
    W = theano.shared(lasagne.utils.floatX(winit.sample((num_filters,ni,3,3))),name=name+'W')

    # Main layer--3x3 conv with stride 2
    n = C2D(incoming = incoming,
            num_filters = num_filters,
            filter_size = [3,3],
            stride = [2,2],
            pad = (1,1),
            W = W*coeff('_coeff_base').dimshuffle(0,'x','x','x'),
            b = None,
            nonlinearity = None,
            name = name+'base')

    nd = []
    for i,scale in enumerate(scales):
        # Only the first pooled branch is left unpadded.
        p = P2D(incoming = incoming,
                pool_size = scale,
                stride = 2,
                pad = (1,1) if i else (0,0),
                mode = 'average_exc_pad')

        nd.append(C2D(incoming = p,
                      num_filters = num_filters,
                      filter_size = [3,3],
                      stride = (1,1),
                      pad = (1,1),
                      W = W*coeff('_coeff_'+str(scale)).dimshuffle(0,'x','x','x'),
                      b = None,
                      nonlinearity = None,
                      name = name+str(scale)))

    # 1x1 stride-2 conv built from the spatial mean of W.
    nd.append(C2D(incoming = incoming,
                  num_filters = num_filters,
                  filter_size = [1,1],
                  stride = [2,2],
                  pad = (0,0),
                  W = T.mean(W,axis=[2,3]).dimshuffle(0,1,'x','x')*coeff('_coeff_1x1').dimshuffle(0,'x','x','x'),
                  b = None,
                  nonlinearity = None,
                  name = name+'1x1'))

    return ESL(nd+[n])

# Beta Distribution Layer   
# This layer takes two incoming layers of the same shape (alpha and beta) and returns alpha
# divided by the sum of alpha and beta, which is equivalent to finding the expected value for a beta distribution.
# Note that this version of the layer scales the result to [-1, 1] for compatibility with tanh.
class beta_layer(lasagne.layers.MergeLayer):
    """Merge layer returning 2 * alpha / (alpha + beta + 1e-8) - 1.

    Parameters
    ----------
    alpha, beta : the two incoming layers; the output has the shape of
        ``alpha``.
    """
    def __init__(self, alpha,beta, **kwargs):
        super(beta_layer, self).__init__([alpha,beta], **kwargs)

    def get_output_shape_for(self, input_shape):
        # input_shape holds one shape per incoming layer; the output takes
        # the shape of the first. (A leftover debug print was removed here.)
        return input_shape[0]

    def get_output_for(self, inputs, deterministic=False, **kwargs):
        alpha,beta = inputs
        # The 1e-8 keeps the denominator away from zero.
        return 2*(alpha/(alpha+beta+1e-8))-1

# Convenience Function to produce a residual pre-activation MDCL block        
def MDBLOCK(incoming,num_filters,scales,name,nonlinearity):
    """Return a residual pre-activation block made of two MDCL stages.

    The input is batch-normed and activated, passed through two MDCL blocks
    (with a batch-norm + nonlinearity between them), summed with the
    original input, then batch-normed and activated once more.
    """
    # First pre-activation and MDC stage
    activated_in = NL(BN(incoming,name=name+'bnorm0'),nonlinearity)
    stage_one = MDCL(activated_in,num_filters,scales,name)
    # Second pre-activation and MDC stage
    activated_mid = NL(BN(stage_one,name=name+'bnorm1'),nonlinearity)
    stage_two = MDCL(activated_mid,num_filters,scales,name+'2')
    # Residual sum followed by the closing batch-norm and nonlinearity
    residual_sum = ESL([incoming,stage_two])
    return NL(BN(residual_sum,name=name+'bnorm2'),nonlinearity)
              
# Gaussian Sample Layer for VAE from Tencia Lee
class GaussianSampleLayer(lasagne.layers.MergeLayer):
    """Sample ``mu + exp(logsigma) * noise`` with noise drawn from ``rng.normal``.

    In deterministic mode the layer returns ``mu`` unchanged.
    """
    def __init__(self, mu, logsigma, rng=None, **kwargs):
        # Create a random stream seeded from Lasagne's RNG unless one is given.
        if not rng:
            rng = RandomStreams(lasagne.random.get_rng().randint(1,2147462579))
        self.rng = rng
        super(GaussianSampleLayer, self).__init__([mu, logsigma], **kwargs)

    def get_output_shape_for(self, input_shapes):
        # The output has the shape of the first input (mu).
        mu_shape = input_shapes[0]
        return mu_shape

    def get_output_for(self, inputs, deterministic=False, **kwargs):
        mu, logsigma = inputs
        # Use the static sizes of mu's first two axes where known, falling
        # back to the symbolic sizes otherwise.
        static_shape = self.input_shapes[0]
        noise_shape = (static_shape[0] or mu.shape[0],
                       static_shape[1] or mu.shape[1])
        if not deterministic:
            return mu + T.exp(logsigma) * self.rng.normal(noise_shape)
        return mu

# DeconvLayer adapted from Radford's DCGAN Implementation
class DeconvLayer(lasagne.layers.conv.BaseConvLayer):
    """Transposed ("deconvolution") 2D layer using cuDNN's input-gradient op.

    The spatial output size is the input size multiplied by the stride along
    each axis; ``crop`` is passed to cuDNN as the border mode.

    Parameters are those of ``BaseConvLayer``, with ``crop`` taking the place
    of ``pad``.
    """
    def __init__(self, incoming, num_filters, filter_size, stride=(1, 1),
                 crop=0, untie_biases=False,
                 W=initmethod(), b=lasagne.init.Constant(0.),
                 nonlinearity=lasagne.nonlinearities.rectify, flip_filters=False,
                 **kwargs):
        super(DeconvLayer, self).__init__(
                incoming, num_filters, filter_size, stride, crop, untie_biases,
                W, b, nonlinearity, flip_filters, n=2, **kwargs)
        # rename self.crop to self.pad
        self.crop = self.pad
        del self.pad

    def get_W_shape(self):
        num_input_channels = self.input_shape[1]
        # first two sizes are swapped compared to a forward convolution
        return (num_input_channels, self.num_filters) + self.filter_size

    def get_output_shape_for(self, input_shape):
        # Mirror convolve(), which allocates an output of input size times
        # stride per spatial axis; a fixed factor of 2 was only right for
        # stride 2. Unknown (None) sizes stay unknown.
        batchsize = input_shape[0]
        spatial = tuple(None if size is None else size*factor
                        for size, factor in zip(input_shape[2:], self.stride))
        return (batchsize, self.num_filters) + spatial

    def convolve(self, input, **kwargs):
        # Messy to have these imports here, but seems to allow for switching DNN off.
        from theano.sandbox.cuda.basic_ops import gpu_contiguous, gpu_alloc_empty
        from theano.sandbox.cuda.dnn import GpuDnnConvDesc, GpuDnnConvGradI
        # Straight outta Radford
        img = gpu_contiguous(input)
        kerns = gpu_contiguous(self.W)
        # Empty output buffer: input spatial size times the stride.
        out = gpu_alloc_empty(img.shape[0], kerns.shape[1],
                              img.shape[2]*self.stride[0], img.shape[3]*self.stride[1])
        desc = GpuDnnConvDesc(border_mode=self.crop, subsample=self.stride,
                              conv_mode='conv')(out.shape, kerns.shape)
        # The transposed convolution is the gradient of a forward convolution
        # with respect to its input.
        conved = GpuDnnConvGradI()(kerns, img, out, desc)

        return conved
        
# Minibatch discrimination layer from OpenAI's improved GAN techniques       
class MinibatchLayer(lasagne.layers.Layer):
    """Minibatch discrimination layer.

    Projects each (flattened) sample through a weight-normalized tensor,
    measures L1 distances between all pairs of samples in the batch for each
    kernel, and appends ``num_kernels`` similarity features to the flattened
    input.
    """
    def __init__(self, incoming, num_kernels, dim_per_kernel=5, theta=lasagne.init.Normal(0.05),
                 log_weight_scale=lasagne.init.Constant(0.), b=lasagne.init.Constant(-1.), **kwargs):
        super(MinibatchLayer, self).__init__(incoming, **kwargs)
        self.num_kernels = num_kernels
        flat_size = int(np.prod(self.input_shape[1:]))
        kernel_shape = (num_kernels, dim_per_kernel)
        self.theta = self.add_param(theta, (flat_size,) + kernel_shape, name="theta")
        self.log_weight_scale = self.add_param(log_weight_scale, kernel_shape, name="log_weight_scale")
        # Effective weights: theta normalized over the input axis, then
        # rescaled by exp(log_weight_scale).
        theta_norm = T.sqrt(T.sum(T.square(self.theta),axis=0))
        scale = T.exp(self.log_weight_scale)/theta_norm
        self.W = self.theta * scale.dimshuffle('x',0,1)
        self.b = self.add_param(b, (num_kernels,), name="b")

    def get_output_shape_for(self, input_shape):
        flat_size = np.prod(input_shape[1:])
        return (input_shape[0], flat_size+self.num_kernels)

    def get_output_for(self, input, init=False, **kwargs):
        # Inputs with more than two dimensions are flattened into a batch of
        # feature vectors.
        if input.ndim > 2:
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])
        # Pairwise L1 distances per kernel; the large diagonal term makes a
        # sample's distance to itself vanish after exp(-x).
        pairwise = abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0))
        diagonal = 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1)
        abs_dif = T.sum(pairwise,axis=2) + diagonal

        if init:
            # Data-dependent initialization: rescale the distances and
            # record the matching update for log_weight_scale.
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif = abs_dif / mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if not init:
            f = f + self.b.dimshuffle('x',0)
        else:
            # Center the features and record the matching bias update.
            mf = T.mean(f,axis=0)
            f = f - mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))

        return T.concatenate([input, f], axis=1)

# Convenience function to define an inception-style block
def InceptionLayer(incoming,param_dict,block_name):
    """Build an inception-style block: parallel branches joined by a ConcatLayer.

    Parameters
    ----------
    incoming : the input layer shared by every branch
    param_dict : sequence of dicts, one per branch
        Each dict maps a setting name to a list with one entry per layer in
        the branch (see ``pd``). ``'style'`` selects the layer type per
        entry: ``'convolutional'``, ``'pool'``, ``'dilation'``, or anything
        else for a dense layer.
    block_name : str, prefix for the layer names

    Returns
    -------
    A ConcatLayer over the final layer of each branch.
    """
    branch = [0]*len(param_dict)
    # Loop across branches
    for i,params in enumerate(param_dict):
        for j,style in enumerate(params['style']): # Loop up branch
            # The first layer of a branch reads the block input; later ones
            # read the previous layer of the same branch.
            source = branch[i] if j else incoming
            layer_name = block_name+'_'+str(i)+'_'+str(j)
            # NOTE(review): initmethod is bound to lasagne.init.Normal at the
            # top of this file, so initmethod('relu') passes 'relu' as the
            # std -- confirm this is intended.
            if style=='convolutional':
                # C2D is not bound at module level; import it as the other
                # builders in this file do.
                from lasagne.layers.dnn import Conv2DDNNLayer as C2D
                branch[i] = C2D(
                    incoming = source,
                    num_filters = params['num_filters'][j],
                    filter_size = params['filter_size'][j],
                    pad = params['pad'][j] if 'pad' in params else None,
                    stride = params['stride'][j],
                    W = initmethod('relu'),
                    nonlinearity = params['nonlinearity'][j],
                    name = layer_name)
            elif style=='pool':
                # lasagne.layers does not import its dnn submodule itself,
                # so import the pooling layer explicitly.
                from lasagne.layers.dnn import Pool2DDNNLayer
                branch[i] = NL(Pool2DDNNLayer(
                    incoming = source,
                    pool_size = params['filter_size'][j],
                    mode = params['mode'][j],
                    stride = params['stride'][j],
                    pad = params['pad'][j],
                    name = layer_name),
                    nonlinearity = params['nonlinearity'][j])
            elif style=='dilation':
                # DilatedConv2DLayer gets its padding from a PadLayer.
                if 'pad' in params:
                    source = lasagne.layers.PadLayer(incoming = source,width = params['pad'][j])
                branch[i] = lasagne.layers.DilatedConv2DLayer(
                    incoming = source,
                    num_filters = params['num_filters'][j],
                    filter_size = params['filter_size'][j],
                    dilation = params['dilation'][j],
                    W = initmethod('relu'),
                    nonlinearity = params['nonlinearity'][j],
                    name = layer_name)
            else:
                branch[i] = DL(
                    incoming = source,
                    num_units = params['num_filters'][j],
                    W = initmethod('relu'),
                    b = None,
                    nonlinearity = params['nonlinearity'][j],
                    name = layer_name)
            # Apply Batchnorm
            if params['bnorm'][j]:
                branch[i] = BN(branch[i],name = block_name+'_bnorm_'+str(i)+'_'+str(j))

    # Concatenate the branches
    return CL(incomings=branch,name=block_name)

# Convenience function to define an inception-style block with upscaling    
def InceptionUpscaleLayer(incoming,param_dict,block_name):
    """Build an inception-style block whose branches upscale their input.

    Parameters
    ----------
    incoming : the input layer shared by every branch
    param_dict : sequence of dicts, one per branch
        Each dict maps a setting name to a list with one entry per layer in
        the branch (see ``pd``). A ``'style'`` entry of ``'convolutional'``
        builds a transposed convolution; any other value builds an
        Upscale2DLayer followed by stride-1 pooling and a nonlinearity.
    block_name : str, prefix for the layer names

    Returns
    -------
    A ConcatLayer over the final layer of each branch.
    """
    branch = [0]*len(param_dict)
    # Loop across branches
    for i,params in enumerate(param_dict):
        for j,style in enumerate(params['style']): # Loop up branch
            # The first layer of a branch reads the block input; later ones
            # read the previous layer of the same branch.
            source = branch[i] if j else incoming
            layer_name = block_name+'_'+str(i)+'_'+str(j)
            if style=='convolutional':
                # NOTE(review): initmethod is bound to lasagne.init.Normal at
                # the top of this file, so initmethod('relu') passes 'relu'
                # as the std -- confirm this is intended.
                branch[i] = TC2D(
                    incoming = source,
                    num_filters = params['num_filters'][j],
                    filter_size = params['filter_size'][j],
                    crop = params['pad'][j] if 'pad' in params else None,
                    stride = params['stride'][j],
                    W = initmethod('relu'),
                    nonlinearity = params['nonlinearity'][j],
                    name = layer_name)
            else:
                # lasagne.layers does not import its dnn submodule itself,
                # so import the pooling layer explicitly.
                from lasagne.layers.dnn import Pool2DDNNLayer
                # The 'stride' entry is used as the upscale factor here; the
                # pooling itself runs at stride 1.
                enlarged = lasagne.layers.Upscale2DLayer(
                    incoming = source,
                    scale_factor = params['stride'][j])
                branch[i] = NL(
                    incoming = Pool2DDNNLayer(
                        incoming = enlarged,
                        pool_size = params['filter_size'][j],
                        stride = [1,1],
                        mode = params['mode'][j],
                        pad = params['pad'][j],
                        name = layer_name),
                    nonlinearity = params['nonlinearity'][j])
            # Apply Batchnorm
            if params['bnorm'][j]:
                branch[i] = BN(branch[i],name = block_name+'_bnorm_'+str(i)+'_'+str(j))

    # Concatenate the branches
    return CL(incomings=branch,name=block_name)

# Convenience function to efficiently generate param dictionaries for use with InceptionLayer
def pd(num_layers=2,num_filters=32,filter_size=(3,3),pad=1,stride = (1,1),nonlinearity=elu,style='convolutional',bnorm=1,**kwargs):
    """Build a per-branch parameter dictionary for the inception helpers.

    Every setting (the named arguments other than num_layers, plus any extra
    keyword arguments such as mode) becomes a key of the returned dictionary.
    A value that is already a list is used as given; any other value is
    repeated num_layers times so that it can be indexed per sublayer.

    Extra keyword arguments are stored under their own names rather than
    nested under a single 'kwargs' key, so e.g. pd(mode='max') yields a
    'mode' entry.

    Returns:
        dict mapping each setting name to a list of per-sublayer values.
    """
    settings = {
        'num_filters': num_filters,
        'filter_size': filter_size,
        'pad': pad,
        'stride': stride,
        'nonlinearity': nonlinearity,
        'style': style,
        'bnorm': bnorm,
    }
    # Extra settings can never collide with the named arguments above.
    settings.update(kwargs)
    # .items() works on both Python 2 and Python 3.
    return {key: entry if isinstance(entry, list) else [entry]*num_layers
            for key, entry in settings.items()}

# Possible Conv2DDNN convenience function. Remember to delete the C2D import at the top if you use this    
# def C2D(incoming = None, num_filters = 32, filter_size= [3,3],pad = 'same',stride = [1,1], W = initmethod('relu'),nonlinearity = elu,name = None):
    # return lasagne.layers.dnn.Conv2DDNNLayer(incoming,num_filters,filter_size,stride,pad,False,W,None,nonlinearity,False)

# Shape-Preserving Gaussian Sample layer for latent vectors with spatial dimensions.
# This is a holdover from an "old" (i.e. I abandoned it last month) idea. 
class GSL(lasagne.layers.MergeLayer):
    """Gaussian sample layer that keeps the shape of its mean input.

    Merges a mean layer and a log-sigma layer. In deterministic mode the
    output is the mean; otherwise it is mu + exp(logsigma) * noise, with the
    noise drawn from self.rng.normal in the shape of logsigma.

    Arguments:
        mu: layer producing the mean.
        logsigma: layer producing the log of the standard deviation.
        rng: optional random stream; when not given, a RandomStreams seeded
            from Lasagne's global RNG is created.
    """
    def __init__(self, mu, logsigma, rng=None, **kwargs):
        self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))
        super(GSL, self).__init__([mu, logsigma], **kwargs)

    def get_output_shape_for(self, input_shape):
        # input_shape holds the shapes of (mu, logsigma); the output takes
        # the shape of the first one.
        return input_shape[0]

    def get_output_for(self, inputs, deterministic=False, **kwargs):
        mu, logsigma = inputs
        if deterministic:
            return mu
        return mu + T.exp(logsigma) * self.rng.normal(logsigma.shape)

# Convenience function to return list of sampled latent layers
def GL(mu,ls):
    """Return one GSL layer per (mu, ls) pair, pairing the two sequences in order.

    Pairing stops at the end of the shorter sequence.
    """
    sampled = []
    for z_mu, z_ls in zip(mu, ls):
        sampled.append(GSL(z_mu, z_ls))
    return sampled

# Convenience function to return a residual layer. It's not really that much more convenient than ESL'ing,
# but I like being able to see when I'm using Residual connections as opposed to Elemwise-sums    
def ResLayer(incoming, IB,nonlinearity):
    """Return nonlinearity applied to the elementwise sum of IB and incoming."""
    summed = ESL([IB, incoming])
    return NL(summed, nonlinearity)


# Inverse autoregressive flow layer       
class IAFLayer(lasagne.layers.MergeLayer):
    """Merge layer computing (z - mu) / exp(logsigma) from its three inputs.

    The output shape is the shape of the first input, z.
    """
    def __init__(self, z, mu, logsigma, **kwargs):
        incomings = [z, mu, logsigma]
        super(IAFLayer, self).__init__(incomings, **kwargs)

    def get_output_shape_for(self, input_shapes):
        z_shape = input_shapes[0]
        return z_shape

    def get_output_for(self, inputs, deterministic=False, **kwargs):
        # deterministic is accepted but has no effect on the result.
        z, mu, logsigma = inputs
        centered = z - mu
        scale = T.exp(logsigma)
        return centered / scale

# Masked layer for MADE, adapted from M. Germain
class MaskedLayer(lasagne.layers.DenseLayer):
    """Dense layer whose weights are multiplied elementwise by a mask.

    The mask is registered as a non-trainable, non-regularizable parameter
    initialised to ones. shuffle_update holds the (mask, new value) pair
    obtained from mask_generator.get_mask_layer_UPDATE(layerIdx), for use as
    a Theano update.
    """

    def __init__(self, incoming, num_units, mask_generator,layerIdx,W=lasagne.init.GlorotUniform(),
                 b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.rectify, **kwargs):
        super(MaskedLayer, self).__init__(incoming, num_units, W,b, nonlinearity,**kwargs)
        self.mask_generator = mask_generator
        self.layerIdx = layerIdx

        # The mask has the same shape as W: (flattened input size, num_units).
        mask_shape = (int(np.prod(self.input_shape[1:])), num_units)
        self.weights_mask = self.add_param(spec = np.ones(mask_shape,dtype=np.float32),
                                           shape = mask_shape,
                                           name='weights_mask',
                                           trainable=False,
                                           regularizable=False)

        mask_value = mask_generator.get_mask_layer_UPDATE(layerIdx)
        self.shuffle_update = [(self.weights_mask, mask_value)]

    def get_output_for(self,input, **kwargs):
        # Collapse any trailing dimensions so the input is a matrix.
        flat = input.flatten(2) if input.ndim > 2 else input
        pre_activation = T.dot(flat, self.W*self.weights_mask)
        if self.b is None:
            return self.nonlinearity(pre_activation)
        return self.nonlinearity(pre_activation + self.b.dimshuffle('x', 0))

        
# Stripped-Down Direct Input masked layer: Combine this with ESL and a masked layer to get a true DIML.
# Consider making this a simultaneous subclass of MaskedLayer and elemwise sum layer for cleanliness
#  adopted from M.Germain  
class DIML(lasagne.layers.DenseLayer):
    """Direct-input masked dense layer.

    Works like a dense layer whose weights are multiplied elementwise by a
    non-trainable mask initialised to ones. shuffle_update holds the
    (mask, new value) pair obtained from
    mask_generator.get_direct_input_mask_layer_UPDATE(layerIdx + 1), for use
    as a Theano update.
    """

    def __init__(self, incoming, num_units, mask_generator,layerIdx,W=lasagne.init.GlorotUniform(),
                 b=lasagne.init.Constant(0.), nonlinearity=None,**kwargs):
        super(DIML, self).__init__(incoming, num_units, W,b, nonlinearity,**kwargs)

        self.mask_generator = mask_generator
        self.layerIdx = layerIdx

        # The mask has the same shape as W: (flattened input size, num_units).
        mask_shape = (int(np.prod(self.input_shape[1:])), num_units)
        self.weights_mask = self.add_param(spec = np.ones(mask_shape,dtype=np.float32),
                                           shape = mask_shape,
                                           name='weights_mask',
                                           trainable=False,
                                           regularizable=False)

        # The direct-input mask is requested at index layerIdx + 1.
        mask_value = mask_generator.get_direct_input_mask_layer_UPDATE(layerIdx + 1)
        self.shuffle_update = [(self.weights_mask, mask_value)]

    def get_output_for(self,input, **kwargs):
        # Collapse any trailing dimensions so the input is a matrix.
        flat = input.flatten(2) if input.ndim > 2 else input
        pre_activation = T.dot(flat, self.W*self.weights_mask)
        if self.b is None:
            return self.nonlinearity(pre_activation)
        return self.nonlinearity(pre_activation + self.b.dimshuffle('x', 0))

# Conditioning Masked Layer 
# Currently not used.       
# class CML(MaskedLayer):

    # def __init__(self, incoming, num_units, mask_generator,use_cond_mask=False,U=lasagne.init.GlorotUniform(),W=lasagne.init.GlorotUniform(),
                 # b=init.Constant(0.), nonlinearity=lasagne.nonlinearities.rectify, **kwargs):
        # super(CML, self).__init__(incoming, num_units, mask_generator,W,
                 # b, nonlinearity,**kwargs)
        
        # self.use_cond_mask=use_cond_mask
        # if use_cond_mask:            
            # self.U = self.add_param(spec = U,
                                    # shape = (num_inputs, num_units),
                                    # name='U',
                                    # trainable=True,
                                    # regularizable=False)theano.shared(value=self.weights_initialization((self.n_in, self.n_out)), name=self.name+'U', borrow=True)
            # self.add_param(self.U,name = 
    # def get_output_for(self,input,**kwargs):
       # lin = self.lin_output = T.dot(input, self.W * self.weights_mask) + self.b  
       # if self.use_cond_mask:
           # lin = lin+T.dot(T.ones_like(input), self.U * self.weights_mask)
       # return lin if self._activation is None else self._activation(lin) 


# MADE layer, adapted from M. Germain
class MADE(lasagne.layers.Layer):
    """Stack of masked dense layers (MADE) wrapped as a single Lasagne layer.

    One MaskedLayer is built per entry of hidden_sizes on top of z, followed
    by an output MaskedLayer and a DIML that reads z directly. The two output
    sublayers are summed with an elementwise-sum layer to give the output.
    All sublayers share one MaskGenerator; update_masks is a compiled Theano
    function that applies every sublayer's shuffle_update.

    Arguments:
        z: incoming layer; the second entry of its output shape sets the
            number of units of both output sublayers.
        hidden_sizes: number of units of each hidden layer (at least one entry).
        name: prefix used when naming the sublayers.
        nonlinearity: nonlinearity of the input and hidden sublayers.
        output_nonlinearity: nonlinearity passed to both output sublayers.
    """
    def __init__(self,z,hidden_sizes,name,nonlinearity=lasagne.nonlinearities.rectify,output_nonlinearity=None, **kwargs):
        super(MADE, self).__init__(z, **kwargs)

        self.z = z                                      # Incoming latents
        self.hidden_sizes = hidden_sizes                # Hidden units in each layer
        self.name = name                                # Layer name for saving parameters
        self.nonlinearity = nonlinearity
        self.output_nonlinearity = output_nonlinearity
        self.shuffled_once = False

        # Control parameter from original MADE
        mask_distribution = 0
        num_latents = lasagne.layers.get_output_shape(z)[1]

        # Mask generator shared by every sublayer
        self.mask_generator = MaskGenerator(num_latents, hidden_sizes, mask_distribution)

        # Build the MADE
        # TODO: Consider making this more compact by directly writing to the layers list
        self.input_layer = MaskedLayer(incoming = z,
                                       num_units = hidden_sizes[0],
                                       mask_generator = self.mask_generator,
                                       layerIdx = 0,
                                       W = lasagne.init.Orthogonal('relu'),
                                       nonlinearity = self.nonlinearity,
                                       name = self.name+'_input')
        self.layers = [self.input_layer]

        # Remaining hidden layers, each stacked on the previous one
        for idx, width in enumerate(hidden_sizes[1:], 1):
            hidden = MaskedLayer(incoming = self.layers[-1],
                                 num_units = width,
                                 mask_generator = self.mask_generator,
                                 layerIdx = idx,
                                 W = lasagne.init.Orthogonal('relu'),
                                 nonlinearity = self.nonlinearity,
                                 name = self.name+'_layer_'+str(idx))
            self.layers.append(hidden)

        # Output layer: a masked layer on top of the stack plus a direct-input
        # masked layer reading z
        output_idx = len(self.layers)
        masked_out = MaskedLayer(incoming = self.layers[-1],
                                 num_units = num_latents,
                                 mask_generator = self.mask_generator,
                                 layerIdx = output_idx,
                                 W = lasagne.init.Orthogonal('relu'),
                                 nonlinearity = self.output_nonlinearity,
                                 name = self.name+'_output_W')
        direct_out = DIML(incoming = z,
                          num_units = num_latents,
                          mask_generator = self.mask_generator,
                          layerIdx = output_idx,
                          W = lasagne.init.Orthogonal('relu'),
                          nonlinearity = self.output_nonlinearity,
                          name = self.name+'_output_D')
        self.layers.extend([masked_out, direct_out])

        # Collect the mask updates of every sublayer into one compiled function
        masks_updates = []
        for sublayer in self.layers:
            masks_updates.extend(sublayer.shuffle_update)
        self.update_masks = theano.function(name='update_masks',
                                            inputs=[],
                                            updates=masks_updates)

        # Make the true output layer by ESL'ing the DIML and masked layer
        self.final_layer = ESL([masked_out, direct_out])

    def get_output_for(self, input, deterministic=False, **kwargs):
        """Return the output of the summed output layer with input substituted for z."""
        return lasagne.layers.get_output(self.final_layer, inputs={self.z: input})

    def get_params(self, unwrap_shared=True, **tags):
        """Return the parameters of all sublayers, in layer order, filtered by tags.

        unwrap_shared is accepted but not forwarded to the sublayers.
        """
        return [param
                for sublayer in self.layers
                for param in sublayer.get_params(**tags)]

    def shuffle(self, shuffling_type):
        """Regenerate masks according to shuffling_type and push them to the layers.

        "Once" reshuffles the ordering and resamples connectivity the first
        time only; "Ordering", "Connectivity" and "Full" select which of the
        two steps run. The masks are written via update_masks in every case.
        """
        first_once = shuffling_type == "Once" and self.shuffled_once is False
        if first_once:
            reorder = resample = True
        else:
            reorder = shuffling_type in ("Ordering", "Full")
            resample = shuffling_type in ("Connectivity", "Full")

        if reorder:
            self.mask_generator.shuffle_ordering()
        if resample:
            self.mask_generator.sample_connectivity()
        self.update_masks()

        if first_once:
            self.shuffled_once = True

    def reset(self, shuffling_type, last_shuffle=0):
        """Reset the mask generator, do one full shuffle, then replay last_shuffle shuffles."""
        self.mask_generator.reset()

        # Always do a first shuffle so that the natural order does not give us an edge
        self.shuffle("Full")

        # Set the mask to the requested shuffle
        for _ in range(last_shuffle):
            self.shuffle(shuffling_type)