Merge pull request #21 from fujii-team/tf12
Tf12
fujiisoup authored Jan 10, 2017
2 parents dbed0ee + ed16ff2 commit ae888b1
Showing 24 changed files with 700 additions and 957 deletions.
23 changes: 23 additions & 0 deletions .travis.yml
@@ -0,0 +1,23 @@
sudo: required
dist: trusty
language: python
python:
- 3.5
install:
- sudo apt-get update
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- hash -r
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
# Useful for debugging any issues with conda
- conda info -a

# Replace dep1 dep2 ... with your dependencies
- conda create -q -n test-environment python=3.5 numpy scipy pandas nose
- source activate test-environment
- pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.1-cp35-cp35m-linux_x86_64.whl
- python setup.py install
script:
- nosetests --nologcapture testing
2 changes: 1 addition & 1 deletion Henbun/_version.py
@@ -13,4 +13,4 @@
# limitations under the License.


__version__ = "0.0.1"
__version__ = "0.1"
158 changes: 90 additions & 68 deletions Henbun/gp/gp.py
@@ -8,16 +8,15 @@

class GP(Parameterized):
"""
A vanila implementation, in order to sample from the Gaussian Process
posterior.
An implementation to sample from the Gaussian Process posterior.
The posterior mean and covariance are represented by
+ mean: L*u.q_mu
+ covariance: (L*u.q_sqrt)*(L*u.q_sqrt)^T
where
L is cholesky factor of the GP kernel, K(x,x) = L L^T
u.q_mu is the mean of the whitened variational posterior, u
u.q_sqrt is the cholesky factor the whitened variational posterior, u,
u.q_mu is the mean of the variational posterior, u
u.q_sqrt is the cholesky factor of the variational posterior, u,
u ~ N(q_mu, q_sqrt*q_sqrt^T)
This class does not consider mean functions.
@@ -40,15 +39,15 @@ def samples(self, x, u):
Draw samples from the posterior, with given coordinate variables x.
args:
+ x: Coordinate variables sized [n, d].
+ u: Variational parameters, such as Normal or Gaussian, sized [n, N].
+ u: Variational parameters, such as Normal or Gaussian, sized [N,n].
returns:
+ samples: Samples from the posterior sized [N,n]
Note:
The first axis of x and the last axis of u should both be n.
"""
L = self.kern.Cholesky(x) # sized [n,n]
return tf.matmul(L, u) # sized [n,N]
return tf.matmul(u, L, transpose_b=True) # sized [N,n]
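
As an aside (not part of the diff), a minimal NumPy sketch of the whitened sampling used in GP.samples above, assuming a toy 1-D RBF kernel and hypothetical q_mu/q_sqrt values: a posterior sample is u @ L^T, with L the Cholesky factor of K(x,x) and u ~ N(q_mu, q_sqrt q_sqrt^T).

import numpy as np

rng = np.random.RandomState(0)
n, N = 5, 3                                    # n grid points, N independent samples
x = np.linspace(0.0, 1.0, n)[:, None]          # coordinates, sized [n, 1]
K = np.exp(-0.5 * (x - x.T) ** 2) + 1e-6 * np.eye(n)   # RBF kernel plus jitter
L = np.linalg.cholesky(K)                      # [n, n], lower triangular

q_mu = np.zeros((N, n))                        # whitened variational mean, [N, n]
q_sqrt = 0.1 * np.tile(np.eye(n), (N, 1, 1))   # whitened Cholesky factors, [N, n, n]
u = q_mu + np.einsum('kij,kj->ki', q_sqrt, rng.randn(N, n))  # u ~ N(q_mu, q_sqrt q_sqrt^T)

samples = u @ L.T     # same as tf.matmul(u, L, transpose_b=True), sized [N, n]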


class SparseGP(GP):
@@ -60,23 +59,21 @@ class SparseGP(GP):
the variational posterior is represented by N(m,S)
with
m = Knm*Lm^-T*u.q_mu
S = Knm*Kmm^-1*Kmn + (Knm*Lm^-T*u.q_sqrt)^2
S = (Knn-Knm*Kmm^-1*Kmn) + (Knm*Lm^-T*u.q_sqrt)^2
where
Knm = K(x,z) sized [N,n,m]
Kmm = K(z,z) sized [m,m]
Lm = cholesky(K(z,z)) sized [m,m]
Due to the sparse approximation, an additional variance should be considered,
Knm*Kmm^-1*Kmn
Knn-Knm*Kmm^-1*Kmn
We support the following three approximations,
+ diagonal: The first term of S is approximated by diagonalization.
This option is default.
+ neglected: The first term of S is neglected.
Do not use this option in the learning phase, because it removes
the z dependence of ELBO.
+ fullrank: The first term of S is fully factorized.
This option might be very slow and GP class may be faster.
This option might be very slow and even GP class may be faster.
This class does not consider mean functions.
mean_functions should be added manually.
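
For reference (not part of the diff), a small NumPy sketch of the posterior mean m and covariance S written in the docstring above, assuming a toy 1-D RBF kernel and hypothetical q_mu/q_sqrt values:

import numpy as np

def rbf(a, b):                                   # toy 1-D RBF kernel
    return np.exp(-0.5 * (a[:, None, 0] - b[None, :, 0]) ** 2)

n, m = 6, 3
x = np.linspace(0.0, 1.0, n)[:, None]            # prediction points, [n, 1]
z = np.linspace(0.0, 1.0, m)[:, None]            # inducing points,   [m, 1]

Kmm = rbf(z, z) + 1e-6 * np.eye(m)
Knm, Knn = rbf(x, z), rbf(x, x)
Lm = np.linalg.cholesky(Kmm)

q_mu = np.zeros(m)                               # whitened variational mean
q_sqrt = 0.1 * np.eye(m)                         # whitened variational Cholesky factor

A = Knm @ np.linalg.inv(Lm).T                    # Knm Lm^-T, sized [n, m]
mean = A @ q_mu                                  # m = Knm Lm^-T q_mu
cov = (Knn - A @ A.T) + (A @ q_sqrt) @ (A @ q_sqrt).T   # S; note Knm Kmm^-1 Kmn = A A^T
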
@@ -97,74 +94,99 @@ def __init__(self, kern, z, collections=[graph_key.VARIABLES]):
# inducing points
self.z = Variable(shape=z.shape, collections=collections)
self.z = z # set the initial value
self.m = len(z)

def samples(self, x, u, q_shape='diagonal'):
"""
Returns samples from GP.
args:
+ x: coordinate variables, sized [n,d] or [n,d,N].
+ u: inducing point values, sized [m,N]
+ x: coordinate variables, sized [n,d] or [N,n,d].
+ u: inducing point values, sized [N,m]
+ q_shape: How to approximate the covariance, Knn-Knm Kmm^-1 Kmn term.
Should be one of ['diagonal', 'neglected', 'fullrank'].
'diagonal': Diagonalize this term (default).
'neglected' : Neglect this term. Do not use in the learning phase.
(No dependence on z).
'fullrank': Fully diagonalize this term.
'neglected' : Neglect this term.
'fullrank': Fully factorize this term.
Very slow.
"""
assert(q_shape in ['diagonal','neglected','fullrank'])
jitter = settings.numerics.jitter_level
N = tf.shape(u)[-1]
N = tf.shape(u)[0]
# Cholesky factor of K(z,z)
Lm = self.kern.Cholesky(self.z) # sized [m,m]
I = eye(tf.shape(self.z)[0])
Lminv = tf.transpose(tf.matrix_triangular_solve(Lm, I)) # [m,m]

# Non-batch case x:[n,d]. Return shape [n,N]
def sample():
Ln = tf.matmul(self.kern.K(x, self.z), Lminv) # sized [n,m]
samples = tf.matmul(Ln, u) # sized [n,N]
# additional variance due to the sparse approximation.
if q_shape is 'diagonal':
diag_var = jitter + \
tf.expand_dims(self.kern.Kdiag(x) - tf.reduce_sum(tf.square(Ln), -1), -1) # [n,1]
return samples + \
tf.sqrt(tf.abs(diag_var)) * tf.random_normal([tf.shape(x)[0], N],
dtype=float_type)
elif q_shape is 'neglected':
return samples
else: # fullrank
Knn = tf.tile(tf.expand_dims(self.kern.K(x),0), [N,1,1])
jitterI = tf.tile(tf.expand_dims(eye(tf.shape(x)[0]),0), [N,1,1])
chol = tf.cholesky(Knn - tf.batch_matmul(Ln, Ln, adj_y=True) + jitterI) # [N,n,n]
return samples + \
tf.transpose(
tf.squeeze(tf.batch_matmul(
chol, tf.random_normal([N, tf.shape(x)[0], 1], dtype=float_type)),
[-1])) # [N,n,1] -> [N,n] -> [n,N]

# Batch case. x:[n,d,N]. Return shape [n,N]
def sample_batch():
z = tf.tile(tf.expand_dims(self.z, -1), [1,1,N])
Ln = tf.batch_matmul(self.kern.K(x, z), tf.tile(tf.expand_dims(Lminv, 0), [N,1,1]))
samples = tf.transpose(tf.squeeze( # [N,n,1] -> [n,N]
tf.batch_matmul(Ln, tf.expand_dims(tf.transpose(u), -1)), [-1]))
if q_shape is 'diagonal':
# additional variance due to the sparse approximation.
diag_var = jitter + \
tf.transpose(self.kern.Kdiag(x) - tf.reduce_sum(tf.square(Ln), -1))
return samples + \
tf.sqrt(tf.abs(diag_var)) * tf.random_normal([tf.shape(x)[0], N], dtype=float_type)
elif q_shape is 'neglected':
return samples
else: # fullrank case
Knn = self.kern.K(x)
jitterI = tf.tile(tf.expand_dims(eye(tf.shape(x)[0]),0), [N,1,1])
chol = tf.cholesky(Knn - tf.batch_matmul(Ln, Ln, adj_y=True) + jitterI) # [N,n,n]
return samples + \
tf.transpose(
tf.squeeze(tf.batch_matmul(
chol, tf.random_normal([N, tf.shape(x)[0], 1], dtype=float_type)),
[-1])) # [N,n,1] -> [N,n] -> [n,N]

return tf.cond(tf.equal(tf.rank(x),2), sample, sample_batch)

# TODO insert assertion for shape difference

LnT = self._effective_LT(x)
if x.get_shape().ndims==2:
samples = tf.matmul(u, LnT) # sized [N,n]
elif x.get_shape().ndims==3:
samples = tf.squeeze(
tf.matmul(tf.expand_dims(u,1), LnT), [1]) # [N,1,m]*[N,m,n]->[N,n]

if q_shape is 'neglected':
return samples
elif q_shape is 'diagonal':
diag_cov = self._additional_cov(x, LnT, 'diagonal')
return samples + tf.sqrt(tf.abs(diag_cov)) \
* tf.random_normal(tf.shape(x)[:-1], dtype=float_type)
else: # 'fullrank'
jitterI = eye(tf.shape(x)[-2]) * jitter
chol = tf.cholesky(self._additional_cov(x, LnT, 'fullrank') + jitterI) # [n,n]
if x.get_shape().ndims==2:
return samples + tf.matmul(
tf.random_normal([N,tf.shape(x)[0]], dtype=float_type), # [N,n]
chol, transpose_b=True) # [N,n]@[n,n] -> [N,n]
elif x.get_shape().ndims==3:
return samples + tf.squeeze(tf.matmul(
tf.random_normal([N, 1,tf.shape(x)[1]], dtype=float_type),
chol, transpose_b=True), [1])


def _effective_LT(self, x):
"""
Returns the effective cholesky factor,
LnT = L^-1 K_mn (the transpose of K_nm L^-T),
with L L^T = K_mm.
args:
+ x : coordinate for the prediction, sized [N,n,d] or [n,d]
"""
# Cholesky factor of K(z,z)
Lm = self.kern.Cholesky(self.z) # sized [m,m]
if x.get_shape().ndims==2:
# [m,n] -> [n,m]
return tf.matrix_triangular_solve(Lm, self.kern.K(self.z, x))
#Lminv = tf.matrix_triangular_solve(Lm, eye(self.m)) # [m,m]
#return tf.matmul(Lminv, self.kern.K(self.z, x)) # [m,m]@[m,n]->[m,n]

# batch case
elif x.get_shape().ndims==3:
N = tf.shape(x)[0]
Lminv = tf.matrix_triangular_solve(Lm, eye(self.m)) # [m,m]
z = tf.tile(tf.expand_dims(self.z, 0), [N,1,1])
return tf.matmul(tf.tile(tf.expand_dims(Lminv, 0), [N,1,1]), #[N,m,m]
self.kern.K(z, x)) # [N,m,m]@[N,m,n] -> [N,m,n]

raise ValueError('shape is not specified for tensor x')
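
A brief NumPy sketch (not part of the diff) of the non-batch branch above: the triangular solve yields L^-1 K_mn directly, without forming Lm^-1. The kernel and data here are illustrative placeholders.

import numpy as np
from scipy.linalg import solve_triangular

def rbf(a, b):                                       # toy 1-D RBF kernel
    return np.exp(-0.5 * (a[:, None, 0] - b[None, :, 0]) ** 2)

rng = np.random.RandomState(1)
m, n = 3, 5
z, x = rng.randn(m, 1), rng.randn(n, 1)

Lm = np.linalg.cholesky(rbf(z, z) + 1e-6 * np.eye(m))
LnT = solve_triangular(Lm, rbf(z, x), lower=True)    # L^-1 K_mn, sized [m, n]
assert np.allclose(Lm @ LnT, rbf(z, x))              # Lm (L^-1 K_mn) recovers K_mn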


def _additional_cov(self, x, LnT, q_shape):
"""
Returns the additional GP covariance due to the sparse approximation.
Knn-Knm*Kmm^-1*Kmn
args:
+ x : coordinate for the prediction, sized [N,n,d] or [n,d]
+ LnT: Effective Cholesky factor, L^-1 Kmn
+ q_shape: 'diagonal' or 'fullrank'. If it is 'diagonal', it returns
only the diagonal part of the covariance.
"""
if q_shape is 'diagonal':
return self.kern.Kdiag(x) - tf.reduce_sum(tf.square(LnT), -2) # [N,n]
else:
Knn = self.kern.K(x) # [N,n,n] or [n,n]
return Knn - tf.matmul(LnT, LnT, transpose_a=True)
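
To tie the two helpers together (not part of the diff), a minimal NumPy sketch of the 'diagonal' path of samples: the residual covariance Knn - Knm Kmm^-1 Kmn from _additional_cov is kept only on its diagonal and injected as independent Gaussian noise. Here LnT is a random placeholder standing in for _effective_LT's output.

import numpy as np

rng = np.random.RandomState(2)
m, n, N = 3, 5, 4
LnT = 0.3 * rng.randn(m, n)                  # placeholder for Lm^-1 K(z, x)
Kdiag = np.ones(n)                           # unit stationary kernel: Kdiag is all ones
u = rng.randn(N, m)                          # whitened inducing values, [N, m]

samples = u @ LnT                            # [N, m] @ [m, n] -> [N, n]
diag_cov = Kdiag - np.sum(LnT ** 2, axis=0)  # diag of Knn - Knm Kmm^-1 Kmn
samples = samples + np.sqrt(np.abs(diag_cov)) * rng.randn(N, n)
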
54 changes: 17 additions & 37 deletions Henbun/gp/kernels.py
@@ -18,10 +18,10 @@
import tensorflow as tf
import numpy as np
from .. import transforms
from ..tf_wraps import eye
from .._settings import settings
from ..param import Variable, Parameterized, graph_key
from ..variationals import Variational
from ..tf_wraps import eye
from .._settings import settings
float_type = settings.dtypes.float_type
np_float_type = np.float32 if float_type is tf.float32 else np.float64

@@ -56,64 +56,48 @@ def square_dist(self, X, X2=None):
Returns the square distance between X and X2.
X, X2 is 2d- or 3d-tensor.
If X is (and X2 should be the same dim.) 2dimensional,
If X is (and X2 should have the same dimension) 2-dimensional,
each dimension is considered as [n,d] and [n2,d]
- n: number of data points
- d: dimension of each data
This method returns [n,n2] sized kernel value.
If X2 is None, then X2 = X is assumed.
If X is 3-dimensional,
each dimension is considered as [n,d,N] (and [n2,d,N] for X2)
each dimension is considered as [N,n,d] (and [N,n2,d] for X2)
- N: batch number.
Returns: [N,n,n2] sized kernel value.
"""
# match lengthscales in batched case
def fn1(): return self.lengthscales
def fn2(): return tf.expand_dims(self.lengthscales, -1)
l = tf.cond(tf.equal(tf.rank(X),2), fn1, fn2)

Xt = tf.transpose(X/l) # [d,n] or [N,d,n]
Xs = tf.reduce_sum(tf.square(Xt), -2) # [n] or [N,N]
Xeff = X/self.lengthscales # [n,d] or [N,n,d]
Xs = tf.reduce_sum(tf.square(Xeff), -1) # [n] or [N,n]
if X2 is None:
# batched case : [N,n,d]@[N,d,n]->[N,n,n]
# non-batch case:[n,d]@[d,n]->[n,n]->[n,n]
return -2*tf.batch_matmul(Xt, Xt, adj_x=True) + \
return -2*tf.matmul(Xeff, Xeff, transpose_b=True) + \
tf.expand_dims(Xs, -1) + tf.expand_dims(Xs, -2)
else:
X2t = tf.transpose(X2/l)
X2s = tf.reduce_sum(tf.square(X2t), -2)
X2eff = X2/self.lengthscales
X2s = tf.reduce_sum(tf.square(X2eff), -1)
# batched case : [N,n,d]@[N,d,n2]->[N,n,n2]
# non-batch case:[n,d]@[d,n]->[n,n]->[n,n]
return -2*tf.batch_matmul(Xt, X2t, adj_x=True) + \
return -2*tf.matmul(Xeff, X2eff, transpose_b=True) + \
tf.expand_dims(Xs, -1) + tf.expand_dims(X2s, -2)
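
A quick NumPy check (not part of the diff) of the identity behind square_dist, ||xi - xj||^2 = ||xi||^2 + ||xj||^2 - 2 xi.xj, applied after scaling by the lengthscales; the inputs below are arbitrary placeholders.

import numpy as np

rng = np.random.RandomState(3)
X, X2 = rng.randn(4, 2), rng.randn(6, 2)      # [n, d] and [n2, d]
lengthscales = np.array([0.5, 2.0])

Xeff, X2eff = X / lengthscales, X2 / lengthscales
Xs, X2s = np.sum(Xeff ** 2, -1), np.sum(X2eff ** 2, -1)
sq = -2 * Xeff @ X2eff.T + Xs[:, None] + X2s[None, :]    # as in the code above

brute = ((Xeff[:, None, :] - X2eff[None, :, :]) ** 2).sum(-1)
assert np.allclose(sq, brute)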

def euclid_dist(self, X, X2):
r2 = self.square_dist(X, X2)
return tf.sqrt(r2 + 1e-12)

def Kdiag(self, X):
# callable to absorb the X-rank difference
# for non-batch X
def fn1(): return tf.ones([tf.shape(X)[0]], dtype=float_type)
# for batch X
def fn2(): return tf.ones([tf.shape(X)[-1], tf.shape(X)[0]], dtype=float_type)
return tf.cond(tf.equal(tf.rank(X), 2),fn1,fn2)
return tf.ones(tf.shape(X)[:-1], dtype=float_type)

def Cholesky(self, X):
"""
Cholesky decomposition of K(X).
If X is sized [n,n], this returns [n,n].
If X is sized [n,n,N], this returns [n,n,N] where each [n,n] matrices
are lower triangular.
If X is sized [n,d], this returns [n,n].
If X is sized [N,n,d], this returns [N,n,n] where each [n,n] matrix
is lower triangular.
"""
jitter = settings.numerics.jitter_level
# callable to absorb the X-rank difference
# for non-batch X
def jitter1(): return eye(tf.shape(X)[0])*jitter
# for batch X.
def jitter2(): return tf.expand_dims(eye(tf.shape(X)[0]),0)* jitter
jitter = tf.cond(tf.equal(tf.rank(X), 2), jitter1, jitter2)
jitter = eye(tf.shape(X)[-2])*settings.numerics.jitter_level
return tf.cholesky(self.K(X)+jitter)
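
A short NumPy illustration (not part of the diff) of why the jitter term above is added before the Cholesky factorization: with duplicated inputs the kernel matrix is only positive semi-definite, and a small multiple of the identity restores strict positive definiteness.

import numpy as np

x = np.array([[0.0], [0.0], [1.0]])            # two identical inputs
K = np.exp(-0.5 * (x - x.T) ** 2)              # rank-deficient [3, 3] kernel matrix
try:
    np.linalg.cholesky(K)                      # fails: K is singular
except np.linalg.LinAlgError:
    pass
L = np.linalg.cholesky(K + 1e-6 * np.eye(3))   # succeeds once jitter is added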

class UnitRBF(UnitStationary):
@@ -142,10 +126,6 @@ def K(self, X, X2=None):
+ tf.exp(-self.square_dist(X, -X2)/2)

def Kdiag(self, X):
# match lengthscales in batched case
def fn1(): return self.lengthscales
def fn2(): return tf.expand_dims(self.lengthscales, -1)
l = tf.cond(tf.equal(tf.rank(X),2), fn1, fn2)
Xt = tf.transpose(X/l) # [d,n] or [N,d,n]
Xs = tf.reduce_sum(tf.square(Xt), -2) # [n] or [N,n]
Xeff = X/self.lengthscales # [n,d] or [N,n,d]
Xs = tf.reduce_sum(tf.square(Xeff), -1) # [n] or [N,n]
return tf.ones_like(Xs, dtype=float_type) + tf.exp(-2*Xs)