Merge pull request #21 from fujii-team/tf12
Tf12
fujiisoup authored Jan 10, 2017
2 parents dbed0ee + ed16ff2 commit ae888b1
Showing 24 changed files with 700 additions and 957 deletions.
23 changes: 23 additions & 0 deletions .travis.yml
@@ -0,0 +1,23 @@
sudo: required
dist: trusty
language: python
python:
- 3.5
install:
- sudo apt-get update
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
- bash miniconda.sh -b -p $HOME/miniconda
- export PATH="$HOME/miniconda/bin:$PATH"
- hash -r
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
# Useful for debugging any issues with conda
- conda info -a

# Replace dep1 dep2 ... with your dependencies
- conda create -q -n test-environment python=3.5 numpy scipy pandas nose
- source activate test-environment
- pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.12.1-cp35-cp35m-linux_x86_64.whl
- python setup.py install
script:
- nosetests --nologcapture testing
2 changes: 1 addition & 1 deletion Henbun/_version.py
@@ -13,4 +13,4 @@
# limitations under the License.


__version__ = "0.0.1"
__version__ = "0.1"
158 changes: 90 additions & 68 deletions Henbun/gp/gp.py
@@ -8,16 +8,15 @@

class GP(Parameterized):
"""
A vanila implementation, in order to sample from the Gaussian Process
posterior.
An implementation to sample from the Gaussian Process posterior.
The posterior mean and covariance are represented by
+ mean: L*u.q_mu
+ covariance: (L*u.q_sqrt)*(L*u.q_sqrt)^T
where
L is cholesky factor of the GP kernel, K(x,x) = L L^T
u.q_mu is the mean of the whitened variational posterior, u
u.q_sqrt is the cholesky factor the whitened variational posterior, u,
u.q_mu is the mean of the variational posterior, u
u.q_sqrt is the cholesky factor of the variational posterior, u,
u ~ N(q_mu, q_sqrt*q_sqrt^T)
This class does not consider mean functions.
@@ -40,15 +39,15 @@ def samples(self, x, u):
Draw samples from the posterior, with given coordinate variables x.
args:
+ x: Coordinate variables sized [n, d].
+ u: Variational parameters, such as Normal or Gaussian, sized [n, N].
+ u: Variational parameters, such as Normal or Gaussian, sized [N,n].
returns:
+ samples: Samples from the posterior sized [N,n]
Note:
The first axis of x and the last axis of u should both be n.
"""
L = self.kern.Cholesky(x) # sized [n,n]
return tf.matmul(L, u) # sized [n,N]
return tf.matmul(u, L, transpose_b=True) # sized [N,n]
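
As an aside (not part of the diff), a minimal NumPy sketch of the whitened sampling used in GP.samples above, assuming a toy 1-D RBF kernel and hypothetical q_mu/q_sqrt values: a posterior sample is u @ L^T, with L the Cholesky factor of K(x,x) and u ~ N(q_mu, q_sqrt q_sqrt^T).

import numpy as np

rng = np.random.RandomState(0)
n, N = 5, 3                                    # n grid points, N independent samples
x = np.linspace(0.0, 1.0, n)[:, None]          # coordinates, sized [n, 1]
K = np.exp(-0.5 * (x - x.T) ** 2) + 1e-6 * np.eye(n)   # RBF kernel plus jitter
L = np.linalg.cholesky(K)                      # [n, n], lower triangular

q_mu = np.zeros((N, n))                        # whitened variational mean, [N, n]
q_sqrt = 0.1 * np.tile(np.eye(n), (N, 1, 1))   # whitened Cholesky factors, [N, n, n]
u = q_mu + np.einsum('kij,kj->ki', q_sqrt, rng.randn(N, n))  # u ~ N(q_mu, q_sqrt q_sqrt^T)

samples = u @ L.T     # same as tf.matmul(u, L, transpose_b=True), sized [N, n]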


class SparseGP(GP):
@@ -60,23 +59,21 @@ class SparseGP(GP):
the variational posterior is represented by N(m,S)
with
m = Knm*Lm^-T*u.q_mu
S = Knm*Kmm^-1*Kmn + (Knm*Lm^-T*u.q_sqrt)^2
S = (Knn-Knm*Kmm^-1*Kmn) + (Knm*Lm^-T*u.q_sqrt)^2
where
Knm = K(x,z) sized [N,n,m]
Kmm = K(z,z) sized [m,m]
Lm = cholesky(K(z,z)) sized [m,m]
Due to the sparse approximation, an additional variance should be considered,
Knm*Kmm^-1*Kmn
Knn-Knm*Kmm^-1*Kmn
We support the following three approximations,
+ diagonal: The first term of S is approximated by diagonalization.
This option is default.
+ neglected: The first term of S is neglected.
Do not use this option in the learning phase, because it removes
the z dependence of ELBO.
+ fullrank: The first term of S is fully factorized.
This option might be very slow and GP class may be faster.
This option might be very slow and even GP class may be faster.
This class does not consider mean functions.
mean_functions should be added manually.
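
For reference (not part of the diff), a small NumPy sketch of the posterior mean m and covariance S written in the docstring above, assuming a toy 1-D RBF kernel and hypothetical q_mu/q_sqrt values:

import numpy as np

def rbf(a, b):                                   # toy 1-D RBF kernel
    return np.exp(-0.5 * (a[:, None, 0] - b[None, :, 0]) ** 2)

n, m = 6, 3
x = np.linspace(0.0, 1.0, n)[:, None]            # prediction points, [n, 1]
z = np.linspace(0.0, 1.0, m)[:, None]            # inducing points,   [m, 1]

Kmm = rbf(z, z) + 1e-6 * np.eye(m)
Knm, Knn = rbf(x, z), rbf(x, x)
Lm = np.linalg.cholesky(Kmm)

q_mu = np.zeros(m)                               # whitened variational mean
q_sqrt = 0.1 * np.eye(m)                         # whitened variational Cholesky factor

A = Knm @ np.linalg.inv(Lm).T                    # Knm Lm^-T, sized [n, m]
mean = A @ q_mu                                  # m = Knm Lm^-T q_mu
cov = (Knn - A @ A.T) + (A @ q_sqrt) @ (A @ q_sqrt).T   # S; note Knm Kmm^-1 Kmn = A A^T
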
@@ -97,74 +94,99 @@ def __init__(self, kern, z, collections=[graph_key.VARIABLES]):
# inducing points
self.z = Variable(shape=z.shape, collections=collections)
self.z = z # set the initial value
self.m = len(z)

def samples(self, x, u, q_shape='diagonal'):
"""
Returns samples from GP.
args:
+ x: coordinate variables, sized [n,d] or [n,d,N].
+ u: inducing point values, sized [m,N]
+ x: coordinate variables, sized [n,d] or [N,n,d].
+ u: inducing point values, sized [N,m]
+ q_shape: How to approximate the covariance, Knn-Knm Kmm^-1 Kmn term.
Should be one of ['diagonal', 'neglected', 'fullrank'].
'diagonal': Diagonalize this term (default).
'neglected' : Neglect this term. Do not use in the learning phase.
(No dependence on z).
'fullrank': Fully diagonalize this term.
'neglected' : Neglect this term.
'fullrank': Fully factorize this term.
Very slow.
"""
assert(q_shape in ['diagonal','neglected','fullrank'])
jitter = settings.numerics.jitter_level
N = tf.shape(u)[-1]
N = tf.shape(u)[0]
# Cholesky factor of K(z,z)
Lm = self.kern.Cholesky(self.z) # sized [m,m]
I = eye(tf.shape(self.z)[0])
Lminv = tf.transpose(tf.matrix_triangular_solve(Lm, I)) # [m,m]

# Non-batch case x:[n,d]. Return shape [n,N]
def sample():
Ln = tf.matmul(self.kern.K(x, self.z), Lminv) # sized [n,m]
samples = tf.matmul(Ln, u) # sized [n,N]
# additional variance due to the sparse approximation.
if q_shape is 'diagonal':
diag_var = jitter + \
tf.expand_dims(self.kern.Kdiag(x) - tf.reduce_sum(tf.square(Ln), -1), -1) # [n,1]
return samples + \
tf.sqrt(tf.abs(diag_var)) * tf.random_normal([tf.shape(x)[0], N],
dtype=float_type)
elif q_shape is 'neglected':
return samples
else: # fullrank
Knn = tf.tile(tf.expand_dims(self.kern.K(x),0), [N,1,1])
jitterI = tf.tile(tf.expand_dims(eye(tf.shape(x)[0]),0), [N,1,1])
chol = tf.cholesky(Knn - tf.batch_matmul(Ln, Ln, adj_y=True) + jitterI) # [N,n,n]
return samples + \
tf.transpose(
tf.squeeze(tf.batch_matmul(
chol, tf.random_normal([N, tf.shape(x)[0], 1], dtype=float_type)),
[-1])) # [N,n,1] -> [N,n] -> [n,N]

# Batch case. x:[n,d,N]. Return shape [n,N]
def sample_batch():
z = tf.tile(tf.expand_dims(self.z, -1), [1,1,N])
Ln = tf.batch_matmul(self.kern.K(x, z), tf.tile(tf.expand_dims(Lminv, 0), [N,1,1]))
samples = tf.transpose(tf.squeeze( # [N,n,1] -> [n,N]
tf.batch_matmul(Ln, tf.expand_dims(tf.transpose(u), -1)), [-1]))
if q_shape is 'diagonal':
# additional variance due to the sparse approximation.
diag_var = jitter + \
tf.transpose(self.kern.Kdiag(x) - tf.reduce_sum(tf.square(Ln), -1))
return samples + \
tf.sqrt(tf.abs(diag_var)) * tf.random_normal([tf.shape(x)[0], N], dtype=float_type)
elif q_shape is 'neglected':
return samples
else: # fullrank case
Knn = self.kern.K(x)
jitterI = tf.tile(tf.expand_dims(eye(tf.shape(x)[0]),0), [N,1,1])
chol = tf.cholesky(Knn - tf.batch_matmul(Ln, Ln, adj_y=True) + jitterI) # [N,n,n]
return samples + \
tf.transpose(
tf.squeeze(tf.batch_matmul(
chol, tf.random_normal([N, tf.shape(x)[0], 1], dtype=float_type)),
[-1])) # [N,n,1] -> [N,n] -> [n,N]

return tf.cond(tf.equal(tf.rank(x),2), sample, sample_batch)

# TODO insert assertion for shape difference

LnT = self._effective_LT(x)
if x.get_shape().ndims==2:
samples = tf.matmul(u, LnT) # sized [N,n]
elif x.get_shape().ndims==3:
samples = tf.squeeze(
tf.matmul(tf.expand_dims(u,1), LnT), [1]) # [N,1,m]*[N,m,n]->[N,n]

if q_shape is 'neglected':
return samples
elif q_shape is 'diagonal':
diag_cov = self._additional_cov(x, LnT, 'diagonal')
return samples + tf.sqrt(tf.abs(diag_cov)) \
* tf.random_normal(tf.shape(x)[:-1], dtype=float_type)
else: # 'fullrank'
jitterI = eye(tf.shape(x)[-2]) * jitter
chol = tf.cholesky(self._additional_cov(x, LnT, 'fullrank') + jitterI) # [n,n]
if x.get_shape().ndims==2:
return samples + tf.matmul(
tf.random_normal([N,tf.shape(x)[0]], dtype=float_type), # [N,n]
chol, transpose_b=True) # [N,n]@[n,n] -> [N,n]
elif x.get_shape().ndims==3:
return samples + tf.squeeze(tf.matmul(
tf.random_normal([N, 1,tf.shape(x)[1]], dtype=float_type),
chol, transpose_b=True), [1])


def _effective_LT(self, x):
"""
Returns the effective cholesky factor,
LnT = L^-1 K_mn (the transpose of K_nm L^-T),
with L L^T = K_mm.
args:
+ x : coordinate for the prediction, sized [N,n,d] or [n,d]
"""
# Cholesky factor of K(z,z)
Lm = self.kern.Cholesky(self.z) # sized [m,m]
if x.get_shape().ndims==2:
# [m,n] -> [n,m]
return tf.matrix_triangular_solve(Lm, self.kern.K(self.z, x))
#Lminv = tf.matrix_triangular_solve(Lm, eye(self.m)) # [m,m]
#return tf.matmul(Lminv, self.kern.K(self.z, x)) # [m,m]@[m,n]->[m,n]

# batch case
elif x.get_shape().ndims==3:
N = tf.shape(x)[0]
Lminv = tf.matrix_triangular_solve(Lm, eye(self.m)) # [m,m]
z = tf.tile(tf.expand_dims(self.z, 0), [N,1,1])
return tf.matmul(tf.tile(tf.expand_dims(Lminv, 0), [N,1,1]), #[N,m,m]
self.kern.K(z, x)) # [N,m,m]@[N,m,n] -> [N,m,n]

raise ValueError('shape is not specified for tensor x')
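
A brief NumPy sketch (not part of the diff) of the non-batch branch above: the triangular solve yields L^-1 K_mn directly, without forming Lm^-1. The kernel and data here are illustrative placeholders.

import numpy as np
from scipy.linalg import solve_triangular

def rbf(a, b):                                       # toy 1-D RBF kernel
    return np.exp(-0.5 * (a[:, None, 0] - b[None, :, 0]) ** 2)

rng = np.random.RandomState(1)
m, n = 3, 5
z, x = rng.randn(m, 1), rng.randn(n, 1)

Lm = np.linalg.cholesky(rbf(z, z) + 1e-6 * np.eye(m))
LnT = solve_triangular(Lm, rbf(z, x), lower=True)    # L^-1 K_mn, sized [m, n]
assert np.allclose(Lm @ LnT, rbf(z, x))              # Lm (L^-1 K_mn) recovers K_mn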


def _additional_cov(self, x, LnT, q_shape):
"""
Returns the additional GP covariance due to the sparse approximation.
Knn-Knm*Kmm^-1*Kmn
args:
+ x : coordinate for the prediction, sized [N,n,d] or [n,d]
+ LnT: Effective Cholesky factor, L^-1 Kmn
+ q_shape: 'diagonal' or 'fullrank'. If it is 'diagonal', it returns
only the diagonal part of the covariance.
"""
if q_shape is 'diagonal':
return self.kern.Kdiag(x) - tf.reduce_sum(tf.square(LnT), -2) # [N,n]
else:
Knn = self.kern.K(x) # [N,n,n] or [n,n]
return Knn - tf.matmul(LnT, LnT, transpose_a=True)
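
To tie the two helpers together (not part of the diff), a minimal NumPy sketch of the 'diagonal' path of samples: the residual covariance Knn - Knm Kmm^-1 Kmn from _additional_cov is kept only on its diagonal and injected as independent Gaussian noise. Here LnT is a random placeholder standing in for _effective_LT's output.

import numpy as np

rng = np.random.RandomState(2)
m, n, N = 3, 5, 4
LnT = 0.3 * rng.randn(m, n)                  # placeholder for Lm^-1 K(z, x)
Kdiag = np.ones(n)                           # unit stationary kernel: Kdiag is all ones
u = rng.randn(N, m)                          # whitened inducing values, [N, m]

samples = u @ LnT                            # [N, m] @ [m, n] -> [N, n]
diag_cov = Kdiag - np.sum(LnT ** 2, axis=0)  # diag of Knn - Knm Kmm^-1 Kmn
samples = samples + np.sqrt(np.abs(diag_cov)) * rng.randn(N, n)
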
54 changes: 17 additions & 37 deletions Henbun/gp/kernels.py
@@ -18,10 +18,10 @@
import tensorflow as tf
import numpy as np
from .. import transforms
from ..tf_wraps import eye
from .._settings import settings
from ..param import Variable, Parameterized, graph_key
from ..variationals import Variational
from ..tf_wraps import eye
from .._settings import settings
float_type = settings.dtypes.float_type
np_float_type = np.float32 if float_type is tf.float32 else np.float64

@@ -56,64 +56,48 @@ def square_dist(self, X, X2=None):
Returns the square distance between X and X2.
X, X2 is 2d- or 3d-tensor.
If X is (and X2 should be the same dim.) 2dimensional,
If X is (and X2 should have the same dimension) 2-dimensional,
each dimension is considered as [n,d] and [n2,d]
- n: number of data points
- d: dimension of each data
This method returns [n,n2] sized kernel value.
If X2 is None, then X2 = X is assumed.
If X is 3-dimensional,
each dimension is considered as [n,d,N] (and [n2,d,N] for X2)
each dimension is considered as [N,n,d] (and [N,n2,d] for X2)
- N: batch number.
Returns: [N,n,n2] sized kernel value.
"""
# match lengthscales in batched case
def fn1(): return self.lengthscales
def fn2(): return tf.expand_dims(self.lengthscales, -1)
l = tf.cond(tf.equal(tf.rank(X),2), fn1, fn2)

Xt = tf.transpose(X/l) # [d,n] or [N,d,n]
Xs = tf.reduce_sum(tf.square(Xt), -2) # [n] or [N,N]
Xeff = X/self.lengthscales # [n,d] or [N,n,d]
Xs = tf.reduce_sum(tf.square(Xeff), -1) # [n] or [N,n]
if X2 is None:
# batched case : [N,n,d]@[N,d,n]->[N,n,n]
# non-batch case:[n,d]@[d,n]->[n,n]->[n,n]
return -2*tf.batch_matmul(Xt, Xt, adj_x=True) + \
return -2*tf.matmul(Xeff, Xeff, transpose_b=True) + \
tf.expand_dims(Xs, -1) + tf.expand_dims(Xs, -2)
else:
X2t = tf.transpose(X2/l)
X2s = tf.reduce_sum(tf.square(X2t), -2)
X2eff = X2/self.lengthscales
X2s = tf.reduce_sum(tf.square(X2eff), -1)
# batched case : [N,n,d]@[N,d,n2]->[N,n,n2]
# non-batch case:[n,d]@[d,n]->[n,n]->[n,n]
return -2*tf.batch_matmul(Xt, X2t, adj_x=True) + \
return -2*tf.matmul(Xeff, X2eff, transpose_b=True) + \
tf.expand_dims(Xs, -1) + tf.expand_dims(X2s, -2)
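
A quick NumPy check (not part of the diff) of the identity behind square_dist, ||xi - xj||^2 = ||xi||^2 + ||xj||^2 - 2 xi.xj, applied after scaling by the lengthscales; the inputs below are arbitrary placeholders.

import numpy as np

rng = np.random.RandomState(3)
X, X2 = rng.randn(4, 2), rng.randn(6, 2)      # [n, d] and [n2, d]
lengthscales = np.array([0.5, 2.0])

Xeff, X2eff = X / lengthscales, X2 / lengthscales
Xs, X2s = np.sum(Xeff ** 2, -1), np.sum(X2eff ** 2, -1)
sq = -2 * Xeff @ X2eff.T + Xs[:, None] + X2s[None, :]    # as in the code above

brute = ((Xeff[:, None, :] - X2eff[None, :, :]) ** 2).sum(-1)
assert np.allclose(sq, brute)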

def euclid_dist(self, X, X2):
r2 = self.square_dist(X, X2)
return tf.sqrt(r2 + 1e-12)

def Kdiag(self, X):
# callable to absorb the X-rank difference
# for non-batch X
def fn1(): return tf.ones([tf.shape(X)[0]], dtype=float_type)
# for batch X
def fn2(): return tf.ones([tf.shape(X)[-1], tf.shape(X)[0]], dtype=float_type)
return tf.cond(tf.equal(tf.rank(X), 2),fn1,fn2)
return tf.ones(tf.shape(X)[:-1], dtype=float_type)

def Cholesky(self, X):
"""
Cholesky decomposition of K(X).
If X is sized [n,n], this returns [n,n].
If X is sized [n,n,N], this returns [n,n,N] where each [n,n] matrices
are lower triangular.
If X is sized [n,d], this returns [n,n].
If X is sized [N,n,d], this returns [N,n,n] where each [n,n] matrix
is lower triangular.
"""
jitter = settings.numerics.jitter_level
# callable to absorb the X-rank difference
# for non-batch X
def jitter1(): return eye(tf.shape(X)[0])*jitter
# for batch X.
def jitter2(): return tf.expand_dims(eye(tf.shape(X)[0]),0)* jitter
jitter = tf.cond(tf.equal(tf.rank(X), 2), jitter1, jitter2)
jitter = eye(tf.shape(X)[-2])*settings.numerics.jitter_level
return tf.cholesky(self.K(X)+jitter)
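
A short NumPy illustration (not part of the diff) of why the jitter term above is added before the Cholesky factorization: with duplicated inputs the kernel matrix is only positive semi-definite, and a small multiple of the identity restores strict positive definiteness.

import numpy as np

x = np.array([[0.0], [0.0], [1.0]])            # two identical inputs
K = np.exp(-0.5 * (x - x.T) ** 2)              # rank-deficient [3, 3] kernel matrix
try:
    np.linalg.cholesky(K)                      # fails: K is singular
except np.linalg.LinAlgError:
    pass
L = np.linalg.cholesky(K + 1e-6 * np.eye(3))   # succeeds once jitter is added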

class UnitRBF(UnitStationary):
@@ -142,10 +126,6 @@ def K(self, X, X2=None):
+ tf.exp(-self.square_dist(X, -X2)/2)

def Kdiag(self, X):
# match lengthscales in batched case
def fn1(): return self.lengthscales
def fn2(): return tf.expand_dims(self.lengthscales, -1)
l = tf.cond(tf.equal(tf.rank(X),2), fn1, fn2)
Xt = tf.transpose(X/l) # [d,n] or [N,d,n]
Xs = tf.reduce_sum(tf.square(Xt), -2) # [n] or [N,n]
Xeff = X/self.lengthscales # [n,d] or [N,n,d]
Xs = tf.reduce_sum(tf.square(Xeff), -1) # [n] or [N,n]
return tf.ones_like(Xs, dtype=float_type) + tf.exp(-2*Xs)