## Introduction

When algorithmically modeling ground (or thermal) states in computational / statistical physics, an important issue that plagues some algorithms is the equilibration time required to generate uncorrelated samples from the model (e.g. Markov chain Monte Carlo). Typically, these algorithms employ some form of Markov chain in order to avoid calculating the exponentially-scaling partition function. Sampling via a Markov chain is usually not desirable, although there are plenty of algorithms in existence wherein the equilibration time to produce uncorrelated samples is relatively small. And sometimes, Markov chain methods are all we have. But, generally speaking, a model from which samples can only be drawn in this manner is undesirable if alternatives are available.

In machine learning, the Restricted Boltzmann machine (RBM) is a generative model that is burdened by a Markov-chain-like procedure for producing samples called Gibbs sampling. However, even though the RBM has this undesirable property, it has many properties that physicists and the machine learning community find very appealing. And, in some cases, the equilibration time of the Gibbs sampling procedure is quite small.
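
To make this burden concrete, here is a minimal sketch of block Gibbs sampling for a binary RBM. The parameter names ($\mathbf{W}$ for the weights, $\mathbf{b}$ and $\mathbf{c}$ for the visible and hidden biases) and shape conventions are my own labels, not the post's:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gibbs_sample(W, b, c, v0, k, rng=None):
    """Run k steps of block Gibbs sampling starting from visible state v0.

    Assumed shapes: W (n_visible, n_hidden), b (n_visible,), c (n_hidden,).
    """
    rng = rng if rng is not None else np.random.default_rng()
    v = v0.copy()
    for _ in range(k):
        # Sample every hidden unit in parallel given the visible layer.
        h = (rng.random(c.shape) < sigmoid(c + W.T @ v)).astype(float)
        # Sample every visible unit in parallel given the hidden layer.
        v = (rng.random(b.shape) < sigmoid(b + W @ h)).astype(float)
    return v
```

The number of steps $k$ needed before $\mathbf{v}$ is an approximately uncorrelated sample is exactly the equilibration time discussed above.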

Algorithms wherein the partition function need not be calculated, yet the probability distribution defined by the model can be directly and exactly sampled, and therefore a Markov chain is not required, are called autoregressive. There exist generative models that have this desirable property (e.g. recurrent neural networks). In this blog post, we will go through one autoregressive generative model called the neural autoregressive distribution estimator (NADE). Oddly enough, its network architecture stems from an RBM. This blog post is based upon Refs. [1-3].

## An RBM as a Bayesian Network

Recall that an RBM with energy function $E(\mathbf{v}, \mathbf{h})$ defines a probability distribution over its visible units by marginalizing out its hidden units:

$$
p(\mathbf{v}) = \frac{\sum_{\mathbf{h} \in \mathcal{H}_{\mathbf{h}}} e^{-E(\mathbf{v},\mathbf{h})}}{Z},
$$

where $\mathbf{v}$ and $\mathbf{h}$ denote the visible and hidden layers of the RBM, respectively, $\mathcal{H}_{\mathbf{h}}$ is the set of all hidden-layer configurations, and $Z$ is the partition function. Models that are autoregressive define a probability distribution that is the product of conditional distributions of the $i^{\text{th}}$ visible unit ($v_i$) given all preceding visible units ($\mathbf{v}_{<i}$):

$$
p_{\text{autoreg.}}(\mathbf{v}) = \prod_{i} p(v_i \vert \mathbf{v}_{<i}).
$$
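
Because each conditional is normalized on its own, one exact sample of $\mathbf{v}$ can be drawn with a single sweep over the units and no equilibration whatsoever. Here is a sketch of that procedure for binary units, where `conditional` is a hypothetical callable returning $p(v_i = 1 \vert \mathbf{v}_{<i})$ for whatever model is in use:

```python
import numpy as np

def autoregressive_sample(conditional, n_units, rng=None):
    """Draw one exact sample from an autoregressive model over binary units."""
    rng = rng if rng is not None else np.random.default_rng()
    v = np.zeros(n_units)
    for i in range(n_units):
        p_i = conditional(i, v[:i])       # p(v_i = 1 | v_{<i})
        v[i] = float(rng.random() < p_i)  # one direct Bernoulli draw
    return v
```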

In principle, the product rule of probability lets us write the RBM's distribution over the visible units in exactly this form:

$$
p(\mathbf{v}) = \prod_{i} p(v_i \vert \mathbf{v}_{<i}) = \prod_{i} \frac{p(v_i, \mathbf{v}_{<i})}{p(\mathbf{v}_{<i})}.
$$

However, neither $p(v_i, \mathbf{v}_{<i})$ nor $p(\mathbf{v}_{<i})$ is tractable. If we can approximate both quantities, then there may be instances where the above expression is tractable and, therefore, we've made the RBM autoregressive.
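
Spelled out, the denominator alone requires marginalizing over every unit not in $\mathbf{v}_{<i}$,

$$
p(\mathbf{v}_{<i}) = \sum_{v_i} \sum_{\mathbf{v}_{>i}} \sum_{\mathbf{h} \in \mathcal{H}_{\mathbf{h}}} \frac{e^{-E(\mathbf{v},\mathbf{h})}}{Z},
$$

a sum over exponentially many configurations on top of the exponentially-scaling $Z$ itself.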

Consider a mean-field approach for the approximation (recall that a mean-field approximation just relates to the idea that our variables are independent, e.g. $p(a,b) = p(a)p(b)$): approximate $p(v_i \vert \mathbf{v}_{<i})$ by finding a tractable approximation $q(v_i, \mathbf{v}_{>i}, \mathbf{h} \vert \mathbf{v}_{<i}) \approx p(v_i, \mathbf{v}_{>i}, \mathbf{h} \vert \mathbf{v}_{<i})$ such that $q(v_i \vert \mathbf{v}_{<i})$ is easily obtainable. In our mean-field approximation for $p(v_i, \mathbf{v}_{>i}, \mathbf{h} \vert \mathbf{v}_{<i})$,
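we take every unit outside of $\mathbf{v}_{<i}$ to be an independent Bernoulli variable, following Ref. [2] (the Bernoulli parameters $\mu_j(i)$ for visible unit $v_j$ and $\tau_k(i)$ for hidden unit $h_k$ are my notation):

$$
q(v_i, \mathbf{v}_{>i}, \mathbf{h} \vert \mathbf{v}_{<i}) = \prod_{j \geq i} \mu_j(i)^{v_j} \left(1 - \mu_j(i)\right)^{1-v_j} \prod_{k} \tau_k(i)^{h_k} \left(1 - \tau_k(i)\right)^{1-h_k},
$$

so that the desired conditional can be read off directly as $q(v_i = 1 \vert \mathbf{v}_{<i}) = \mu_i(i)$.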

The final steps of the gradient computation for the NADE parameters (where $\delta$ denotes the gradient of the negative log-likelihood with respect to the indicated quantity) read:

$$
\begin{aligned}
&\qquad \delta \mathbf{a} \leftarrow \delta \mathbf{a} + \delta \mathbf{h}_i \odot \mathbf{h}_i \odot (1 - \mathbf{h}_i) \\
&\text{return} \qquad \delta \mathbf{b}, \delta \mathbf{c}, \delta \mathbf{W}, \delta \mathbf{U}
\end{aligned}
$$
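
To connect the pseudocode to the autoregressive picture, here is a compact sketch of the NADE forward pass being differentiated above. The parameter names follow the pseudocode's $\mathbf{b}, \mathbf{c}, \mathbf{W}, \mathbf{U}$, but the shape conventions ($\mathbf{W} \in \mathbb{R}^{H \times D}$, $\mathbf{U} \in \mathbb{R}^{D \times H}$, $\mathbf{b} \in \mathbb{R}^{D}$, $\mathbf{c} \in \mathbb{R}^{H}$ for $D$ visible and $H$ hidden units) are my own assumptions:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def nade_log_prob(v, b, c, W, U):
    """Return log p(v) = sum_i log p(v_i | v_{<i}) for a binary NADE."""
    a = c.copy()  # running pre-activation: c + sum_{j<i} W[:, j] * v[j]
    log_p = 0.0
    for i in range(len(v)):
        h_i = sigmoid(a)                  # hidden activations for step i
        p_i = sigmoid(b[i] + U[i] @ h_i)  # p(v_i = 1 | v_{<i})
        log_p += v[i] * np.log(p_i) + (1 - v[i]) * np.log(1 - p_i)
        a += W[:, i] * v[i]               # O(H) update per step
    return log_p
```

Sampling reuses the same loop: instead of accumulating the log-likelihood, draw $v_i \sim \text{Bernoulli}(p_i)$ at each step, which is exactly the direct sampling procedure described in the introduction.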

## Try it for yourself!

I have open-source code for using NADEs to do quantum state reconstruction. It is relatively new and continues to be updated with more functionality regularly. Go check it out [here](https://github.com/isaacdevlugt/GreNADE.git).

## References

[1] B. McNaughton, M. V. Milošević, A. Perali, and S. Pilati, arXiv:2002.04292 (2020).

[2] H. Larochelle and I. Murray, AISTATS 15, 9 (2011).

[3] B. Uria, M.-A. Côté, K. Gregor, I. Murray, and H. Larochelle, arXiv:1605.02226 (2016).