
Commit d097293

Updated Python BART interface
1 parent 4cbcc5a commit d097293

8 files changed: +148, -22 lines changed

R/bart.R

Lines changed: 6 additions & 7 deletions
@@ -44,6 +44,7 @@
 #' - `keep_every` How many iterations of the burned-in MCMC sampler should be run before forests and parameters are retained. Default `1`. Setting `keep_every <- k` for some `k > 1` will "thin" the MCMC samples by retaining every `k`-th sample, rather than simply every sample. This can reduce the autocorrelation of the MCMC samples.
 #' - `num_chains` How many independent MCMC chains should be sampled. If `num_mcmc = 0`, this is ignored. If `num_gfr = 0`, then each chain is run from root for `num_mcmc * keep_every + num_burnin` iterations, with `num_mcmc` samples retained. If `num_gfr > 0`, each MCMC chain will be initialized from a separate GFR ensemble, with the requirement that `num_gfr >= num_chains`. Default: `1`.
 #' - `verbose` Whether or not to print progress during the sampling loops. Default: `FALSE`.
+#' - `probit_outcome_model` Whether or not the outcome should be modeled as explicitly binary via a probit link. If `TRUE`, `y` must only contain the values `0` and `1`. Default: `FALSE`.
 #'
 #' @param mean_forest_params (Optional) A list of mean forest model parameters, each of which has a default value processed internally, so this argument list is optional.
 #'
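The `num_chains` and `keep_every` options documented above mirror the Python interface updated in this commit. A minimal hedged sketch of multi-chain, thinned sampling through that interface (it assumes `X_train` and `y_train` are defined as in the demo script added later in this commit, and that `general_params` accepts these keys as documented in stochtree/bart.py below):

from stochtree import BARTModel

# Sketch: four MCMC chains, each warm-started from its own GFR ensemble
# (the docs above require num_gfr >= num_chains), thinned to every 5th draw.
bart_model = BARTModel()
bart_model.sample(
    X_train=X_train,
    y_train=y_train,
    num_gfr=10,
    num_burnin=0,
    num_mcmc=100,
    general_params={"num_chains": 4, "keep_every": 5, "verbose": True},
)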
@@ -58,8 +59,7 @@
 #' - `sigma2_leaf_scale` Scale parameter in the `IG(sigma2_leaf_shape, sigma2_leaf_scale)` leaf node parameter variance model. Calibrated internally as `0.5/num_trees` if not set here.
 #' - `keep_vars` Vector of variable names or column indices denoting variables that should be included in the forest. Default: `NULL`.
 #' - `drop_vars` Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: `NULL`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored.
-#' - `probit_outcome_model` Whether or not the outcome should be modeled as explicitly binary via a probit link. If `TRUE`, `y` must only contain the values `0` and `1`. Default: `FALSE`.
-#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset.
+#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset.
 #'
 #' @param variance_forest_params (Optional) A list of variance forest model parameters, each of which has a default value processed internally, so this argument list is optional.
 #'
@@ -74,7 +74,7 @@
 #' - `var_forest_prior_scale` Scale parameter in the `IG(var_forest_prior_shape, var_forest_prior_scale)` conditional error variance model (which is only sampled if `num_trees > 0`). Calibrated internally as `num_trees / leaf_prior_calibration_param^2` if not set.
 #' - `keep_vars` Vector of variable names or column indices denoting variables that should be included in the forest. Default: `NULL`.
 #' - `drop_vars` Vector of variable names or column indices denoting variables that should be excluded from the forest. Default: `NULL`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored.
-#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features passed in the training dataset.
+#' - `num_features_subsample` How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset.
 #'
 #' @return List of sampling outputs and a wrapper around the sampled forests (which can be used for in-memory prediction on new data, or serialized to JSON on disk).
 #' @export
@@ -117,7 +117,8 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train
     sigma2_global_shape = 0, sigma2_global_scale = 0,
     variable_weights = NULL, random_seed = -1,
     keep_burnin = FALSE, keep_gfr = FALSE, keep_every = 1,
-    num_chains = 1, verbose = FALSE
+    num_chains = 1, verbose = FALSE,
+    probit_outcome_model = FALSE
   )
   general_params_updated <- preprocessParams(
     general_params_default, general_params
@@ -130,7 +131,6 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train
     sample_sigma2_leaf = TRUE, sigma2_leaf_init = NULL,
     sigma2_leaf_shape = 3, sigma2_leaf_scale = NULL,
     keep_vars = NULL, drop_vars = NULL,
-    probit_outcome_model = FALSE,
     num_features_subsample = NULL
   )
   mean_forest_params_updated <- preprocessParams(
@@ -167,6 +167,7 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train
   keep_every <- general_params_updated$keep_every
   num_chains <- general_params_updated$num_chains
   verbose <- general_params_updated$verbose
+  probit_outcome_model <- general_params_updated$probit_outcome_model

   # 2. Mean forest parameters
   num_trees_mean <- mean_forest_params_updated$num_trees
@@ -180,7 +181,6 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train
   b_leaf <- mean_forest_params_updated$sigma2_leaf_scale
   keep_vars_mean <- mean_forest_params_updated$keep_vars
   drop_vars_mean <- mean_forest_params_updated$drop_vars
-  probit_outcome_model <- mean_forest_params_updated$probit_outcome_model
   num_features_subsample_mean <- mean_forest_params_updated$num_features_subsample

   # 3. Variance forest parameters
@@ -388,7 +388,6 @@ bart <- function(X_train, y_train, leaf_basis_train = NULL, rfx_group_ids_train
   if (is.null(num_features_subsample_variance)) {
     num_features_subsample_variance <- ncol(X_train)
   }
-

   # Convert all input data to matrices if not already converted
   if ((is.null(dim(leaf_basis_train))) && (!is.null(leaf_basis_train))) {

R/config.R

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ ForestModelConfig <- R6::R6Class(
         stop("`num_features_subsample` cannot be larger than `num_features`")
       }
       if (num_features_subsample <= 0) {
-        stop("`num_features_subsample` must at least 1")
+        stop("`num_features_subsample` must be at least 1")
       }
       self$num_features_subsample <- num_features_subsample
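For reference, a conceptual Python mirror of the R6 validation above (illustration only; `validate_num_features_subsample` is a hypothetical helper, not part of this commit):

def validate_num_features_subsample(num_features_subsample: int, num_features: int) -> None:
    # Mirrors the checks in ForestModelConfig above.
    if num_features_subsample > num_features:
        raise ValueError("`num_features_subsample` cannot be larger than `num_features`")
    if num_features_subsample <= 0:
        raise ValueError("`num_features_subsample` must be at least 1")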

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# Supervised Learning Demo Script
+
+# Load necessary libraries
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from stochtree import BARTModel
+from sklearn.model_selection import train_test_split
+
+# Generate sample data
+# RNG
+random_seed = 1234
+rng = np.random.default_rng(random_seed)
+
+# Generate covariates and basis
+n = 1000
+p_X = 20
+X = rng.uniform(0, 1, (n, p_X))
+
+# Define the outcome mean function
+def outcome_mean(X):
+    return np.where(
+        (X[:,0] >= 0.0) & (X[:,0] < 0.25), -7.5,
+        np.where(
+            (X[:,0] >= 0.25) & (X[:,0] < 0.5), -2.5,
+            np.where(
+                (X[:,0] >= 0.5) & (X[:,0] < 0.75), 2.5,
+                7.5
+            )
+        )
+    )
+
+# Generate outcome
+epsilon = rng.normal(0, 1, n)
+y = outcome_mean(X) + epsilon
+
+# Test-train split
+sample_inds = np.arange(n)
+train_inds, test_inds = train_test_split(sample_inds, test_size=0.2)
+X_train = X[train_inds,:]
+X_test = X[test_inds,:]
+y_train = y[train_inds]
+y_test = y[test_inds]
+
+# Run XBART with the full feature set
+bart_model_a = BARTModel()
+forest_config_a = {"num_trees": 100}
+bart_model_a.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_a)
+
+# Run XBART with each tree considering random subsets of 5 features
+bart_model_b = BARTModel()
+forest_config_b = {"num_trees": 100, "num_features_subsample": 5}
+bart_model_b.sample(X_train=X_train, y_train=y_train, X_test=X_test, num_gfr=100, num_mcmc=0, mean_forest_params=forest_config_b)
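The demo stops after sampling. One possible follow-up, comparing the two fits on the held-out set, is sketched below; it assumes the fitted models expose test-set draws via a `y_hat_test` attribute of shape `(n_test, num_samples)`, as in the package's demo notebooks (treat that attribute as an assumption, not something shown in this commit):

# Posterior-mean predictions for each model on the test set.
y_hat_a = np.squeeze(bart_model_a.y_hat_test).mean(axis=1)
y_hat_b = np.squeeze(bart_model_b.y_hat_test).mean(axis=1)

# Held-out RMSE with and without feature subsampling.
rmse_a = np.sqrt(np.mean((y_test - y_hat_a) ** 2))
rmse_b = np.sqrt(np.mean((y_test - y_hat_b) ** 2))
print(f"RMSE, full feature set: {rmse_a:.3f}")
print(f"RMSE, 5-feature subsample: {rmse_b:.3f}")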

src/py_stochtree.cpp

Lines changed: 5 additions & 5 deletions
@@ -1028,7 +1028,7 @@ class ForestSamplerCpp {
   void SampleOneIteration(ForestContainerCpp& forest_samples, ForestCpp& forest, ForestDatasetCpp& dataset, ResidualCpp& residual, RngCpp& rng,
                           py::array_t<int> feature_types, py::array_t<int> sweep_update_indices, int cutpoint_grid_size, py::array_t<double> leaf_model_scale_input,
                           py::array_t<double> variable_weights, double a_forest, double b_forest, double global_variance,
-                          int leaf_model_int, bool keep_forest = true, bool gfr = true) {
+                          int leaf_model_int, int num_features_subsample, bool keep_forest = true, bool gfr = true) {
     // Refactoring completely out of the Python interface.
     // Intention to refactor out of the C++ interface in the future.
     bool pre_initialized = true;
@@ -1090,13 +1090,13 @@
     std::mt19937* rng_ptr = rng.GetRng();
     if (gfr) {
       if (model_type == StochTree::ModelType::kConstantLeafGaussian) {
-        StochTree::GFRSampleOneIter<StochTree::GaussianConstantLeafModel, StochTree::GaussianConstantSuffStat>(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get<StochTree::GaussianConstantLeafModel>(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true);
+        StochTree::GFRSampleOneIter<StochTree::GaussianConstantLeafModel, StochTree::GaussianConstantSuffStat>(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get<StochTree::GaussianConstantLeafModel>(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_features_subsample);
       } else if (model_type == StochTree::ModelType::kUnivariateRegressionLeafGaussian) {
-        StochTree::GFRSampleOneIter<StochTree::GaussianUnivariateRegressionLeafModel, StochTree::GaussianUnivariateRegressionSuffStat>(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get<StochTree::GaussianUnivariateRegressionLeafModel>(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true);
+        StochTree::GFRSampleOneIter<StochTree::GaussianUnivariateRegressionLeafModel, StochTree::GaussianUnivariateRegressionSuffStat>(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get<StochTree::GaussianUnivariateRegressionLeafModel>(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_features_subsample);
       } else if (model_type == StochTree::ModelType::kMultivariateRegressionLeafGaussian) {
-        StochTree::GFRSampleOneIter<StochTree::GaussianMultivariateRegressionLeafModel, StochTree::GaussianMultivariateRegressionSuffStat, int>(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get<StochTree::GaussianMultivariateRegressionLeafModel>(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_basis);
+        StochTree::GFRSampleOneIter<StochTree::GaussianMultivariateRegressionLeafModel, StochTree::GaussianMultivariateRegressionSuffStat, int>(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get<StochTree::GaussianMultivariateRegressionLeafModel>(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, true, num_features_subsample, num_basis);
       } else if (model_type == StochTree::ModelType::kLogLinearVariance) {
-        StochTree::GFRSampleOneIter<StochTree::LogLinearVarianceLeafModel, StochTree::LogLinearVarianceSuffStat>(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get<StochTree::LogLinearVarianceLeafModel>(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, false);
+        StochTree::GFRSampleOneIter<StochTree::LogLinearVarianceLeafModel, StochTree::LogLinearVarianceSuffStat>(*active_forest_ptr, *(tracker_.get()), *forest_sample_ptr, std::get<StochTree::LogLinearVarianceLeafModel>(leaf_model), *forest_data_ptr, *residual_data_ptr, *(split_prior_.get()), *rng_ptr, var_weights_vector, sweep_update_indices_, global_variance, feature_types_, cutpoint_grid_size, keep_forest, pre_initialized, false, num_features_subsample);
       }
     } else {
       if (model_type == StochTree::ModelType::kConstantLeafGaussian) {
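Conceptually, the `num_features_subsample` argument threaded through these bindings restricts each tree's split search to a random subset of columns during the GFR sweep. A rough Python sketch of that idea (illustration only; the actual logic lives in the C++ `GFRSampleOneIter` templates, and `feature_subset` is a hypothetical helper):

import numpy as np

rng = np.random.default_rng(1234)

def feature_subset(num_features: int, num_features_subsample: int) -> np.ndarray:
    # Draw a random set of feature indices, without replacement,
    # to consider when growing a single tree.
    return rng.choice(num_features, size=num_features_subsample, replace=False)

print(feature_subset(20, 5))  # e.g. [ 3 17  8  0 12]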

stochtree/bart.py

Lines changed: 16 additions & 2 deletions
@@ -131,6 +131,7 @@ def sample(
        * `keep_gfr` (`bool`): Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to `False`. Ignored if `num_mcmc == 0`.
        * `keep_every` (`int`): How many iterations of the burned-in MCMC sampler should be run before forests and parameters are retained. Defaults to `1`. Setting `keep_every = k` for some `k > 1` will "thin" the MCMC samples by retaining every `k`-th sample, rather than simply every sample. This can reduce the autocorrelation of the MCMC samples.
        * `num_chains` (`int`): How many independent MCMC chains should be sampled. If `num_mcmc = 0`, this is ignored. If `num_gfr = 0`, then each chain is run from root for `num_mcmc * keep_every + num_burnin` iterations, with `num_mcmc` samples retained. If `num_gfr > 0`, each MCMC chain will be initialized from a separate GFR ensemble, with the requirement that `num_gfr >= num_chains`. Defaults to `1`.
+       * `probit_outcome_model` (`bool`): Whether or not the outcome should be modeled as explicitly binary via a probit link. If `True`, `y` must only contain the values `0` and `1`. Default: `False`.

    mean_forest_params : dict, optional
        Dictionary of mean forest model parameters, each of which has a default value processed internally, so this argument is optional.
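With `probit_outcome_model` relocated from the mean-forest parameters to `general_params`, a binary outcome can be requested directly. A hedged sketch (the 0/1 vector `y_bin_train` is a hypothetical stand-in derived here from the demo script's `y_train` for illustration, not part of this commit):

import numpy as np
from stochtree import BARTModel

# Hypothetical binary outcome for illustration.
y_bin_train = (y_train > np.median(y_train)).astype(float)

probit_model = BARTModel()
probit_model.sample(
    X_train=X_train,
    y_train=y_bin_train,
    num_gfr=10,
    num_mcmc=100,
    general_params={"probit_outcome_model": True},
)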
@@ -146,7 +147,7 @@ def sample(
        * `sigma2_leaf_scale` (`float`): Scale parameter in the `IG(sigma2_leaf_shape, sigma2_leaf_scale)` leaf node parameter variance model. Calibrated internally as `0.5/num_trees` if not set here.
        * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the mean forest. Defaults to `None`.
        * `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the mean forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored.
-       * `probit_outcome_model` (`bool`): Whether or not the outcome should be modeled as explicitly binary via a probit link. If `True`, `y` must only contain the values `0` and `1`. Default: `False`.
+       * `num_features_subsample` (`int`): How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset.

    variance_forest_params : dict, optional
        Dictionary of variance forest model parameters, each of which has a default value processed internally, so this argument is optional.
@@ -162,6 +163,7 @@ def sample(
        * `var_forest_prior_scale` (`float`): Scale parameter in the [optional] `IG(var_forest_prior_shape, var_forest_prior_scale)` conditional error variance forest (which is only sampled if `num_trees > 0`). Calibrated internally as `num_trees / leaf_prior_calibration_param^2` if not set here.
        * `keep_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be included in the variance forest. Defaults to `None`.
        * `drop_vars` (`list` or `np.array`): Vector of variable names or column indices denoting variables that should be excluded from the variance forest. Defaults to `None`. If both `drop_vars` and `keep_vars` are set, `drop_vars` will be ignored.
+       * `num_features_subsample` (`int`): How many features to subsample when growing each tree for the GFR algorithm. Defaults to the number of features in the training dataset.

    previous_model_json : str, optional
        JSON string containing a previous BART model. This can be used to "continue" a sampler interactively after inspecting the samples or to run parallel chains "warm-started" from existing forest samples. Defaults to `None`.
@@ -206,6 +208,7 @@ def sample(
            "sigma2_leaf_scale": None,
            "keep_vars": None,
            "drop_vars": None,
+           "num_features_subsample": None,
        }
        mean_forest_params_updated = _preprocess_params(
            mean_forest_params_default, mean_forest_params
@@ -224,6 +227,7 @@ def sample(
            "var_forest_prior_scale": None,
            "keep_vars": None,
            "drop_vars": None,
+           "num_features_subsample": None,
        }
        variance_forest_params_updated = _preprocess_params(
            variance_forest_params_default, variance_forest_params
@@ -257,6 +261,7 @@ def sample(
        b_leaf = mean_forest_params_updated["sigma2_leaf_scale"]
        keep_vars_mean = mean_forest_params_updated["keep_vars"]
        drop_vars_mean = mean_forest_params_updated["drop_vars"]
+       num_features_subsample_mean = mean_forest_params_updated["num_features_subsample"]

        # 3. Variance forest parameters
        num_trees_variance = variance_forest_params_updated["num_trees"]
@@ -272,6 +277,7 @@ def sample(
        b_forest = variance_forest_params_updated["var_forest_prior_scale"]
        keep_vars_variance = variance_forest_params_updated["keep_vars"]
        drop_vars_variance = variance_forest_params_updated["drop_vars"]
+       num_features_subsample_variance = variance_forest_params_updated["num_features_subsample"]

        # Override keep_gfr if there are no MCMC samples
        if num_mcmc == 0:
@@ -714,6 +720,12 @@ def sample(
                [variable_subset_variance.count(i) == 0 for i in original_var_indices]
            ] = 0

+       # Set num_features_subsample to default, ncol(X_train), if not already set
+       if num_features_subsample_mean is None:
+           num_features_subsample_mean = X_train.shape[1]
+       if num_features_subsample_variance is None:
+           num_features_subsample_variance = X_train.shape[1]
+
        # Preliminary runtime checks for probit link
        if not self.include_mean_forest:
            self.probit_outcome_model = False
@@ -1048,7 +1060,8 @@ def sample(
            max_depth=max_depth_mean,
            leaf_model_type=leaf_model_mean_forest,
            leaf_model_scale=current_leaf_scale,
-           cutpoint_grid_size=cutpoint_grid_size,
+           cutpoint_grid_size=cutpoint_grid_size,
+           num_features_subsample=num_features_subsample_mean
        )
        forest_sampler_mean = ForestSampler(
            forest_dataset_train,
@@ -1071,6 +1084,7 @@
            cutpoint_grid_size=cutpoint_grid_size,
            variance_forest_shape=a_forest,
            variance_forest_scale=b_forest,
+           num_features_subsample=num_features_subsample_variance
        )
        forest_sampler_variance = ForestSampler(
            forest_dataset_train,
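Putting the pieces together, `num_features_subsample` can now be set independently for the mean and variance forests from Python. A hedged usage sketch (parameter names follow the docstrings above; the specific tree counts are arbitrary, and `X_train`/`y_train`/`X_test` are as in the demo script):

from stochtree import BARTModel

model = BARTModel()
model.sample(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    num_gfr=100,
    num_mcmc=0,
    mean_forest_params={"num_trees": 100, "num_features_subsample": 5},
    variance_forest_params={"num_trees": 50, "num_features_subsample": 10},
)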
