diff --git a/README.md b/README.md index e8c3303..23c1279 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # StochasticTree Python Package +**NOTE**: we are in the process of refactoring this project so that the R, Python, and C++ source code sits in the [same repo](https://github.com/StochasticTree/stochtree-cpp/). + ## Getting started The python package can be installed from source. Before you begin, make sure you have [conda](https://www.anaconda.com/download) installed. diff --git a/demo/notebooks/causal_inference.ipynb b/demo/notebooks/causal_inference.ipynb index 209cd4a..0e8663a 100644 --- a/demo/notebooks/causal_inference.ipynb +++ b/demo/notebooks/causal_inference.ipynb @@ -42,8 +42,7 @@ "outputs": [], "source": [ "# RNG\n", - "random_seed = 101\n", - "rng = np.random.default_rng(random_seed)\n", + "rng = np.random.default_rng()\n", "\n", "# Generate covariates and basis\n", "n = 1000\n", @@ -53,9 +52,8 @@ "Z = rng.binomial(1, pi_X, n).astype(float)\n", "\n", "# Define the outcome mean functions (prognostic and treatment effects)\n", - "mu_X = pi_X*5\n", - "# tau_X = np.sin(X[:,1]*2*np.pi)\n", - "tau_X = X[:,1]*2\n", + "mu_X = pi_X*5 + 2*X[:,2]\n", + "tau_X = (X[:,1]*2 - 1)\n", "\n", "# Generate outcome\n", "epsilon = rng.normal(0, 1, n)\n", @@ -105,7 +103,7 @@ "outputs": [], "source": [ "bcf_model = BCFModel()\n", - "bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=1000)" + "bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100)" ] }, { diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..231827d --- /dev/null +++ b/docs/README.md @@ -0,0 +1,18 @@ +# Python Package Documentation + +## Building Documentation Locally + +The online documentation is built automatically upon successful PR merge (see [here](https://github.com/StochasticTree/stochtree-python/blob/main/.github/workflows/docs.yml) for the GitHub workflow). +To build the documentation locally, first ensure that you have [Sphinx](https://www.sphinx-doc.org/en/master/) installed, then navigate to the Python package's main directory (i.e. `cd [path/to/stochtree-python]`), +install the package, and run `sphinx-build` as shown below: + +``` +pip install . +sphinx-build -M html docs/source/ docs/build/ +``` + +## Documentation Style + +Module (class, function, etc.) documentation follows [the numpy standard](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard), +applied in Sphinx using the [napoleon](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html) extension. 
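+ +For example, a function documented in this style (a hypothetical `center` helper, shown here only to illustrate the numpydoc format) looks like: + +``` +import numpy as np + +def center(x: np.ndarray) -> np.ndarray: + """Subtract the mean from an array. + + Parameters + ---------- + x : np.ndarray + Array to be centered. + + Returns + ------- + np.ndarray + Copy of ``x`` with its mean subtracted. + """ + return x - np.mean(x) +```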
+ diff --git a/docs/requirements.txt b/docs/requirements.txt index 897ca9c..ca08467 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,18 +4,23 @@ beautifulsoup4==4.12.3 certifi==2024.2.2 charset-normalizer==3.3.2 docutils==0.20.1 +exceptiongroup==1.2.1 furo==2024.5.6 idna==3.7 imagesize==1.4.1 importlib_metadata==7.1.0 +iniconfig==2.0.0 Jinja2==3.1.4 joblib==1.4.2 MarkupSafe==2.1.5 numpy==1.24.4 +numpydoc==1.7.0 packaging==24.0 pandas==2.0.3 +pluggy==1.5.0 pybind11==2.12.0 Pygments==2.18.0 +pytest==8.2.1 python-dateutil==2.9.0.post0 pytz==2024.1 requests==2.32.2 @@ -32,7 +37,9 @@ sphinxcontrib-htmlhelp==2.0.1 sphinxcontrib-jsmath==1.0.1 sphinxcontrib-qthelp==1.0.3 sphinxcontrib-serializinghtml==1.1.5 +tabulate==0.9.0 threadpoolctl==3.5.0 +tomli==2.0.1 tzdata==2024.1 urllib3==2.2.1 zipp==3.18.2 diff --git a/docs/source/conf.py b/docs/source/conf.py index b2fd35d..5d762fc 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,13 +21,13 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', + 'numpydoc' ] templates_path = ['_templates'] exclude_patterns = [] - # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output diff --git a/stochtree/__init__.py b/stochtree/__init__.py index b868e54..da68f6f 100644 --- a/stochtree/__init__.py +++ b/stochtree/__init__.py @@ -2,10 +2,11 @@ from .bcf import BCFModel from .data import Dataset, Residual from .forest import ForestContainer +from .preprocessing import CovariateTransformer from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer from .utils import NotSampledError __all__ = ['BARTModel', 'BCFModel', 'Dataset', 'Residual', 'ForestContainer', - 'RNG', 'ForestSampler', 'GlobalVarianceModel', 'LeafVarianceModel', - 'JSONSerializer', 'NotSampledError'] \ No newline at end of file + 'CovariateTransformer', 'RNG', 'ForestSampler', 'GlobalVarianceModel', + 'LeafVarianceModel', 'JSONSerializer', 'NotSampledError'] \ No newline at end of file diff --git a/stochtree/bart.py b/stochtree/bart.py index a5bdf0d..b58a75a 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -1,9 +1,13 @@ +""" +Bayesian Additive Regression Trees (BART) module +""" import numpy as np import pandas as pd from scipy.linalg import lstsq from scipy.stats import gamma from .data import Dataset, Residual from .forest import ForestContainer +from .preprocessing import CovariateTransformer from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel from .utils import NotSampledError @@ -20,67 +24,89 @@ def is_sampled(self) -> bool: return self.sampled def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = None, X_test: np.array = None, basis_test: np.array = None, - feature_types: np.array = None, cutpoint_grid_size = 100, sigma_leaf: float = None, alpha: float = 0.95, beta: float = 2.0, - min_samples_leaf: int = 5, nu: float = 3, lamb: float = None, a_leaf: float = 3, b_leaf: float = None, q: float = 0.9, - sigma2: float = None, num_trees: int = 200, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, - sample_sigma_global: bool = True, sample_sigma_leaf: bool = True, random_seed: int = -1, - keep_burnin: bool = False, keep_gfr: bool = False) -> None: + cutpoint_grid_size = 100, sigma_leaf: float = None, alpha: float = 0.95, beta: float = 2.0, min_samples_leaf: int = 5, + nu: float = 3, lamb: float = None, a_leaf: float 
= 3, b_leaf: float = None, q: float = 0.9, sigma2: float = None, + num_trees: int = 200, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, sample_sigma_global: bool = True, + sample_sigma_leaf: bool = True, random_seed: int = -1, keep_burnin: bool = False, keep_gfr: bool = False) -> None: """Runs a BART sampler on provided training set. Predictions will be cached for the training set and (if provided) the test set. Does not require a leaf regression basis. - :param X_train: Training set covariates on which trees may be partitioned - :type X_train: np.array - :param y_train: Training set outcome - :type y_train: np.array - :param basis_train: Optional training set basis vector used to define a regression to be run in the leaves of each tree - :type basis_train: np.array, optional - :param X_test: Test set covariates on which trees may be partitioned - :type X_test: np.array - :param basis_test: Optional test set basis vector used to define a regression to be run in the leaves of each tree. Must be included / omitted consistently (i.e. if basis_train is provided, then basis_test must be provided alongside X_test). - :type basis_test: np.array, optional - :param feature_types: Indicators of feature type (0 = numeric, 1 = ordered categorical, 2 = unordered categorical). If omitted, all covariates are assumed to be numeric. - :type feature_types: np.array, optional - :param cutpoint_grid_size: Maximum number of cutpoints to consider for each feature. Defaults to 100. - :type cutpoint_grid_size: int, optional - :param sigma_leaf: Scale parameter on the leaf node regression model. - :type sigma_leaf: float, optional - :param alpha: Prior probability of splitting for a tree of depth 0. Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. - :type alpha: float, optional - :param beta: Exponent that decreases split probabilities for nodes of depth > 0. Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. - :type beta: float, optional - :param min_samples_leaf: Minimum allowable size of a leaf, in terms of training samples. Defaults to 5. - :type min_samples_leaf: int, optional - :param nu: Shape parameter in the ``IG(nu, nu*lambda)`` global error variance model. Defaults to 3. - :type nu: float, optional - :param lambda: Component of the scale parameter in the ``IG(nu, nu*lambda)`` global error variance prior. If not specified, this is calibrated as in Sparapani et al (2021). - :type lambda: float, optional - :param a_leaf: Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model. Defaults to 3. - :type a_leaf: float, optional - :param b_leaf: Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model. Calibrated internally as ``0.5/num_trees`` if not set here. - :type b_leaf: float, optional - :param q: Quantile used to calibrated ``lambda`` as in Sparapani et al (2021). Defaults to 0.9. - :type q: float, optional - :param sigma2: Starting value of global variance parameter. Calibrated internally as in Sparapani et al (2021) if not set here. - :type sigma2: float, optional - :param num_trees: Number of trees in the ensemble. Defaults to 200. - :type num_trees: int, optional - :param num_gfr: Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to 5. - :type num_gfr: int, optional - :param num_burnin: Number of "burn-in" iterations of the MCMC sampler. Defaults to 0. 
- :type num_burnin: int, optional - :param num_mcmc: Number of "retained" iterations of the MCMC sampler. Defaults to 100. If this is set to 0, GFR (XBART) samples will be retained. - :type num_mcmc: int, optional - :param sample_sigma_global: Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(nu, nu*lambda)``. Defaults to True. - :type sample_sigma_global: bool, optional - :param sample_sigma_leaf: Whether or not to update the ``tau`` leaf scale variance parameter based on ``IG(a_leaf, b_leaf)``. Cannot (currently) be set to true if ``basis_train`` has more than one column. Defaults to True. - :type sample_sigma_leaf: bool, optional - :param random_seed: Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to ``std::random_device``. - :type random_seed: int, optional - :param keep_burnin: Whether or not "burnin" samples should be included in predictions. Defaults to False. Ignored if num_mcmc = 0. - :type keep_burnin: bool, optional - :param keep_gfr: Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to False. Ignored if num_mcmc = 0. - :type keep_gfr: bool, optional + Parameters + ---------- + X_train : np.array + Training set covariates on which trees may be partitioned. + y_train : np.array + Training set outcome. + basis_train : :obj:`np.array`, optional + Optional training set basis vector used to define a regression to be run in the leaves of each tree. + X_test : :obj:`np.array`, optional + Optional test set covariates. + basis_test : :obj:`np.array`, optional + Optional test set basis vector used to define a regression to be run in the leaves of each tree. + Must be included / omitted consistently (i.e. if basis_train is provided, then basis_test must be provided alongside X_test). + cutpoint_grid_size : :obj:`int`, optional + Maximum number of cutpoints to consider for each feature. Defaults to ``100``. + sigma_leaf : :obj:`float`, optional + Scale parameter on the leaf node regression model. + alpha : :obj:`float`, optional + Prior probability of splitting for a tree of depth 0. + Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. + beta : :obj:`float`, optional + Exponent that decreases split probabilities for nodes of depth > 0. + Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. + min_samples_leaf : :obj:`int`, optional + Minimum allowable size of a leaf, in terms of training samples. Defaults to ``5``. + nu : :obj:`float`, optional + Shape parameter in the ``IG(nu, nu*lamb)`` global error variance model. Defaults to ``3``. + lamb : :obj:`float`, optional + Component of the scale parameter in the ``IG(nu, nu*lamb)`` global error variance prior. If not specified, this is calibrated as in Sparapani et al (2021). + a_leaf : :obj:`float`, optional + Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model. Defaults to ``3``. + b_leaf : :obj:`float`, optional + Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model. Calibrated internally as ``0.5/num_trees`` if not set here. + q : :obj:`float`, optional + Quantile used to calibrate ``lamb`` as in Sparapani et al (2021). Defaults to ``0.9``. + sigma2 : :obj:`float`, optional + Starting value of global variance parameter. Calibrated internally as in Sparapani et al (2021) if not set here. + num_trees : :obj:`int`, optional + Number of trees in the ensemble. 
Defaults to ``200``. + num_gfr : :obj:`int`, optional + Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to ``5``. + num_burnin : :obj:`int`, optional + Number of "burn-in" iterations of the MCMC sampler. Defaults to ``0``. Ignored if ``num_gfr > 0``. + num_mcmc : :obj:`int`, optional + Number of "retained" iterations of the MCMC sampler. Defaults to ``100``. If this is set to 0, GFR (XBART) samples will be retained. + sample_sigma_global : :obj:`bool`, optional + Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(nu, nu*lamb)``. Defaults to ``True``. + sample_sigma_leaf : :obj:`bool`, optional + Whether or not to update the ``tau`` leaf scale variance parameter based on ``IG(a_leaf, b_leaf)``. Cannot (currently) be set to true if ``basis_train`` has more than one column. Defaults to ``True``. + random_seed : :obj:`int`, optional + Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to ``std::random_device``. + keep_burnin : :obj:`bool`, optional + Whether or not "burnin" samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. + keep_gfr : :obj:`bool`, optional + Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. + + Returns + ------- + self : BARTModel + Sampled BART Model. """ + # Check data inputs + if not isinstance(X_train, pd.DataFrame) and not isinstance(X_train, np.ndarray): + raise ValueError("X_train must be a pandas dataframe or numpy array") + if X_test is not None: + if not isinstance(X_test, pd.DataFrame) and not isinstance(X_test, np.ndarray): + raise ValueError("X_test must be a pandas dataframe or numpy array") + if not isinstance(y_train, np.ndarray): + raise ValueError("y_train must be a numpy array") + if basis_train is not None: + if not isinstance(basis_train, np.ndarray): + raise ValueError("basis_train must be a numpy array") + if basis_test is not None: + if not isinstance(basis_test, np.ndarray): + raise ValueError("basis_test must be a numpy array") + # Convert everything to standard shape (2-dimensional) if X_train.ndim == 1: X_train = np.expand_dims(X_train, 1) @@ -114,6 +140,14 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N if X_test is not None and basis_test is not None: if X_test.shape[0] != basis_test.shape[0]: raise ValueError("X_test and basis_test must have the same number of rows") + + # Covariate preprocessing + self._covariate_transformer = CovariateTransformer() + self._covariate_transformer.fit(X_train) + X_train_processed = self._covariate_transformer.transform(X_train) + if X_test is not None: + X_test_processed = self._covariate_transformer.transform(X_test) + feature_types = np.asarray(self._covariate_transformer._processed_feature_types) # Determine whether a test set is provided self.has_test = X_test is not None @@ -123,16 +157,12 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N # Unpack data dimensions self.n_train = y_train.shape[0] - self.n_test = X_test.shape[0] if self.has_test else 0 - self.num_covariates = X_train.shape[1] + self.n_test = X_test_processed.shape[0] if self.has_test else 0 + self.num_covariates = X_train_processed.shape[1] self.num_basis = basis_train.shape[1] if self.has_basis else 0 - - # Set feature type defaults if not provided - if feature_types is None: -
feature_types = np.zeros(self.num_covariates) # Set variable weights for the prognostic and treatment effect forests - variable_weights = np.repeat(1.0/X_train.shape[1], X_train.shape[1]) + variable_weights = np.repeat(1.0/self.num_covariates, self.num_covariates) # Scale outcome self.y_bar = np.squeeze(np.mean(y_train)) @@ -141,7 +171,7 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N # Calibrate priors for global sigma^2 and sigma_leaf if lamb is None: - reg_basis = np.c_[np.ones(self.n_train),X_train] + reg_basis = np.c_[np.ones(self.n_train),X_train_processed] reg_soln = lstsq(reg_basis, np.squeeze(resid_train)) sigma2hat = reg_soln[1] / self.n_train quantile_cutoff = q @@ -166,12 +196,12 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N # Forest Dataset (covariates and optional basis) forest_dataset_train = Dataset() - forest_dataset_train.add_covariates(X_train) + forest_dataset_train.add_covariates(X_train_processed) if self.has_basis: forest_dataset_train.add_basis(basis_train) if self.has_test: forest_dataset_test = Dataset() - forest_dataset_test.add_covariates(X_test) + forest_dataset_test.add_covariates(X_test_processed) if self.has_basis: forest_dataset_test.add_basis(basis_test) @@ -282,12 +312,17 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N def predict(self, covariates: np.array, basis: np.array = None) -> np.array: """Predict outcome from every retained forest of a BART sampler. - :param covariates: Test set covariates - :type covariates: np.array - :param basis: Optional test set basis vector, must be provided if the model was trained with a leaf regression basis - :type basis: np.array, optional - :return: Array of predictions with as many rows as in ``covariates`` and as many columns as retained samples of the algorithm. - :rtype: np.array + Parameters + ---------- + covariates : np.array + Test set covariates. + basis : :obj:`np.array`, optional + Optional test set basis vector, must be provided if the model was trained with a leaf regression basis. + + Returns + ------- + np.array + Array of predictions with as many rows as in ``covariates`` and as many columns as retained samples of the algorithm. 
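+ + Examples + -------- + A hypothetical end-to-end sketch (``X_train``, ``y_train``, and ``X_test`` are assumed to be numpy arrays prepared by the user; a model fit without ``basis_train`` is predicted without ``basis``): + + >>> bart_model = BARTModel() + >>> bart_model.sample(X_train=X_train, y_train=y_train, num_gfr=10, num_mcmc=100) + >>> y_hat_test = bart_model.predict(X_test)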
""" if not self.is_sampled(): msg = ( diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 9b6a539..ec1683f 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -1,3 +1,6 @@ +""" +Bayesian Causal Forests (BCF) module +""" import numpy as np import pandas as pd from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor @@ -8,6 +11,7 @@ from .bart import BARTModel from .data import Dataset, Residual from .forest import ForestContainer +from .preprocessing import CovariateTransformer from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel from .utils import NotSampledError @@ -24,7 +28,7 @@ def is_sampled(self) -> bool: return self.sampled def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_train: np.array = None, - X_test: np.array = None, Z_test: np.array = None, pi_test: np.array = None, feature_types: np.array = None, + X_test: np.array = None, Z_test: np.array = None, pi_test: np.array = None, cutpoint_grid_size = 100, sigma_leaf_mu: float = None, sigma_leaf_tau: float = None, alpha_mu: float = 0.95, alpha_tau: float = 0.25, beta_mu: float = 2.0, beta_tau: float = 3.0, min_samples_leaf_mu: int = 5, min_samples_leaf_tau: int = 5, nu: float = 3, lamb: float = None, @@ -37,86 +41,100 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra """Runs a BCF sampler on provided training set. Outcome predictions and estimates of the prognostic and treatment effect functions will be cached for the training set and (if provided) the test set. - :param X_train: Covariates used to split trees in the ensemble. Can be passed as either a matrix or dataframe. - :type X_train: np.array - :param Z_train: Vector of (continuous or binary) treatment assignments. - :type Z_train: np.array - :param y_train: Outcome to be modeled by the ensemble. - :type y_train: np.array - :param pi_train: Optional vector of propensity scores. If not provided, this will be estimated from the data. - :type pi_train: np.array, optional - :param X_test: Optional test set of covariates used to define "out of sample" evaluation data. - :type X_test: np.array - :param Z_test: Optional test set of (continuous or binary) treatment assignments. - :type Z_test: np.array, optional - :param pi_test: Optional test set vector of propensity scores. If not provided (but ``X_test`` and ``Z_test`` are), this will be estimated from the data. - :type pi_test: np.array, optional - :param feature_types: Indicators of feature type (0 = numeric, 1 = ordered categorical, 2 = unordered categorical). If omitted, all covariates are assumed to be numeric. - :type feature_types: np.array, optional - :param cutpoint_grid_size: Maximum number of cutpoints to consider for each feature. Defaults to 100. - :type cutpoint_grid_size: int, optional - :param sigma_leaf_mu: Starting value of leaf node scale parameter for the prognostic forest. Calibrated internally as ``2/num_trees_mu`` if not set here. - :type sigma_leaf_mu: float, optional - :param sigma_leaf_tau: Starting value of leaf node scale parameter for the treatment effect forest. Calibrated internally as ``1/num_trees_mu`` if not set here. - :type sigma_leaf_tau: float, optional - :param alpha_mu: Prior probability of splitting for a tree of depth 0 for the prognostic forest. Tree split prior combines ``alpha_mu`` and ``beta_mu`` via ``alpha_mu*(1+node_depth)^-beta_mu``. 
- :type alpha_mu: float, optional - :param alpha_tau: Prior probability of splitting for a tree of depth 0 for the treatment effect forest. Tree split prior combines ``alpha_tau`` and ``beta_tau`` via ``alpha_tau*(1+node_depth)^-beta_tau``. - :type alpha_tau: float, optional - :param beta_mu: Exponent that decreases split probabilities for nodes of depth > 0 for the prognostic forest. Tree split prior combines ``alpha_mu`` and ``beta_mu`` via ``alpha_mu*(1+node_depth)^-beta_mu``. - :type beta_mu: float, optional - :param beta_tau: Exponent that decreases split probabilities for nodes of depth > 0 for the treatment effect forest. Tree split prior combines ``alpha_tau`` and ``beta_tau`` via ``alpha_tau*(1+node_depth)^-beta_tau``. - :type beta_tau: float, optional - :param min_samples_leaf_mu: Minimum allowable size of a leaf, in terms of training samples, for the prognostic forest. Defaults to 5. - :type min_samples_leaf_mu: int, optional - :param min_samples_leaf_tau: Minimum allowable size of a leaf, in terms of training samples, for the treatment effect forest. Defaults to 5. - :type min_samples_leaf_tau: int, optional - :param nu: Shape parameter in the ``IG(nu, nu*lambda)`` global error variance model. Defaults to 3. - :type nu: float, optional - :param lambda: Component of the scale parameter in the ``IG(nu, nu*lambda)`` global error variance prior. If not specified, this is calibrated as in Sparapani et al (2021). - :type lambda: float, optional - :param a_leaf_mu: Shape parameter in the ``IG(a_leaf_mu, b_leaf_mu)`` leaf node parameter variance model for the prognostic forest. Defaults to 3. - :type a_leaf_mu: float, optional - :param a_leaf_tau: Shape parameter in the ``IG(a_leaf_tau, b_leaf_tau)`` leaf node parameter variance model for the treatment effect forest. Defaults to 3. - :type a_leaf_tau: float, optional - :param b_leaf_mu: Scale parameter in the ``IG(a_leaf_mu, b_leaf_mu)`` leaf node parameter variance model for the prognostic forest. Calibrated internally as ``0.5/num_trees_mu`` if not set here. - :type b_leaf_mu: float, optional - :param b_leaf_tau: Scale parameter in the ``IG(a_leaf_tau, b_leaf_tau)`` leaf node parameter variance model for the treatment effect forest. Calibrated internally as ``0.5/num_trees_tau`` if not set here. - :type b_leaf_tau: float, optional - :param q: Quantile used to calibrated ``lambda`` as in Sparapani et al (2021). Defaults to 0.9. - :type q: float, optional - :param sigma2: Starting value of global variance parameter. Calibrated internally as in Sparapani et al (2021) if not set here. - :type sigma2: float, optional - :param num_trees_mu: Number of trees in the prognostic forest. Defaults to 200. - :type num_trees_mu: int, optional - :param num_trees: Number of trees in the treatment effect forest. Defaults to 50. - :type num_trees: int, optional - :param num_gfr: Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to 5. - :type num_gfr: int, optional - :param num_burnin: Number of "burn-in" iterations of the MCMC sampler. Defaults to 0. - :type num_burnin: int, optional - :param num_mcmc: Number of "retained" iterations of the MCMC sampler. Defaults to 100. If this is set to 0, GFR (XBART) samples will be retained. - :type num_mcmc: int, optional - :param sample_sigma_global: Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(nu, nu*lambda)``. Defaults to True. 
- :type sample_sigma_global: bool, optional - :param sample_sigma_leaf_mu: Whether or not to update the leaf scale variance parameter based on ``IG(a_leaf_mu, b_leaf_mu)`` for the prognostic forest. Defaults to True. - :type sample_sigma_leaf_mu: bool, optional - :param sample_sigma_leaf_tau: Whether or not to update the leaf scale variance parameter based on ``IG(a_leaf_tau, b_leaf_tau)`` for the treatment effect forest. Defaults to True. - :type sample_sigma_leaf_tau: bool, optional - :param propensity_covariate: Whether to include the propensity score as a covariate in either or both of the forests. Enter "none" for neither, "mu" for the prognostic forest, "tau" for the treatment forest, and "both" for both forests. If this is not "none" and a propensity score is not provided, it will be estimated from (``X_train``, ``Z_train``) using ``BARTModel``. Defaults to "mu". - :type propensity_covariate: string, optional - :param adaptive_coding: Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via parameters ``b_0`` and ``b_1`` that attach to the outcome model ``[b_0 (1-Z) + b_1 Z] tau(X)``. This is ignored when Z is not binary. Defaults to True. - :type adaptive_coding: bool, optional - :param b_0: Initial value of the "control" group coding parameter. This is ignored when Z is not binary. Default: -0.5. - :type b_0: bool, optional - :param b_1: Initial value of the "control" group coding parameter. This is ignored when Z is not binary. Default: 0.5. - :type b_1: bool, optional - :param random_seed: Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to ``std::random_device``. - :type random_seed: int, optional - :param keep_burnin: Whether or not "burnin" samples should be included in predictions. Defaults to False. Ignored if num_mcmc = 0. - :type keep_burnin: bool, optional - :param keep_gfr: Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to False. Ignored if num_mcmc = 0. - :type keep_gfr: bool, optional + Parameters + ---------- + X_train : np.array or pd.DataFrame + Covariates used to split trees in the ensemble. Can be passed as either a matrix or dataframe. + Z_train : np.array + Array of (continuous or binary) treatment assignments. + y_train : np.array + Outcome to be modeled by the ensemble. + pi_train : :obj:`np.array`, optional + Optional vector of propensity scores. If not provided, this will be estimated from the data. + X_test : :obj:`np.array`, optional + Optional test set of covariates used to define "out of sample" evaluation data. + Z_test : :obj:`np.array`, optional + Optional test set of (continuous or binary) treatment assignments. + Must be provided if ``X_test`` is provided. + pi_test : :obj:`np.array`, optional + Optional test set vector of propensity scores. If not provided (but ``X_test`` and ``Z_test`` are), this will be estimated from the data. + cutpoint_grid_size : :obj:`int`, optional + Maximum number of cutpoints to consider for each feature. Defaults to ``100``. + sigma_leaf_mu : :obj:`float`, optional + Starting value of leaf node scale parameter for the prognostic forest. Calibrated internally as ``2/num_trees_mu`` if not set here. + sigma_leaf_tau : :obj:`float`, optional + Starting value of leaf node scale parameter for the treatment effect forest. Calibrated internally as ``1/num_trees_mu`` if not set here. 
+ alpha_mu : :obj:`float`, optional + Prior probability of splitting for a tree of depth 0 for the prognostic forest. + Tree split prior combines ``alpha_mu`` and ``beta_mu`` via ``alpha_mu*(1+node_depth)^-beta_mu``. + alpha_tau : :obj:`float`, optional + Prior probability of splitting for a tree of depth 0 for the treatment effect forest. + Tree split prior combines ``alpha_tau`` and ``beta_tau`` via ``alpha_tau*(1+node_depth)^-beta_tau``. + beta_mu : :obj:`float`, optional + Exponent that decreases split probabilities for nodes of depth > 0 for the prognostic forest. + Tree split prior combines ``alpha_mu`` and ``beta_mu`` via ``alpha_mu*(1+node_depth)^-beta_mu``. + beta_tau : :obj:`float`, optional + Exponent that decreases split probabilities for nodes of depth > 0 for the treatment effect forest. + Tree split prior combines ``alpha_tau`` and ``beta_tau`` via ``alpha_tau*(1+node_depth)^-beta_tau``. + min_samples_leaf_mu : :obj:`int`, optional + Minimum allowable size of a leaf, in terms of training samples, for the prognostic forest. Defaults to ``5``. + min_samples_leaf_tau : :obj:`int`, optional + Minimum allowable size of a leaf, in terms of training samples, for the treatment effect forest. Defaults to ``5``. + nu : :obj:`float`, optional + Shape parameter in the ``IG(nu, nu*lamb)`` global error variance model. Defaults to ``3``. + lamb : :obj:`float`, optional + Component of the scale parameter in the ``IG(nu, nu*lamb)`` global error variance prior. If not specified, this is calibrated as in Sparapani et al (2021). + a_leaf_mu : :obj:`float`, optional + Shape parameter in the ``IG(a_leaf_mu, b_leaf_mu)`` leaf node parameter variance model for the prognostic forest. Defaults to ``3``. + a_leaf_tau : :obj:`float`, optional + Shape parameter in the ``IG(a_leaf_tau, b_leaf_tau)`` leaf node parameter variance model for the treatment effect forest. Defaults to ``3``. + b_leaf_mu : :obj:`float`, optional + Scale parameter in the ``IG(a_leaf_mu, b_leaf_mu)`` leaf node parameter variance model for the prognostic forest. Calibrated internally as ``0.5/num_trees_mu`` if not set here. + b_leaf_tau : :obj:`float`, optional + Scale parameter in the ``IG(a_leaf_tau, b_leaf_tau)`` leaf node parameter variance model for the treatment effect forest. Calibrated internally as ``0.5/num_trees_tau`` if not set here. + q : :obj:`float`, optional + Quantile used to calibrate ``lamb`` as in Sparapani et al (2021). Defaults to ``0.9``. + sigma2 : :obj:`float`, optional + Starting value of global variance parameter. Calibrated internally as in Sparapani et al (2021) if not set here. + num_trees_mu : :obj:`int`, optional + Number of trees in the prognostic forest. Defaults to ``200``. + num_trees_tau : :obj:`int`, optional + Number of trees in the treatment effect forest. Defaults to ``50``. + num_gfr : :obj:`int`, optional + Number of "warm-start" iterations run using the grow-from-root algorithm (He and Hahn, 2021). Defaults to ``5``. + num_burnin : :obj:`int`, optional + Number of "burn-in" iterations of the MCMC sampler. Defaults to ``0``. Ignored if ``num_gfr > 0``. + num_mcmc : :obj:`int`, optional + Number of "retained" iterations of the MCMC sampler. Defaults to ``100``. If this is set to 0, GFR (XBART) samples will be retained. + sample_sigma_global : :obj:`bool`, optional + Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(nu, nu*lamb)``. Defaults to ``True``. + sample_sigma_leaf_mu : :obj:`bool`, optional + Whether or not to update the leaf scale variance parameter based on ``IG(a_leaf_mu, b_leaf_mu)`` for the prognostic forest. 
+ Defaults to ``True``. + sample_sigma_leaf_tau : :obj:`bool`, optional + Whether or not to update the leaf scale variance parameter based on ``IG(a_leaf_tau, b_leaf_tau)`` for the treatment effect forest. + Defaults to ``True``. + propensity_covariate : :obj:`str`, optional + Whether to include the propensity score as a covariate in either or both of the forests. Enter ``"none"`` for neither, ``"mu"`` for the prognostic forest, ``"tau"`` for the treatment forest, and ``"both"`` for both forests. + If this is not ``"none"`` and a propensity score is not provided, it will be estimated from (``X_train``, ``Z_train``) using ``BARTModel``. Defaults to ``"mu"``. + adaptive_coding : :obj:`bool`, optional + Whether or not to use an "adaptive coding" scheme in which a binary treatment variable is not coded manually as (0,1) or (-1,1) but learned via + parameters ``b_0`` and ``b_1`` that attach to the outcome model ``[b_0 (1-Z) + b_1 Z] tau(X)``. This is ignored when Z is not binary. Defaults to ``True``. + b_0 : :obj:`float`, optional + Initial value of the "control" group coding parameter. This is ignored when ``Z`` is not binary. Default: ``-0.5``. + b_1 : :obj:`float`, optional + Initial value of the "treated" group coding parameter. This is ignored when ``Z`` is not binary. Default: ``0.5``. + random_seed : :obj:`int`, optional + Integer parameterizing the C++ random number generator. If not specified, the C++ random number generator is seeded according to ``std::random_device``. + keep_burnin : :obj:`bool`, optional + Whether or not "burnin" samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. + keep_gfr : :obj:`bool`, optional + Whether or not "warm-start" / grow-from-root samples should be included in predictions. Defaults to ``False``. Ignored if ``num_mcmc == 0``. + + Returns + ------- + self : BCFModel + Sampled BCF Model. 
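+ + Examples + -------- + A sketch mirroring the causal inference demo notebook (``X_train``, ``Z_train``, ``y_train``, ``pi_train``, ``X_test``, ``Z_test``, and ``pi_test`` are assumed to be prepared as in that notebook): + + >>> bcf_model = BCFModel() + >>> bcf_model.sample(X_train, Z_train, y_train, pi_train, X_test, Z_test, pi_test, num_gfr=10, num_mcmc=100)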
""" # Convert everything to standard shape (2-dimensional) if X_train.ndim == 1: @@ -158,6 +176,14 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra if X_test is not None and pi_test is not None: if X_test.shape[0] != pi_test.shape[0]: raise ValueError("X_test and pi_test must have the same number of rows") + + # Covariate preprocessing + self._covariate_transformer = CovariateTransformer() + self._covariate_transformer.fit(X_train) + X_train_processed = self._covariate_transformer.transform(X_train) + if X_test is not None: + X_test_processed = self._covariate_transformer.transform(X_test) + feature_types = np.asarray(self._covariate_transformer._processed_feature_types) # Determine whether a test set is provided self.has_test = X_test is not None @@ -165,7 +191,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Unpack data dimensions self.n_train = y_train.shape[0] self.n_test = X_test.shape[0] if self.has_test else 0 - self.p_x = X_train.shape[1] + self.p_x = X_train_processed.shape[1] # Check whether treatment is binary self.binary_treatment = np.unique(Z_train).size == 2 @@ -180,50 +206,46 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra self.bart_propensity_model = BARTModel() if self.has_test: pi_test = np.mean(self.bart_propensity_model.y_hat_test, axis = 1, keepdims = True) - self.bart_propensity_model.sample(X_train=X_train, y_train=Z_train, X_test=X_test, num_gfr=10, num_mcmc=10) + self.bart_propensity_model.sample(X_train=X_train_processed, y_train=Z_train, X_test=X_test_processed, num_gfr=10, num_mcmc=10) pi_train = np.mean(self.bart_propensity_model.y_hat_train, axis = 1, keepdims = True) pi_test = np.mean(self.bart_propensity_model.y_hat_test, axis = 1, keepdims = True) else: - self.bart_propensity_model.sample(X_train=X_train, y_train=Z_train, num_gfr=10, num_mcmc=10) + self.bart_propensity_model.sample(X_train=X_train_processed, y_train=Z_train, num_gfr=10, num_mcmc=10) pi_train = np.mean(self.bart_propensity_model.y_hat_train, axis = 1, keepdims = True) self.internal_propensity_model = True else: self.internal_propensity_model = False - # Set feature type defaults if not provided - if feature_types is None: - feature_types = np.zeros(self.p_x) - # Update covariates to include propensities if requested if propensity_covariate == "mu": feature_types_mu = np.append(feature_types, 0).astype('int') feature_types_tau = feature_types.astype('int') - X_train_mu = np.c_[X_train, pi_train] - X_train_tau = X_train + X_train_mu = np.c_[X_train_processed, pi_train] + X_train_tau = X_train_processed if self.has_test: X_test_mu = np.c_[X_test, pi_test] X_test_tau = X_test elif propensity_covariate == "tau": feature_types_tau = np.append(feature_types, 0).astype('int') feature_types_mu = feature_types.astype('int') - X_train_tau = np.c_[X_train, pi_train] - X_train_mu = X_train + X_train_tau = np.c_[X_train_processed, pi_train] + X_train_mu = X_train_processed if self.has_test: X_test_tau = np.c_[X_test, pi_test] X_test_mu = X_test elif propensity_covariate == "both": feature_types_tau = np.append(feature_types, 0).astype('int') feature_types_mu = np.append(feature_types, 0).astype('int') - X_train_tau = np.c_[X_train, pi_train] - X_train_mu = np.c_[X_train, pi_train] + X_train_tau = np.c_[X_train_processed, pi_train] + X_train_mu = np.c_[X_train_processed, pi_train] if self.has_test: X_test_tau = np.c_[X_test, pi_test] X_test_mu = np.c_[X_test, pi_test] elif propensity_covariate == 
"none": feature_types_tau = feature_types.astype('int') feature_types_mu = feature_types.astype('int') - X_train_tau = X_train - X_train_mu = X_train + X_train_tau = X_train_processed + X_train_mu = X_train_processed if self.has_test: X_test_tau = X_test X_test_mu = X_test @@ -244,7 +266,7 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra # Calibrate priors for global sigma^2 and sigma_leaf_mu / sigma_leaf_tau if lamb is None: - reg_basis = np.c_[np.ones(self.n_train),X_train] + reg_basis = np.c_[np.ones(self.n_train),X_train_processed] reg_soln = lstsq(reg_basis, np.squeeze(resid_train)) sigma2hat = reg_soln[1] / self.n_train quantile_cutoff = q @@ -496,14 +518,19 @@ def sample(self, X_train: np.array, Z_train: np.array, y_train: np.array, pi_tra def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> np.array: """Predict CATE function for every provided observation. - :param X: Test set covariates - :type X: np.array - :param Z: Test set treatment indicators - :type Z: np.array - :param propensity: Optional test set propensities. Must be provided if propensities were provided when ``.sample()`` was run and propensity scores were included in the CATE model. - :type propensity: np.array, optional - :return: Array with as many rows as in ``X`` and as many columns as retained samples of the algorithm. - :rtype: np.array + Parameters + ---------- + X : np.array or pd.DataFrame + Test set covariates. + Z : np.array + Test set treatment indicators. + propensity : :obj:`np.array`, optional + Optional test set propensities. Must be provided if propensities were provided when the model was sampled. + + Returns + ------- + np.array + Array with as many rows as in ``X`` and as many columns as retained samples of the algorithm. """ if not self.is_sampled(): msg = ( @@ -559,14 +586,21 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> np.a """Predict outcome model components (CATE function and prognostic function) as well as overall outcome for every provided observation. Predicted outcomes are computed as ``yhat = mu_x + Z*tau_x`` where mu_x is a sample of the prognostic function and tau_x is a sample of the treatment effect (CATE) function. - :param X: Test set covariates - :type X: np.array - :param Z: Test set treatment indicators - :type Z: np.array - :param propensity: Optional test set propensities. Must be provided if propensities were provided when ``.sample()`` was run. - :type propensity: np.array, optional - :return: Tuple of arrays with as many rows as in ``X`` and as many columns as retained samples of the algorithm. The first entry of the tuple contains conditional average treatment effect (CATE) samples, the second entry contains prognostic effect samples, and the third entry contains outcome prediction samples - :rtype: tuple + Parameters + ---------- + X : np.array or pd.DataFrame + Test set covariates. + Z : np.array + Test set treatment indicators. + propensity : :obj:`np.array`, optional + Optional test set propensities. Must be provided if propensities were provided when the model was sampled. + + Returns + ------- + tuple of np.array + Tuple of arrays with as many rows as in ``X`` and as many columns as retained samples of the algorithm. 
+ The first entry of the tuple contains conditional average treatment effect (CATE) samples, + the second entry contains prognostic effect samples, and the third entry contains outcome prediction samples. """ if not self.is_sampled(): msg = ( diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py new file mode 100644 index 0000000..905cab8 --- /dev/null +++ b/stochtree/preprocessing.py @@ -0,0 +1,266 @@ +""" +Data preprocessing module, drawn largely from the sklearn preprocessing module, released under the BSD-3-Clause license, with the following copyright + +Copyright (c) 2007-2024 The scikit-learn developers. +""" +from typing import Union, Optional, Any +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +from sklearn.utils.validation import check_array, column_or_1d +import numpy as np +import pandas as pd +import warnings + +class CovariateTransformer: + """Class that transforms covariates to a format that can be used to define tree splits + """ + + def __init__(self) -> None: + self._is_fitted = False + self._ordinal_encoders = [] + self._onehot_encoders = [] + self._ordinal_feature_index = [] + self._onehot_feature_index = [] + self._processed_feature_types = [] + self._original_feature_types = [] + + def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool: + if dtype.kind == "b" or dtype.kind == "i" or dtype.kind == "u" or dtype.kind == "f": + return True + else: + return False + + def _process_unordered_categorical(self, covariate: pd.Series) -> int: + num_onehot = len(self._onehot_encoders) + if isinstance(covariate.dtype, pd.CategoricalDtype): + # Categorical columns expose a category list directly; preserve its ordering + enc = OneHotEncoder(categories=[covariate.array.categories.to_list()], sparse_output=False) + else: + # String (non-categorical) columns have no ``.categories``, so let the encoder infer them + enc = OneHotEncoder(sparse_output=False) + enc.fit(pd.DataFrame(covariate)) + self._onehot_encoders.append(enc) + return num_onehot + + def _process_ordered_categorical(self, covariate: pd.Series) -> int: + num_ord = len(self._ordinal_encoders) + category_list = covariate.array.categories.to_list() + enc = OrdinalEncoder(categories=[category_list]) + enc.fit(pd.DataFrame(covariate)) + self._ordinal_encoders.append(enc) + return num_ord + + def _fit_pandas(self, covariates: pd.DataFrame) -> None: + self._num_original_features = covariates.shape[1] + self._ordinal_feature_index = [-1 for i in range(self._num_original_features)] + self._onehot_feature_index = [-1 for i in range(self._num_original_features)] + self._original_feature_types = [-1 for i in range(self._num_original_features)] + datetime_types = covariates.apply(lambda x: pd.api.types.is_datetime64_any_dtype(x)) + object_types = covariates.apply(lambda x: pd.api.types.is_object_dtype(x)) + interval_types = covariates.apply(lambda x: isinstance(x.dtype, pd.IntervalDtype)) + period_types = covariates.apply(lambda x: isinstance(x.dtype, pd.PeriodDtype)) + timedelta_types = np.logical_or(covariates.apply(lambda x: pd.api.types.is_timedelta64_dtype(x)), + covariates.apply(lambda x: pd.api.types.is_timedelta64_ns_dtype(x))) + sparse_types = covariates.apply(lambda x: isinstance(x.dtype, pd.SparseDtype)) + bool_types = covariates.apply(lambda x: pd.api.types.is_bool_dtype(x)) + categorical_types = covariates.apply(lambda x: isinstance(x.dtype, pd.CategoricalDtype)) + float_types = covariates.apply(lambda x: pd.api.types.is_float_dtype(x)) + integer_types = covariates.apply(lambda x: pd.api.types.is_integer_dtype(x)) + string_types = covariates.apply(lambda x: pd.api.types.is_string_dtype(x)) + if np.any(datetime_types): + # raise ValueError("DateTime columns are currently unsupported") + datetime_cols =
covariates.columns[datetime_types].to_list() + warn_msg = "The following columns are a type unsupported by stochtree (DateTime) and will be ignored: {}" + warnings.warn(warn_msg.format(datetime_cols)) + if np.any(interval_types): + # raise ValueError("Interval columns are currently unsupported") + interval_cols = covariates.columns[interval_types].to_list() + warn_msg = "The following columns are a type unsupported by stochtree (Interval) and will be ignored: {}" + warnings.warn(warn_msg.format(interval_cols)) + if np.any(period_types): + # raise ValueError("Period columns are currently unsupported") + period_cols = covariates.columns[period_types].to_list() + warn_msg = "The following columns are a type unsupported by stochtree (Period) and will be ignored: {}" + warnings.warn(warn_msg.format(period_cols)) + if np.any(timedelta_types): + # raise ValueError("TimeDelta columns are currently unsupported") + timedelta_cols = covariates.columns[timedelta_types].to_list() + warn_msg = "The following columns are a type unsupported by stochtree (TimeDelta) and will be ignored: {}" + warnings.warn(warn_msg.format(timedelta_cols)) + if np.any(sparse_types): + # raise ValueError("Sparse columns are currently unsupported") + sparse_cols = covariates.columns[sparse_types].to_list() + warn_msg = "The following columns are a type unsupported by stochtree (Sparse) and will be ignored: {}" + warnings.warn(warn_msg.format(sparse_cols)) + if np.any(object_types): + # raise ValueError("Object columns are currently unsupported") + object_cols = covariates.columns[object_types].to_list() + warn_msg = "The following columns are a type unsupported by stochtree (object) and will be ignored: {}" + warnings.warn(warn_msg.format(object_cols)) + + for i in range(covariates.shape[1]): + covariate = covariates.iloc[:,i] + if categorical_types.iloc[i]: + self._original_feature_types[i] = "category" + if covariate.array.ordered: + ord_index = self._process_ordered_categorical(covariate) + self._ordinal_feature_index[i] = ord_index + self._processed_feature_types.append(1) + else: + onehot_index = self._process_unordered_categorical(covariate) + self._onehot_feature_index[i] = onehot_index + feature_ones = np.repeat(1, len(covariate.array.categories)).tolist() + self._processed_feature_types.extend(feature_ones) + elif string_types.iloc[i]: + self._original_feature_types[i] = "string" + onehot_index = self._process_unordered_categorical(covariate) + self._onehot_feature_index[i] = onehot_index + feature_ones = np.repeat(1, len(self._onehot_encoders[onehot_index].categories_[0])).tolist() + self._processed_feature_types.extend(feature_ones) + elif bool_types.iloc[i]: + self._original_feature_types[i] = "boolean" + self._processed_feature_types.append(1) + elif integer_types.iloc[i]: + self._original_feature_types[i] = "integer" + self._processed_feature_types.append(0) + elif float_types.iloc[i]: + self._original_feature_types[i] = "float" + self._processed_feature_types.append(0) + else: + self._original_feature_types[i] = "unsupported" + + def _fit_numpy(self, covariates: np.array) -> None: + if covariates.ndim == 1: + covariates = np.expand_dims(covariates, 1) + elif covariates.ndim > 2: + raise ValueError("Covariates passed as a numpy array must be 1d or 2d") + + self._num_original_features = covariates.shape[1] + self._ordinal_feature_index = [-1 for i in range(self._num_original_features)] + self._onehot_feature_index = [-1 for i in range(self._num_original_features)] + self._original_feature_types = ["float" for i 
in range(self._num_original_features)] + + # Check whether the array is numeric + cov_dtype = covariates.dtype + if len(cov_dtype) == 0: + array_numeric = True + else: + array_numeric = True + for i in range(len(cov_dtype)): + if not self._check_is_numeric_dtype(cov_dtype[i]): + array_numeric = False + if not array_numeric: + raise ValueError("Covariates passed as np.array must all be simple numeric types (bool, integer, unsigned integer, floating point)") + + # Scan for binary columns + for i in range(self._num_original_features): + num_unique = np.unique(covariates[:,i]).size + if num_unique == 2: + self._processed_feature_types.append(1) + else: + self._processed_feature_types.append(0) + + def _fit(self, covariates: Union[pd.DataFrame, np.array]) -> None: + if isinstance(covariates, pd.DataFrame): + self._fit_pandas(covariates) + elif isinstance(covariates, np.ndarray): + self._fit_numpy(covariates) + else: + raise ValueError("covariates must be a pd.DataFrame or a np.array") + self._is_fitted = True + + def _transform_pandas(self, covariates: pd.DataFrame) -> np.array: + if self._num_original_features != covariates.shape[1]: + raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality") + + output_array = np.empty((covariates.shape[0], len(self._processed_feature_types)), dtype=np.float64) + output_iter = 0 + for i in range(covariates.shape[1]): + covariate = covariates.iloc[:,i] + if self._original_feature_types[i] == "category" or self._original_feature_types[i] == "string": + if self._ordinal_feature_index[i] != -1: + ord_ind = self._ordinal_feature_index[i] + covariate_transformed = self._ordinal_encoders[ord_ind].transform(pd.DataFrame(covariate)) + output_array[:,output_iter] = np.squeeze(covariate_transformed) + output_iter += 1 + else: + onehot_ind = self._onehot_feature_index[i] + covariate_transformed = self._onehot_encoders[onehot_ind].transform(pd.DataFrame(covariate)) + output_dim = covariate_transformed.shape[1] + output_array[:,np.arange(output_iter, output_iter + output_dim)] = np.squeeze(covariate_transformed) + output_iter += output_dim + + elif self._original_feature_types[i] == "boolean": + output_array[:,output_iter] = (covariate*1.0).to_numpy() + output_iter += 1 + + elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float": + output_array[:,output_iter] = (covariate).to_numpy() + output_iter += 1 + + return output_array + + def _transform_numpy(self, covariates: np.array) -> np.array: + if covariates.ndim == 1: + covariates = np.expand_dims(covariates, 1) + elif covariates.ndim > 2: + raise ValueError("Covariates passed as a numpy array must be 1d or 2d") + if self._num_original_features != covariates.shape[1]: + raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality") + + return covariates + + def _transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: + if self._check_is_fitted(): + if isinstance(covariates, pd.DataFrame): + return self._transform_pandas(covariates) + elif isinstance(covariates, np.ndarray): + return self._transform_numpy(covariates) + else: + raise ValueError("covariates must be a pd.DataFrame or a np.array") + else: + raise ValueError("Attempting to call transform() from a CovariateTransformer that has not yet been fit") + + def _check_is_fitted(self) -> bool: + return self._is_fitted + + def fit(self, covariates: Union[pd.DataFrame,
np.array]) -> None: + """Fits a ``CovariateTransformer`` by unpacking (and storing) data type information on the input (raw) covariates + and then converting to a numpy array which can be passed to a tree ensemble sampler. + + If ``covariates`` is a ``pd.DataFrame``, `column dtypes `_ + will be handled as follows: + + * ``category``: one-hot encoded if unordered, ordinal encoded if ordered + * ``string``: one-hot encoded + * ``boolean``: passed through as binary integer, treated as ordered categorical by tree samplers + * integer (i.e. ``Int8``, ``Int16``, etc...): passed through as double (**note**: if you have categorical data stored as integers, you should explicitly convert it to categorical in pandas, see this `user guide `_) + * float (i.e. ``Float32``, ``Float64``): passed through as double + * ``object``: currently unsupported, convert object columns to numeric or categorical before passing + * Datetime (i.e. ``datetime64``): currently unsupported, though datetime columns can be converted to numeric features, see `here `_ + * Period (i.e. ``period[]``): currently unsupported, though period columns can be converted to numeric features, see `here `_ + * Interval (i.e. ``interval``, ``Interval[datetime64[ns]]``): currently unsupported, though interval columns can be converted to numeric or categorical features, see `here `_ + * Sparse (i.e. ``Sparse``, ``Sparse[float]``): currently unsupported, convert sparse columns to dense before passing + + Columns with unsupported types will be ignored, with a warning. + + If ``covariates`` is a ``np.array``, columns must be numeric and the only preprocessing done by ``CovariateTransformer.fit()`` is to + auto-detect binary columns. All other integer-valued columns will be passed through to the tree sampler as (continuous) numeric data. + If you would like to treat integer-valued data as categorical, you can either convert your numpy array to a pandas dataframe and + explicitly tag such columns as ordered / unordered categorical, or preprocess manually using ``sklearn.preprocessing.OneHotEncoder`` + and ``sklearn.preprocessing.OrdinalEncoder``. + + Parameters + ---------- + covariates : np.array or pd.DataFrame + Covariates to be preprocessed. + + Returns + ------- + self : CovariateTransformer + Fitted CovariateTransformer. + """ + self._fit(covariates) + return self + + def transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: + return self._transform(covariates) + + def fit_transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: + self._fit(covariates) + return self._transform(covariates) diff --git a/stochtree/utils.py b/stochtree/utils.py index 2e67743..be09935 100644 --- a/stochtree/utils.py +++ b/stochtree/utils.py @@ -1,4 +1,3 @@ - class NotSampledError(ValueError, AttributeError): """Exception class to raise if attempting to predict from a model before it has been sampled. 
diff --git a/test/test_preprocessor.py b/test/test_preprocessor.py new file mode 100644 index 0000000..b9a7a51 --- /dev/null +++ b/test/test_preprocessor.py @@ -0,0 +1,74 @@ +import numpy as np +import pandas as pd +from stochtree import CovariateTransformer + +class TestPreprocessor: + def test_numpy(self): + cov_transformer = CovariateTransformer() + np_1 = np.array( + [[1.5, 8.7, 1.2], + [2.7, 3.4, 5.4], + [3.6, 1.2, 9.3], + [4.4, 5.4, 10.4], + [5.3, 9.3, 3.6], + [6.1, 10.4, 4.4]] + ) + np_1_transformed = cov_transformer.fit_transform(np_1) + np.testing.assert_array_equal(np_1, np_1_transformed) + assert cov_transformer._processed_feature_types == [0,0,0] + + def test_pandas(self): + df_1 = pd.DataFrame( + {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": [8.7, 3.4, 1.2, 5.4, 9.3, 10.4], + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + ) + np_1 = np.array( + [[1.5, 8.7, 1.2], + [2.7, 3.4, 5.4], + [3.6, 1.2, 9.3], + [4.4, 5.4, 10.4], + [5.3, 9.3, 3.6], + [6.1, 10.4, 4.4]] + ) + cov_transformer = CovariateTransformer() + df_1_transformed = cov_transformer.fit_transform(df_1) + np.testing.assert_array_equal(np_1, df_1_transformed) + assert cov_transformer._processed_feature_types == [0,0,0] + + df_2 = pd.DataFrame( + {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, categories=['c', 'b', 'a']), + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + ) + np_2 = np.array( + [[1.5, 2, 1.2], + [2.7, 1, 5.4], + [3.6, 0, 9.3], + [4.4, 2, 10.4], + [5.3, 1, 3.6], + [6.1, 0, 4.4]] + ) + cov_transformer = CovariateTransformer() + df_2_transformed = cov_transformer.fit_transform(df_2) + np.testing.assert_array_equal(np_2, df_2_transformed) + assert cov_transformer._processed_feature_types == [0,1,0] + + df_3 = pd.DataFrame( + {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']), + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + ) + np_3 = np.array( + [[1.5, 0, 0, 1, 1.2], + [2.7, 0, 1, 0, 5.4], + [3.6, 1, 0, 0, 9.3], + [4.4, 0, 0, 1, 10.4], + [5.3, 0, 1, 0, 3.6], + [6.1, 1, 0, 0, 4.4]] + ) + cov_transformer = CovariateTransformer() + df_3_transformed = cov_transformer.fit_transform(df_3) + np.testing.assert_array_equal(np_3, df_3_transformed) + assert cov_transformer._processed_feature_types == [0,1,1,1,0] +
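+ + def test_pandas_boolean(self): + # Sketch of an additional check (not part of the original PR's tests): per the + # CovariateTransformer.fit docstring, boolean columns pass through as 0/1 + # and are tagged as ordered categorical (feature type 1) + df_4 = pd.DataFrame( + {"x1": [1.5, 2.7, 3.6, 4.4], + "x2": [True, False, True, False]} + ) + np_4 = np.array( + [[1.5, 1], + [2.7, 0], + [3.6, 1], + [4.4, 0]] + ) + cov_transformer = CovariateTransformer() + df_4_transformed = cov_transformer.fit_transform(df_4) + np.testing.assert_array_equal(np_4, df_4_transformed) + assert cov_transformer._processed_feature_types == [0,1]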