Cleanup API / models / simulators / estimators / unittests #130

Merged
37 commits merged on Mar 7, 2022

Commits (37)
365bcab
linting using black / isort, fixed imports
picciama Feb 23, 2022
425f0a6
separate properties from input_data
picciama Feb 23, 2022
1535856
major refactoring removing train/models:
picciama Feb 24, 2022
eb1e551
refactoring numpy estimator:
picciama Feb 25, 2022
fd06d26
cleanup: working fitting procedure:
picciama Feb 25, 2022
b25b315
code reformatting with black / isort and linting
picciama Feb 25, 2022
da23c4f
remove unnecessary compute()
picciama Feb 25, 2022
23ee162
add missing decorators
picciama Feb 25, 2022
01a4574
override theta_loc/scale in training
picciama Feb 28, 2022
fd50d5c
unittests cleanup
picciama Mar 1, 2022
d5b0721
support InputDataGLM when simulating
picciama Mar 1, 2022
66fe624
removed @dask_compute
picciama Mar 1, 2022
784c28e
reintroduced compute()
picciama Mar 1, 2022
f1ae683
cleanup, fix imports and syntax errors
picciama Mar 1, 2022
a3cb3c4
fixed imports, api and type annotations using mypy
picciama Mar 1, 2022
8385c87
reformatting using black
picciama Mar 1, 2022
c186b3d
cleanup using flake8, mypy, black, isort
picciama Mar 1, 2022
2e2c755
added type hints everywhere / fixed mypy issues
picciama Mar 3, 2022
cfaf056
reformatting using black / isort
picciama Mar 3, 2022
04d0e5f
Merge branch 'development' into mp/integrate_simulator
picciama Mar 3, 2022
6a76eb8
reformatting using black / isort
picciama Mar 3, 2022
611e2af
corrected data utils import
picciama Mar 4, 2022
dc63263
better documentation for simulation functions
picciama Mar 4, 2022
b493b54
refactored modelContainer.py -> model_container.py
picciama Mar 4, 2022
12e5909
make model_container attribute in parent estimator
picciama Mar 4, 2022
3767ea4
flake8 / black / isort reformatting
picciama Mar 4, 2022
bd4a619
bugfix: correctly set attribute model_container
picciama Mar 4, 2022
408f83b
bugfix: generate -> generate_artificial_data
picciama Mar 4, 2022
89f1d84
added plotting submodule
picciama Mar 4, 2022
64f53ff
reformatted using black
picciama Mar 4, 2022
6c6d84f
added plotting library
picciama Mar 4, 2022
c0e6cdf
reformatted using black / isort
picciama Mar 4, 2022
00d0dcd
also check python 3.7
picciama Mar 4, 2022
18ebb90
removed python 3.7
picciama Mar 4, 2022
5396eaa
updated numpy to >=1.22.2 to pass safety checks
picciama Mar 4, 2022
6e68734
remove python 3.7
picciama Mar 4, 2022
6048230
fix flake8-bandit issue with newest version:
picciama Mar 4, 2022
Commit 365bcab662ea2aad79a7052f64a912fba6fac78e
linting using black / isort, fixed imports
picciama committed Feb 23, 2022
4 changes: 2 additions & 2 deletions batchglm/api/models/numpy/glm_nb.py
@@ -1,4 +1,4 @@
-from batchglm.models.glm_nb import Model, Simulator
+from batchglm.models.glm_nb import Model
 from batchglm.train.numpy.glm_nb import Estimator

-__all__ = ["Model", "Simulator", "Estimator"]
+__all__ = ["Model", "Estimator"]
1 change: 0 additions & 1 deletion batchglm/models/base_glm/__init__.py
@@ -1,5 +1,4 @@
 from .estimator import _EstimatorGLM
 from .input import InputDataGLM
 from .model import _ModelGLM
-from .simulator import _SimulatorGLM
 from .utils import closedform_glm_mean, closedform_glm_scale, parse_design
1 change: 1 addition & 0 deletions batchglm/models/base_glm/external.py
@@ -1,2 +1,3 @@
 import batchglm.data as data_utils
+from batchglm import pkg_constants
 from batchglm.utils.linalg import groupwise_solve_lm
145 changes: 144 additions & 1 deletion batchglm/models/base_glm/model.py
@@ -1,6 +1,6 @@
 import abc
 import logging
-from typing import Any, Dict, Iterable, Optional, Union
+from typing import Any, Callable, Dict, Iterable, Optional, Union

 import dask.array
 import numpy as np
@@ -10,7 +10,11 @@
 except ImportError:
     anndata = None

+import scipy
+
+from .external import pkg_constants
 from .input import InputDataGLM
+from .utils import generate_sample_description

 logger = logging.getLogger(__name__)

@@ -221,6 +225,145 @@ def get(self, key: Union[str, Iterable]) -> Union[Any, Dict[str, Any]]:
         attrib = {s: self.__getattribute__(s) for s in key}
         return attrib

+    def param_bounds(self, dtype):
+
+        dtype = np.dtype(dtype)
+        # dmin = np.finfo(dtype).min
+        dmax = np.finfo(dtype).max
+        dtype = dtype.type
+        sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT)
+
+        return self.bounds(sf, dmax, dtype)
+
+    @abc.abstractmethod
+    def bounds(self, sf, dmax, dtype) -> Dict[str, Any]:
+        pass
+
+    # simulator:
+
+    @abc.abstractmethod
+    def rand_fn_ave(self) -> Optional[Callable]:
+        pass
+
+    @abc.abstractmethod
+    def rand_fn(self) -> Optional[Callable]:
+        pass
+
+    @abc.abstractmethod
+    def rand_fn_loc(self) -> Optional[Callable]:
+        pass
+
+    @abc.abstractmethod
+    def rand_fn_scale(self) -> Optional[Callable]:
+        pass
+
+    def generate_params(
+        self, n_vars: int, rand_fn_ave=None, rand_fn=None, rand_fn_loc=None, rand_fn_scale=None, **kwargs
+    ):
+        """
+        Generate all necessary parameters. TODO: make this documentation better!!!
+
+        :param rand_fn_ave: Function which generates random numbers for the intercept.
+            Takes one location parameter of the intercept distribution across features.
+        :param rand_fn: Random function taking one argument `shape`.
+        :param rand_fn_loc: Random function taking one argument `shape`.
+            If not provided, `rand_fn` is used instead.
+            This function generates location model parameters in inverse linker space,
+            i.e. these parameters will be log-transformed if a log linker function is used!
+            Values below 1e-08 will be set to 1e-08 to map them into the positive support.
+        :param rand_fn_scale: Random function taking one argument `shape`.
+            If not provided, `rand_fn` is used instead.
+            This function generates scale model parameters in inverse linker space,
+            i.e. these parameters will be log-transformed if a log linker function is used!
+            Values below 1e-08 will be set to 1e-08 to map them into the positive support.
+        """
+
+        if rand_fn_ave is None:
+            rand_fn_ave = self.rand_fn_ave
+        if rand_fn_ave is None:
+            raise ValueError("rand_fn_ave must not be None!")
+        if rand_fn is None:
+            rand_fn = self.rand_fn
+        if rand_fn_loc is None:
+            rand_fn_loc = self.rand_fn_loc
+        if rand_fn_scale is None:
+            rand_fn_scale = self.rand_fn_scale
+        if rand_fn is None and rand_fn_loc is None:
+            raise ValueError("rand_fn and rand_fn_loc must not both be None!")
+        if rand_fn is None and rand_fn_scale is None:
+            raise ValueError("rand_fn and rand_fn_scale must not both be None!")
+
+        if rand_fn_loc is None:
+            rand_fn_loc = rand_fn
+        if rand_fn_scale is None:
+            rand_fn_scale = rand_fn
+
+        design_loc, design_scale, sample_description = generate_sample_description(**kwargs)
+
+        sim_theta_location = np.concatenate(
+            [
+                self.link_loc(np.expand_dims(rand_fn_ave([n_vars]), axis=0)),  # intercept
+                rand_fn_loc((design_loc.shape[1] - 1, n_vars)),
+            ],
+            axis=0,
+        )
+        sim_theta_scale = np.concatenate([rand_fn_scale((design_scale.shape[1], self.nfeatures))], axis=0)
+
+        return sim_theta_location, sim_theta_scale, design_loc, design_scale, sample_description
+
+    def generate(
+        self,
+        n_obs: int,
+        n_vars: int,
+        num_conditions: int = 2,
+        num_batches: int = 4,
+        intercept_scale: bool = False,
+        shuffle_assignments: bool = False,
+        sparse: bool = False,
+    ):
+        """
+        First generates the parameter set, then generates random observations using these parameters.
+
+        :param sparse: If True, the simulated data matrix is stored as a scipy CSR matrix.
+        """
+        (
+            sim_theta_location,
+            sim_theta_scale,
+            sim_design_loc,
+            sim_design_scale,
+            sample_description,
+        ) = self.generate_params(
+            n_vars=n_vars,
+            num_observations=n_obs,
+            num_conditions=num_conditions,
+            num_batches=num_batches,
+            intercept_scale=intercept_scale,
+            shuffle_assignments=shuffle_assignments,
+        )
+
+        data_matrix = self.generate_data()
+
+        if sparse:
+            data_matrix = scipy.sparse.csr_matrix(data_matrix)
+
+        self.input_data = InputDataGLM(
+            data=data_matrix,
+            design_loc=sim_design_loc,
+            design_scale=sim_design_scale,
+            design_loc_names=None,
+            design_scale_names=None,
+        )
+
+    @abc.abstractmethod
+    def generate_data(self):
+        """
+        Should sample random data based on the distribution and parameters.
+        """
+        pass

     def __getitem__(self, item):
         return self.get(item)
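
Taken together: generate_params draws simulation design matrices plus theta_location/theta_scale, generate samples a data matrix through the distribution-specific generate_data, and the result is wrapped in InputDataGLM. A hedged usage sketch against the negative binomial model shown later in this diff (assumes Model is default-constructible and exposes input_data as set above):

from batchglm.api.models.numpy.glm_nb import Model

model = Model()
model.generate(              # generate_params -> generate_data -> InputDataGLM
    n_obs=2000,
    n_vars=100,
    num_conditions=2,        # "condition" column, tiled over observations
    num_batches=4,           # "batch" column, repeated over observations
    sparse=True,             # store counts as scipy.sparse.csr_matrix
)
input_data = model.input_data  # InputDataGLM built from the simulated matrix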

237 changes: 0 additions & 237 deletions batchglm/models/base_glm/simulator.py

This file was deleted.

49 changes: 49 additions & 0 deletions batchglm/models/base_glm/utils.py
@@ -1,4 +1,5 @@
 import logging
+import math
 from typing import Callable, List, Optional, Tuple, Union

 import dask.array
@@ -17,6 +18,54 @@
 logger = logging.getLogger("batchglm")


+def generate_sample_description(
+    num_observations: int,
+    num_conditions: int,
+    num_batches: int,
+    intercept_scale: bool,
+    shuffle_assignments: bool,
+) -> Tuple[patsy.DesignMatrix, patsy.DesignMatrix, pd.DataFrame]:
+    """Build a sample description.
+
+    :param num_observations: Number of observations to simulate.
+    :param num_conditions: Number of conditions; repeated like [1,2,3,1,2,3].
+    :param num_batches: Number of batches; repeated like [1,1,2,2,3,3].
+    :param intercept_scale: If True, returns a single-coefficient design matrix (formula = "~1").
+        If False, returns a design matrix identical to the loc model.
+    :param shuffle_assignments: If True, shuffle the assignments.
+        UNSUPPORTED: Must be removed as it is dysfunctional!!!
+    """
+    if num_conditions == 0:
+        num_conditions = 1
+    if num_batches == 0:
+        num_batches = 1
+
+    # condition column
+    reps_conditions = math.ceil(num_observations / num_conditions)
+    conditions = np.squeeze(np.tile([np.arange(num_conditions)], reps_conditions))
+    conditions = conditions[range(num_observations)].astype(str)
+
+    # batch column
+    reps_batches = math.ceil(num_observations / num_batches)
+    batches = np.repeat(range(num_batches), reps_batches)
+    batches = batches[range(num_observations)].astype(str)
+    sample_description = pd.DataFrame({"condition": conditions, "batch": batches})
+
+    if shuffle_assignments:
+        sample_description = sample_description.isel(
+            observations=np.random.permutation(sample_description.observations.values)
+        )
+
+    sim_design_loc = patsy.dmatrix("~1+condition+batch", sample_description)
+
+    if intercept_scale:
+        sim_design_scale = patsy.dmatrix("~1", sample_description)
+    else:
+        sim_design_scale = sim_design_loc
+
+    return sim_design_loc, sim_design_scale, sample_description
+
+
 def parse_design(
     design_matrix: Union[pd.DataFrame, patsy.design_info.DesignMatrix, dask.array.core.Array, np.ndarray],
     param_names: List[str] = None,
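
A quick sanity check of the tiling/repeating logic above (a sketch; assumes the helper is importable from batchglm.models.base_glm.utils):

from batchglm.models.base_glm.utils import generate_sample_description

design_loc, design_scale, sd = generate_sample_description(
    num_observations=6,
    num_conditions=2,
    num_batches=3,
    intercept_scale=False,
    shuffle_assignments=False,
)
print(list(sd["condition"]))  # ['0', '1', '0', '1', '0', '1'] - conditions are tiled
print(list(sd["batch"]))      # ['0', '0', '1', '1', '2', '2'] - batches are repeated
print(design_loc.shape)       # (6, 4): intercept + 1 condition + 2 batch dummies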
1 change: 0 additions & 1 deletion batchglm/models/glm_beta/__init__.py
@@ -1,3 +1,2 @@
 from .external import _EstimatorGLM
 from .model import Model
-from .simulator import Simulator
2 changes: 1 addition & 1 deletion batchglm/models/glm_beta/external.py
@@ -1,4 +1,4 @@
 import batchglm.data as data_utils
 from batchglm import pkg_constants
-from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, _SimulatorGLM, closedform_glm_mean, closedform_glm_scale
+from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, closedform_glm_mean, closedform_glm_scale
 from batchglm.utils.linalg import groupwise_solve_lm
55 changes: 55 additions & 0 deletions batchglm/models/glm_beta/model.py
@@ -1,4 +1,5 @@
 import abc
+from typing import Any, Callable, Dict, Optional

 try:
     import anndata
@@ -58,3 +59,57 @@ def p(self) -> np.ndarray:
     @property
     def q(self) -> np.ndarray:
         return (1 - self.mean) * self.samplesize
+
+    # parameter constraints:
+
+    def bounds(self, sf, dmax, dtype) -> Dict[str, Any]:
+
+        zero = np.nextafter(0, np.inf, dtype=dtype)
+        one = np.nextafter(1, -np.inf, dtype=dtype)
+
+        bounds_min = {
+            "theta_location": np.log(zero / (1 - zero)) / sf,
+            "theta_scale": np.log(zero) / sf,
+            "eta_loc": np.log(zero / (1 - zero)) / sf,
+            "eta_scale": np.log(zero) / sf,
+            "mean": np.nextafter(0, np.inf, dtype=dtype),
+            "samplesize": np.nextafter(0, np.inf, dtype=dtype),
+            "probs": dtype(0),
+            "log_probs": np.log(zero),
+        }
+        bounds_max = {
+            "theta_location": np.log(one / (1 - one)) / sf,
+            "theta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "eta_loc": np.log(one / (1 - one)) / sf,
+            "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "mean": one,
+            "samplesize": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "probs": dtype(1),
+            "log_probs": dtype(0),
+        }
+
+        return bounds_min, bounds_max
+
+    # simulator:
+
+    @property
+    def rand_fn_ave(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(0.2, 0.8, shape)
+
+    @property
+    def rand_fn(self) -> Optional[Callable]:
+        return None
+
+    @property
+    def rand_fn_loc(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(0.05, 0.15, shape)
+
+    @property
+    def rand_fn_scale(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(0.2, 0.5, shape)
+
+    def generate_data(self):
+        """
+        Sample random data based on the beta distribution and parameters.
+        """
+        return np.random.beta(a=self.p, b=self.q, size=None)
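
The p/q properties parameterize numpy's beta sampler from the model's mean and samplesize. A standalone check of that parameterization, independent of batchglm (p = mean * samplesize is the presumed counterpart of the q property shown above):

import numpy as np

mean, samplesize = 0.3, 20.0                      # illustrative values
p = mean * samplesize                             # presumed definition of the p property
q = (1 - mean) * samplesize                       # matches the q property above
draws = np.random.beta(a=p, b=q, size=100_000)
print(draws.mean())                               # ~0.3, since Beta(p, q) has mean p / (p + q) = mean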
69 changes: 0 additions & 69 deletions batchglm/models/glm_beta/simulator.py

This file was deleted.

1 change: 0 additions & 1 deletion batchglm/models/glm_nb/__init__.py
@@ -1,3 +1,2 @@
 from .external import _EstimatorGLM
 from .model import Model
-from .simulator import Simulator
2 changes: 1 addition & 1 deletion batchglm/models/glm_nb/external.py
@@ -1,4 +1,4 @@
 import batchglm.data as data_utils
 from batchglm import pkg_constants
-from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, _SimulatorGLM, closedform_glm_mean, closedform_glm_scale
+from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, closedform_glm_mean, closedform_glm_scale
 from batchglm.utils.linalg import groupwise_solve_lm
52 changes: 51 additions & 1 deletion batchglm/models/glm_nb/model.py
@@ -4,7 +4,7 @@
     import anndata
 except ImportError:
     anndata = None
-from typing import Union
+from typing import Any, Callable, Dict, Optional, Union

 import dask.array
 import numpy as np
@@ -56,3 +56,53 @@ def mu(self) -> np.ndarray:
     @property
     def phi(self) -> np.ndarray:
         return self.scale
+
+    # param constraints:
+
+    def bounds(self, sf, dmax, dtype) -> Dict[str, Any]:
+
+        bounds_min = {
+            "theta_location": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "theta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "eta_loc": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "loc": np.nextafter(0, np.inf, dtype=dtype),
+            "scale": np.nextafter(0, np.inf, dtype=dtype),
+            "likelihood": dtype(0),
+            "ll": np.log(np.nextafter(0, np.inf, dtype=dtype)),
+        }
+        bounds_max = {
+            "theta_location": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "theta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "eta_loc": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "scale": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "likelihood": dtype(1),
+            "ll": dtype(0),
+        }
+        return bounds_min, bounds_max
+
+    # simulator:
+
+    @property
+    def rand_fn_ave(self) -> Optional[Callable]:
+        return lambda shape: np.random.poisson(500, shape) + 1
+
+    @property
+    def rand_fn(self) -> Optional[Callable]:
+        return lambda shape: np.abs(np.random.uniform(0.5, 2, shape))
+
+    @property
+    def rand_fn_loc(self) -> Optional[Callable]:
+        return None
+
+    @property
+    def rand_fn_scale(self) -> Optional[Callable]:
+        return None
+
+    def generate_data(self):
+        """
+        Sample random data based on negative binomial distribution and parameters.
+        """
+        return np.random.negative_binomial(n=self.phi, p=1 - self.mu / (self.phi + self.mu), size=None)
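
numpy's negative_binomial uses the (n, p) parameterization, so with n = phi and p = 1 - mu / (phi + mu) (equivalently phi / (phi + mu)) a draw has mean n(1 - p)/p = mu and variance mu + mu^2/phi. A standalone check with illustrative values, independent of batchglm:

import numpy as np

mu, phi = 50.0, 2.0                               # illustrative location / dispersion
p = 1 - mu / (phi + mu)                           # == phi / (phi + mu)
draws = np.random.negative_binomial(n=phi, p=p, size=200_000)
print(draws.mean())                               # ~50.0 == mu
print(draws.var())                                # ~mu + mu**2 / phi = 1300 (overdispersed)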
67 changes: 0 additions & 67 deletions batchglm/models/glm_nb/simulator.py

This file was deleted.

1 change: 0 additions & 1 deletion batchglm/models/glm_norm/__init__.py
@@ -1,3 +1,2 @@
 from .external import _EstimatorGLM
 from .model import Model
-from .simulator import Simulator
2 changes: 1 addition & 1 deletion batchglm/models/glm_norm/external.py
@@ -1,4 +1,4 @@
 import batchglm.data as data_utils
 from batchglm import pkg_constants
-from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, _SimulatorGLM, closedform_glm_mean, closedform_glm_scale
+from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, closedform_glm_mean, closedform_glm_scale
 from batchglm.utils.linalg import groupwise_solve_lm
51 changes: 51 additions & 0 deletions batchglm/models/glm_norm/model.py
@@ -1,4 +1,5 @@
 import abc
+from typing import Any, Callable, Dict, Optional

 try:
     import anndata
@@ -104,3 +105,53 @@ def sd(self) -> np.ndarray:
         """
         return self.scale
+
+    # param constraints:
+
+    def bounds(self, sf, dmax, dtype) -> Dict[str, Any]:
+
+        bounds_min = {
+            "theta_location": np.nextafter(-dmax, np.inf, dtype=dtype) / sf,
+            "theta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "eta_loc": np.nextafter(-dmax, np.inf, dtype=dtype) / sf,
+            "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "mean": np.nextafter(-dmax, np.inf, dtype=dtype) / sf,
+            "sd": np.nextafter(0, np.inf, dtype=dtype),
+            "probs": dtype(0),
+            "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)),
+        }
+        bounds_max = {
+            "theta_location": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "theta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "eta_loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "mean": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "sd": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "probs": dtype(1),
+            "log_probs": dtype(0),
+        }
+        return bounds_min, bounds_max
+
+    # simulator:
+
+    @property
+    def rand_fn_ave(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(10, 1000, shape)
+
+    @property
+    def rand_fn(self) -> Optional[Callable]:
+        return None
+
+    @property
+    def rand_fn_loc(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(50, 100, shape)
+
+    @property
+    def rand_fn_scale(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(1.5, 10, shape)
+
+    def generate_data(self):
+        """
+        Sample random data based on normal distribution and parameters.
+        """
+        return np.random.normal(loc=self.mean, scale=self.sd, size=None)
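
With size=None, np.random.normal draws one value per element of the broadcast loc/scale arrays, so per-entry mean and sd arrays of shape (n_obs, n_vars) yield one observation matrix of the same shape. A standalone illustration of that broadcasting behavior (illustrative shapes and ranges, not from the PR):

import numpy as np

loc = np.random.uniform(10, 1000, size=(5, 3))   # per-entry means
sd = np.random.uniform(1.5, 10, size=(5, 3))     # per-entry standard deviations
x = np.random.normal(loc=loc, scale=sd, size=None)
print(x.shape)                                   # (5, 3): one draw per (obs, var) entry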
65 changes: 0 additions & 65 deletions batchglm/models/glm_norm/simulator.py

This file was deleted.