Cleanup API / models / simulators / estimators / unittests #130

Merged
37 commits merged on Mar 7, 2022

Commits (37)
365bcab
linting using black / isort, fixed imports
picciama Feb 23, 2022
425f0a6
separate properties from input_data
picciama Feb 23, 2022
1535856
major refactoring removing train/models:
picciama Feb 24, 2022
eb1e551
refactoring numpy estimator:
picciama Feb 25, 2022
fd06d26
cleanup: working fitting procedure:
picciama Feb 25, 2022
b25b315
code reformatting with black / isort and linting
picciama Feb 25, 2022
da23c4f
remove unnecessary compute()
picciama Feb 25, 2022
23ee162
add missing decorators
picciama Feb 25, 2022
01a4574
override theta_loc/scale in training
picciama Feb 28, 2022
fd50d5c
unittests cleanup
picciama Mar 1, 2022
d5b0721
support InputDataGLM when simulating
picciama Mar 1, 2022
66fe624
removed @dask_compute
picciama Mar 1, 2022
784c28e
reintroduced compute()
picciama Mar 1, 2022
f1ae683
cleanup, fix imports and syntax errors
picciama Mar 1, 2022
a3cb3c4
fixed imports, api and type annotations using mypy
picciama Mar 1, 2022
8385c87
reformatting using black
picciama Mar 1, 2022
c186b3d
cleanup using flake8, mypy, black, isort
picciama Mar 1, 2022
2e2c755
added type hints everywhere / fixed mypy issues
picciama Mar 3, 2022
cfaf056
reformatting using black / isort
picciama Mar 3, 2022
04d0e5f
Merge branch 'development' into mp/integrate_simulator
picciama Mar 3, 2022
6a76eb8
reformatting using black / isort
picciama Mar 3, 2022
611e2af
corrected data utils import
picciama Mar 4, 2022
dc63263
better documentation for simulation functions
picciama Mar 4, 2022
b493b54
refactored modelContainer.py -> model_container.py
picciama Mar 4, 2022
12e5909
make model_container attribute in parent estimator
picciama Mar 4, 2022
3767ea4
flake8 / black / isort reformatting
picciama Mar 4, 2022
bd4a619
bugfix: correctly set attribute model_container
picciama Mar 4, 2022
408f83b
bugfix: generate -> generate_artificial_data
picciama Mar 4, 2022
89f1d84
added plotting submodule
picciama Mar 4, 2022
64f53ff
reformatted using black
picciama Mar 4, 2022
6c6d84f
added plotting library
picciama Mar 4, 2022
c0e6cdf
reformatted using black / isort
picciama Mar 4, 2022
00d0dcd
also check python 3.7
picciama Mar 4, 2022
18ebb90
removed python 3.7
picciama Mar 4, 2022
5396eaa
updated numpy to >=1.22.2 to pass safety checks
picciama Mar 4, 2022
6e68734
remove python 3.7
picciama Mar 4, 2022
6048230
fix flake8-bandit issue with newest version:
picciama Mar 4, 2022
Commit 365bcab662ea2aad79a7052f64a912fba6fac78e
linting using black / isort, fixed imports
picciama committed Feb 23, 2022
4 changes: 2 additions & 2 deletions batchglm/api/models/numpy/glm_nb.py
@@ -1,4 +1,4 @@
-from batchglm.models.glm_nb import Model, Simulator
+from batchglm.models.glm_nb import Model
 from batchglm.train.numpy.glm_nb import Estimator

-__all__ = ["Model", "Simulator", "Estimator"]
+__all__ = ["Model", "Estimator"]
1 change: 0 additions & 1 deletion batchglm/models/base_glm/__init__.py
@@ -1,5 +1,4 @@
 from .estimator import _EstimatorGLM
 from .input import InputDataGLM
 from .model import _ModelGLM
-from .simulator import _SimulatorGLM
 from .utils import closedform_glm_mean, closedform_glm_scale, parse_design
1 change: 1 addition & 0 deletions batchglm/models/base_glm/external.py
@@ -1,2 +1,3 @@
 import batchglm.data as data_utils
+from batchglm import pkg_constants
 from batchglm.utils.linalg import groupwise_solve_lm
145 changes: 144 additions & 1 deletion batchglm/models/base_glm/model.py
@@ -1,6 +1,6 @@
 import abc
 import logging
-from typing import Any, Dict, Iterable, Optional, Union
+from typing import Any, Callable, Dict, Iterable, Optional, Union

 import dask.array
 import numpy as np
@@ -10,7 +10,11 @@
 except ImportError:
     anndata = None

+import scipy
+
+from .external import pkg_constants
 from .input import InputDataGLM
+from .utils import generate_sample_description

 logger = logging.getLogger(__name__)

@@ -221,6 +225,145 @@ def get(self, key: Union[str, Iterable]) -> Union[Any, Dict[str, Any]]:
         attrib = {s: self.__getattribute__(s) for s in key}
         return attrib

+    def param_bounds(self, dtype):
+
+        dtype = np.dtype(dtype)
+        # dmin = np.finfo(dtype).min
+        dmax = np.finfo(dtype).max
+        dtype = dtype.type
+        sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT)
+
+        return self.bounds(sf, dmax, dtype)
+
+    @abc.abstractmethod
+    def bounds(self, sf, dmax, dtype) -> Dict[str, Any]:
+        pass
+
+    # simulator:
+
+    @abc.abstractmethod
+    def rand_fn_ave(self) -> Optional[Callable]:
+        pass
+
+    @abc.abstractmethod
+    def rand_fn(self) -> Optional[Callable]:
+        pass
+
+    @abc.abstractmethod
+    def rand_fn_loc(self) -> Optional[Callable]:
+        pass
+
+    @abc.abstractmethod
+    def rand_fn_scale(self) -> Optional[Callable]:
+        pass
+
+    def generate_params(
+        self, n_vars: int, rand_fn_ave=None, rand_fn=None, rand_fn_loc=None, rand_fn_scale=None, **kwargs
+    ):
+        """
+        Generate all necessary parameters. TODO: make this documentation better!!!
+
+        :param rand_fn_ave: Function which generates random numbers for the intercept.
+            Takes one location parameter of the intercept distribution across features.
+        :param rand_fn: Random function taking one argument `shape`.
+        :param rand_fn_loc: Random function taking one argument `shape`.
+            If not provided, `rand_fn` is used instead.
+            This function generates location model parameters in inverse linker space,
+            i.e. these parameters will be log-transformed if a log linker function is used!
+            Values below 1e-08 will be set to 1e-08 to map them into the positive support.
+        :param rand_fn_scale: Random function taking one argument `shape`.
+            If not provided, `rand_fn` is used instead.
+            This function generates scale model parameters in inverse linker space,
+            i.e. these parameters will be log-transformed if a log linker function is used!
+            Values below 1e-08 will be set to 1e-08 to map them into the positive support.
+        """
+
+        if rand_fn_ave is None:
+            rand_fn_ave = self.rand_fn_ave
+        if rand_fn_ave is None:
+            raise ValueError("rand_fn_ave must not be None!")
+        if rand_fn is None:
+            rand_fn = self.rand_fn
+        if rand_fn_loc is None:
+            rand_fn_loc = self.rand_fn_loc
+        if rand_fn_scale is None:
+            rand_fn_scale = self.rand_fn_scale
+        if rand_fn is None and rand_fn_loc is None:
+            raise ValueError("rand_fn and rand_fn_loc must not both be None!")
+        if rand_fn is None and rand_fn_scale is None:
+            raise ValueError("rand_fn and rand_fn_scale must not both be None!")
+
+        if rand_fn_loc is None:
+            rand_fn_loc = rand_fn
+        if rand_fn_scale is None:
+            rand_fn_scale = rand_fn
+
+        design_loc, design_scale, sample_description = generate_sample_description(**kwargs)
+
+        sim_theta_location = np.concatenate(
+            [
+                self.link_loc(np.expand_dims(rand_fn_ave([n_vars]), axis=0)),  # intercept
+                rand_fn_loc((design_loc.shape[1] - 1, n_vars)),
+            ],
+            axis=0,
+        )
+        sim_theta_scale = np.concatenate([rand_fn_scale((design_scale.shape[1], self.nfeatures))], axis=0)
+
+        return sim_theta_location, sim_theta_scale, design_loc, design_scale, sample_description
+
+    def generate(
+        self,
+        n_obs: int,
+        n_vars: int,
+        num_conditions: int = 2,
+        num_batches: int = 4,
+        intercept_scale: bool = False,
+        shuffle_assignments: bool = False,
+        sparse: bool = False,
+    ):
+        """
+        First generates the parameter set, then generates random observations using these parameters.
+
+        :param sparse: If True, the simulated data matrix is stored as a scipy CSR matrix.
+        """
+        (
+            sim_theta_location,
+            sim_theta_scale,
+            sim_design_loc,
+            sim_design_scale,
+            sample_description,
+        ) = self.generate_params(
+            n_vars=n_vars,
+            num_observations=n_obs,
+            num_conditions=num_conditions,
+            num_batches=num_batches,
+            intercept_scale=intercept_scale,
+            shuffle_assignments=shuffle_assignments,
+        )
+
+        data_matrix = self.generate_data()
+
+        if sparse:
+            data_matrix = scipy.sparse.csr_matrix(data_matrix)
+
+        self.input_data = InputDataGLM(
+            data=data_matrix,
+            design_loc=sim_design_loc,
+            design_scale=sim_design_scale,
+            design_loc_names=None,
+            design_scale_names=None,
+        )
+
+    @abc.abstractmethod
+    def generate_data(self):
+        """
+        Should sample random data based on the distribution and parameters.
+        """
+        pass

     def __getitem__(self, item):
         return self.get(item)
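
Taken together: generate_params draws simulation design matrices plus theta_location/theta_scale, generate samples a data matrix through the distribution-specific generate_data, and the result is wrapped in InputDataGLM. A hedged usage sketch against the negative binomial model shown later in this diff (assumes Model is default-constructible and exposes input_data as set above):

from batchglm.api.models.numpy.glm_nb import Model

model = Model()
model.generate(              # generate_params -> generate_data -> InputDataGLM
    n_obs=2000,
    n_vars=100,
    num_conditions=2,        # "condition" column, tiled over observations
    num_batches=4,           # "batch" column, repeated over observations
    sparse=True,             # store counts as scipy.sparse.csr_matrix
)
input_data = model.input_data  # InputDataGLM built from the simulated matrix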

237 changes: 0 additions & 237 deletions batchglm/models/base_glm/simulator.py

This file was deleted.

49 changes: 49 additions & 0 deletions batchglm/models/base_glm/utils.py
@@ -1,4 +1,5 @@
 import logging
+import math
 from typing import Callable, List, Optional, Tuple, Union

 import dask.array
@@ -17,6 +18,54 @@
 logger = logging.getLogger("batchglm")


+def generate_sample_description(
+    num_observations: int,
+    num_conditions: int,
+    num_batches: int,
+    intercept_scale: bool,
+    shuffle_assignments: bool,
+) -> Tuple[patsy.DesignMatrix, patsy.DesignMatrix, pd.DataFrame]:
+    """Build a sample description.
+
+    :param num_observations: Number of observations to simulate.
+    :param num_conditions: Number of conditions; repeated like [1,2,3,1,2,3].
+    :param num_batches: Number of batches; repeated like [1,1,2,2,3,3].
+    :param intercept_scale: If True, returns a single-coefficient design matrix (formula = "~1").
+        If False, returns a design matrix identical to the loc model.
+    :param shuffle_assignments: If True, shuffle the assignments.
+        UNSUPPORTED: Must be removed as it is dysfunctional!!!
+    """
+    if num_conditions == 0:
+        num_conditions = 1
+    if num_batches == 0:
+        num_batches = 1
+
+    # condition column
+    reps_conditions = math.ceil(num_observations / num_conditions)
+    conditions = np.squeeze(np.tile([np.arange(num_conditions)], reps_conditions))
+    conditions = conditions[range(num_observations)].astype(str)
+
+    # batch column
+    reps_batches = math.ceil(num_observations / num_batches)
+    batches = np.repeat(range(num_batches), reps_batches)
+    batches = batches[range(num_observations)].astype(str)
+    sample_description = pd.DataFrame({"condition": conditions, "batch": batches})
+
+    if shuffle_assignments:
+        sample_description = sample_description.isel(
+            observations=np.random.permutation(sample_description.observations.values)
+        )
+
+    sim_design_loc = patsy.dmatrix("~1+condition+batch", sample_description)
+
+    if intercept_scale:
+        sim_design_scale = patsy.dmatrix("~1", sample_description)
+    else:
+        sim_design_scale = sim_design_loc
+
+    return sim_design_loc, sim_design_scale, sample_description
+
+
 def parse_design(
     design_matrix: Union[pd.DataFrame, patsy.design_info.DesignMatrix, dask.array.core.Array, np.ndarray],
     param_names: List[str] = None,
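
A quick sanity check of the tiling/repeating logic above (a sketch; assumes the helper is importable from batchglm.models.base_glm.utils):

from batchglm.models.base_glm.utils import generate_sample_description

design_loc, design_scale, sd = generate_sample_description(
    num_observations=6,
    num_conditions=2,
    num_batches=3,
    intercept_scale=False,
    shuffle_assignments=False,
)
print(list(sd["condition"]))  # ['0', '1', '0', '1', '0', '1'] - conditions are tiled
print(list(sd["batch"]))      # ['0', '0', '1', '1', '2', '2'] - batches are repeated
print(design_loc.shape)       # (6, 4): intercept + 1 condition + 2 batch dummies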
1 change: 0 additions & 1 deletion batchglm/models/glm_beta/__init__.py
@@ -1,3 +1,2 @@
 from .external import _EstimatorGLM
 from .model import Model
-from .simulator import Simulator
2 changes: 1 addition & 1 deletion batchglm/models/glm_beta/external.py
@@ -1,4 +1,4 @@
 import batchglm.data as data_utils
 from batchglm import pkg_constants
-from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, _SimulatorGLM, closedform_glm_mean, closedform_glm_scale
+from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, closedform_glm_mean, closedform_glm_scale
 from batchglm.utils.linalg import groupwise_solve_lm
55 changes: 55 additions & 0 deletions batchglm/models/glm_beta/model.py
@@ -1,4 +1,5 @@
 import abc
+from typing import Any, Callable, Dict, Optional

 try:
     import anndata
@@ -58,3 +59,57 @@ def p(self) -> np.ndarray:
     @property
     def q(self) -> np.ndarray:
         return (1 - self.mean) * self.samplesize
+
+    # parameter constraints:
+
+    def bounds(self, sf, dmax, dtype) -> Dict[str, Any]:
+
+        zero = np.nextafter(0, np.inf, dtype=dtype)
+        one = np.nextafter(1, -np.inf, dtype=dtype)
+
+        bounds_min = {
+            "theta_location": np.log(zero / (1 - zero)) / sf,
+            "theta_scale": np.log(zero) / sf,
+            "eta_loc": np.log(zero / (1 - zero)) / sf,
+            "eta_scale": np.log(zero) / sf,
+            "mean": np.nextafter(0, np.inf, dtype=dtype),
+            "samplesize": np.nextafter(0, np.inf, dtype=dtype),
+            "probs": dtype(0),
+            "log_probs": np.log(zero),
+        }
+        bounds_max = {
+            "theta_location": np.log(one / (1 - one)) / sf,
+            "theta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "eta_loc": np.log(one / (1 - one)) / sf,
+            "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "mean": one,
+            "samplesize": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "probs": dtype(1),
+            "log_probs": dtype(0),
+        }
+
+        return bounds_min, bounds_max
+
+    # simulator:
+
+    @property
+    def rand_fn_ave(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(0.2, 0.8, shape)
+
+    @property
+    def rand_fn(self) -> Optional[Callable]:
+        return None
+
+    @property
+    def rand_fn_loc(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(0.05, 0.15, shape)
+
+    @property
+    def rand_fn_scale(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(0.2, 0.5, shape)
+
+    def generate_data(self):
+        """
+        Sample random data based on the beta distribution and parameters.
+        """
+        return np.random.beta(a=self.p, b=self.q, size=None)
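
The p/q properties parameterize numpy's beta sampler from the model's mean and samplesize. A standalone check of that parameterization, independent of batchglm (p = mean * samplesize is the presumed counterpart of the q property shown above):

import numpy as np

mean, samplesize = 0.3, 20.0                      # illustrative values
p = mean * samplesize                             # presumed definition of the p property
q = (1 - mean) * samplesize                       # matches the q property above
draws = np.random.beta(a=p, b=q, size=100_000)
print(draws.mean())                               # ~0.3, since Beta(p, q) has mean p / (p + q) = mean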
69 changes: 0 additions & 69 deletions batchglm/models/glm_beta/simulator.py

This file was deleted.

1 change: 0 additions & 1 deletion batchglm/models/glm_nb/__init__.py
@@ -1,3 +1,2 @@
 from .external import _EstimatorGLM
 from .model import Model
-from .simulator import Simulator
2 changes: 1 addition & 1 deletion batchglm/models/glm_nb/external.py
@@ -1,4 +1,4 @@
 import batchglm.data as data_utils
 from batchglm import pkg_constants
-from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, _SimulatorGLM, closedform_glm_mean, closedform_glm_scale
+from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, closedform_glm_mean, closedform_glm_scale
 from batchglm.utils.linalg import groupwise_solve_lm
52 changes: 51 additions & 1 deletion batchglm/models/glm_nb/model.py
@@ -4,7 +4,7 @@
     import anndata
 except ImportError:
     anndata = None
-from typing import Union
+from typing import Any, Callable, Dict, Optional, Union

 import dask.array
 import numpy as np
@@ -56,3 +56,53 @@ def mu(self) -> np.ndarray:
     @property
     def phi(self) -> np.ndarray:
         return self.scale
+
+    # param constraints:
+
+    def bounds(self, sf, dmax, dtype) -> Dict[str, Any]:
+
+        bounds_min = {
+            "theta_location": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "theta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "eta_loc": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "loc": np.nextafter(0, np.inf, dtype=dtype),
+            "scale": np.nextafter(0, np.inf, dtype=dtype),
+            "likelihood": dtype(0),
+            "ll": np.log(np.nextafter(0, np.inf, dtype=dtype)),
+        }
+        bounds_max = {
+            "theta_location": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "theta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "eta_loc": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "scale": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "likelihood": dtype(1),
+            "ll": dtype(0),
+        }
+        return bounds_min, bounds_max
+
+    # simulator:
+
+    @property
+    def rand_fn_ave(self) -> Optional[Callable]:
+        return lambda shape: np.random.poisson(500, shape) + 1
+
+    @property
+    def rand_fn(self) -> Optional[Callable]:
+        return lambda shape: np.abs(np.random.uniform(0.5, 2, shape))
+
+    @property
+    def rand_fn_loc(self) -> Optional[Callable]:
+        return None
+
+    @property
+    def rand_fn_scale(self) -> Optional[Callable]:
+        return None
+
+    def generate_data(self):
+        """
+        Sample random data based on negative binomial distribution and parameters.
+        """
+        return np.random.negative_binomial(n=self.phi, p=1 - self.mu / (self.phi + self.mu), size=None)
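
numpy's negative_binomial uses the (n, p) parameterization, so with n = phi and p = 1 - mu / (phi + mu) (equivalently phi / (phi + mu)) a draw has mean n(1 - p)/p = mu and variance mu + mu^2/phi. A standalone check with illustrative values, independent of batchglm:

import numpy as np

mu, phi = 50.0, 2.0                               # illustrative location / dispersion
p = 1 - mu / (phi + mu)                           # == phi / (phi + mu)
draws = np.random.negative_binomial(n=phi, p=p, size=200_000)
print(draws.mean())                               # ~50.0 == mu
print(draws.var())                                # ~mu + mu**2 / phi = 1300 (overdispersed)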
67 changes: 0 additions & 67 deletions batchglm/models/glm_nb/simulator.py

This file was deleted.

1 change: 0 additions & 1 deletion batchglm/models/glm_norm/__init__.py
@@ -1,3 +1,2 @@
 from .external import _EstimatorGLM
 from .model import Model
-from .simulator import Simulator
2 changes: 1 addition & 1 deletion batchglm/models/glm_norm/external.py
@@ -1,4 +1,4 @@
 import batchglm.data as data_utils
 from batchglm import pkg_constants
-from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, _SimulatorGLM, closedform_glm_mean, closedform_glm_scale
+from batchglm.models.base_glm import _EstimatorGLM, _ModelGLM, closedform_glm_mean, closedform_glm_scale
 from batchglm.utils.linalg import groupwise_solve_lm
51 changes: 51 additions & 0 deletions batchglm/models/glm_norm/model.py
@@ -1,4 +1,5 @@
 import abc
+from typing import Any, Callable, Dict, Optional

 try:
     import anndata
@@ -104,3 +105,53 @@ def sd(self) -> np.ndarray:
         """
         return self.scale
+
+    # param constraints:
+
+    def bounds(self, sf, dmax, dtype) -> Dict[str, Any]:
+
+        bounds_min = {
+            "theta_location": np.nextafter(-dmax, np.inf, dtype=dtype) / sf,
+            "theta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "eta_loc": np.nextafter(-dmax, np.inf, dtype=dtype) / sf,
+            "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
+            "mean": np.nextafter(-dmax, np.inf, dtype=dtype) / sf,
+            "sd": np.nextafter(0, np.inf, dtype=dtype),
+            "probs": dtype(0),
+            "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)),
+        }
+        bounds_max = {
+            "theta_location": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "theta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "eta_loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
+            "mean": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "sd": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
+            "probs": dtype(1),
+            "log_probs": dtype(0),
+        }
+        return bounds_min, bounds_max
+
+    # simulator:
+
+    @property
+    def rand_fn_ave(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(10, 1000, shape)
+
+    @property
+    def rand_fn(self) -> Optional[Callable]:
+        return None
+
+    @property
+    def rand_fn_loc(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(50, 100, shape)
+
+    @property
+    def rand_fn_scale(self) -> Optional[Callable]:
+        return lambda shape: np.random.uniform(1.5, 10, shape)
+
+    def generate_data(self):
+        """
+        Sample random data based on normal distribution and parameters.
+        """
+        return np.random.normal(loc=self.mean, scale=self.sd, size=None)
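
With size=None, np.random.normal draws one value per element of the broadcast loc/scale arrays, so per-entry mean and sd arrays of shape (n_obs, n_vars) yield one observation matrix of the same shape. A standalone illustration of that broadcasting behavior (illustrative shapes and ranges, not from the PR):

import numpy as np

loc = np.random.uniform(10, 1000, size=(5, 3))   # per-entry means
sd = np.random.uniform(1.5, 10, size=(5, 3))     # per-entry standard deviations
x = np.random.normal(loc=loc, scale=sd, size=None)
print(x.shape)                                   # (5, 3): one draw per (obs, var) entry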
65 changes: 0 additions & 65 deletions batchglm/models/glm_norm/simulator.py

This file was deleted.