diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index fb5e6573..00000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,4 +0,0 @@ -## [2.3.2] - 2019-04-01 -### Added -- Added this CHANGELOG at version 2.3.2 of lama - diff --git a/__init__.py b/__init__.py index e69de29b..374d6d4b 100644 --- a/__init__.py +++ b/__init__.py @@ -0,0 +1 @@ +from lama.version import __version__ \ No newline at end of file diff --git a/lama/__init__.py b/lama/__init__.py index ef6522a4..b30fd35d 100755 --- a/lama/__init__.py +++ b/lama/__init__.py @@ -4,5 +4,6 @@ # When running from a docker image, get much warnings from sklearn, pandas etc. Turn it off warnings.filterwarnings("ignore") +from .version import __version__ #matplotlib.use('Agg') diff --git a/lama/common.py b/lama/common.py index 32fe7cfc..cd4ab417 100755 --- a/lama/common.py +++ b/lama/common.py @@ -24,6 +24,10 @@ import pandas as pd import psutil import argparse +try: + import git +except ImportError: + git = None import yaml import toml @@ -264,6 +268,18 @@ def read_array( path: Union[str, Path]): return sitk.GetArrayFromImage(sitk.ReadImage(path)) +def read_spec_csv(path: Union[Path, str]) -> pd.DataFrame: + """ + Read a CSV containing specimen data (such as organ or whole mask volumes). + Force index to be str type + + TODO: Should we enforce column header to be str as well? + """ + df = pd.read_csv(path, index_col=0) + df.index = df.index.astype(str) + return df + + def img_path_to_array(img_path: Union[str, Path]): if os.path.isfile(img_path): try: @@ -288,7 +304,7 @@ def git_log() -> str: the git branch, commit, and message """ this_dir = Path(__file__).parent.resolve() - git_msg_file = this_dir / 'current_commit' + git_msg_file = this_dir / 'current_commit1' try: msg = '' @@ -296,6 +312,21 @@ def git_log() -> str: for line in fh: msg += line except OSError: + # current_commit file does not exist (This would come from pip install). 
+ # So try using git + if git: + try: + this_module = Path(__file__).parent + repo = git.Repo(search_parent_directories=True, path=this_module) + sha = repo.head.object.hexsha[:7] + msg = f'Git commit: {sha}' + # Kyle -if the git commit can not be determined, for example + # running python3 setup.py install --user installs into site packages + # stuffing the git commit up - you get the error below and stops LAMA + # from running - hence the extra try except + except git.exc.InvalidGitRepositoryError: + pass + if not msg: msg = f'Cannot determine git commit' return msg @@ -363,7 +394,7 @@ def load_label_map_names(organ_names_path, include_terms=False): df = pd.read_csv(organ_names_path) # Drop clear label, if present - if df.iloc[0].label == 0: + if df.iloc[0].label_num == 0: df.drop(0, inplace=True) # Check required columns are present @@ -779,6 +810,31 @@ def csv_read_dict(path): return lines +def gather_rad_data(_dir): + file_names = [spec for spec in get_file_paths(folder=_dir, extension_tuple=".csv")] + file_names.sort() + data = [pd.read_csv(spec, index_col=0).dropna(axis=1) for spec in file_names] + abnormal_embs = ['22300_e8', '22300_e6', '50_e5'] + for i, df in enumerate(data): + df.index.name = 'org' + df.name = str(file_names[i]).split(".")[0].split("/")[-1] + df['genotype'] = 'HET' if 'het' in str(file_names[i]) else 'WT' + df['background'] = 'C57BL6N' if (('b6ku' in str(file_names[i])) | ('BL6' in str(file_names[i]))) else \ + 'F1' if ('F1' in str(file_names[i])) else 'C3HHEH' + df['HPE'] = 'abnormal' if any(map(str(file_names[i]).__contains__, abnormal_embs)) else 'normal' + data = pd.concat(data, + ignore_index=False, keys=[os.path.splitext(os.path.basename(spec))[0] for spec in file_names], + names=['specimen', 'org']) + + line_file = _dir.parent / "full_results.csv" + org_dir = _dir.parent / "organs" + os.makedirs(org_dir, exist_ok=True) + for org in data.index.get_level_values('org').unique(): + data[data.index.get_level_values('org') == org].to_csv(str(org_dir) + "/results_" + str(org) + ".csv") + + + + def select_subset(paths, subset_ids): """ Trim the files found in the wildtype input directory to thise in the optional subset list file @@ -855,7 +911,13 @@ def strip_img_extension(file_name): return stripped else: return file_name - +# +# def write_dir_doc(dir_, name, msg): +# """ +# Write a log file into an output directory to give user some info on what is in there +# """ +# with open(dir_ / name, 'w') as fh: +# fh.write(msg) def test_installation(app): try: @@ -955,7 +1017,7 @@ def cfg_load(cfg) -> Dict: """ There are 2 types of config file used in the project yaml an toml. 
Will move to al tml at some point - This function wraps around both and helps with + This function wraps around both Returns ------- @@ -968,11 +1030,9 @@ def cfg_load(cfg) -> Dict: if Path(cfg).suffix == '.yaml': - # If pyyaml version >= 5.1 will get a warning about using explicit loader 'yaml.load(cfg, loader=yaml.Loder) - # But this is OK to ingnore try: with open(cfg, 'r') as fh: - return yaml.load(fh) + return yaml.load(fh, Loader=yaml.FullLoader) except Exception as e: raise ValueError("can't read the config file - {}".format(e)) diff --git a/lama/elastix/__init__.py b/lama/elastix/__init__.py index e33d57d3..140bd5d4 100644 --- a/lama/elastix/__init__.py +++ b/lama/elastix/__init__.py @@ -5,10 +5,10 @@ LOG_FILE = 'inversion.log' TRANSFORMIX_OUT_NAME = 'result.nrrd' INVERSION_DIR_NAME = 'Inverted_transform_parameters' -LABEL_INVERTED_TRANFORM = 'labelInvertedTransform.txt' -IMAGE_INVERTED_TRANSFORM = 'ImageInvertedTransform.txt' +PROPAGATE_LABEL_TRANFORM = 'labelInvertedTransform.txt' +PROPAGATE_IMAGE_TRANSFORM = 'ImageInvertedTransform.txt' VOLUME_CALCULATIONS_FILENAME = "organvolumes.csv" -INVERT_CONFIG = 'invert.yaml' +PROPAGATE_CONFIG = 'propagate.yaml' REG_DIR_ORDER_CFG = 'reg_order.txt' RESOLUTION_IMGS_DIR = 'resolution_images' # When reading images from dir and subdirs, ignore images in this folder TRANSFORMIX_OUT = 'result.nrrd' # This will not be correct if filetype is not nrrd diff --git a/lama/elastix/deformations.py b/lama/elastix/deformations.py index dd338255..b381564e 100755 --- a/lama/elastix/deformations.py +++ b/lama/elastix/deformations.py @@ -2,7 +2,7 @@ """ -Given a sequence of deformation fields, generate a mean deformation field and jacobian determinant file +Generate jacobians and deformation fields from a LAMA registration run """ from lama import common @@ -149,7 +149,7 @@ def _generate_deformation_fields(registration_dirs: List, shutil.copy(elastix_tform_file, temp_transform_file) transform_params.append(temp_transform_file) - _modfy_tforms(transform_params) # Add the InitialtransformParamtere line + _chain_tforms(transform_params) # Add the InitialtransformParamtere line else: # The resolutdeformation_dirion paramter files are numbered from 0 but the config counts from 1 @@ -166,7 +166,7 @@ def _generate_deformation_fields(registration_dirs: List, shutil.copy(elastix_tform_file, temp_transform_file) transform_params.append(temp_transform_file) - _modfy_tforms(transform_params) # Add the InitialtransformParamtere line + _chain_tforms(transform_params) # Add the InitialtransformParamtere line # Copy the tp files into the temp directory and then modify to add initail transform @@ -176,10 +176,10 @@ def _generate_deformation_fields(registration_dirs: List, return neg_jac_array -def _modfy_tforms(tforms: List): +def _chain_tforms(tforms: List): """ Add the initial paramter file paths to the tform files - Wedon't use this now as all the transforms are merged into one by elastix + We don't use this now as all the transforms are merged into one by elastix :return: """ if len(tforms) < 2: # Cannot have initial tform file as we need at least 2 diff --git a/lama/elastix/invert.py.bak b/lama/elastix/invert.py.bak deleted file mode 100755 index 55f456d5..00000000 --- a/lama/elastix/invert.py.bak +++ /dev/null @@ -1,891 +0,0 @@ -#!/usr/bin/env python - -# -*- coding: utf-8 -*- - -"""invert_volumes.py - -This module inverts registrations performed with elastix - -Example -------- - - $ invert_values.py -c invert.yaml - -example config file: - - labelmap: 
padded_target/labelmap.nrrd - voxel_size: 28 - stage_dirs: - - deformable_to_8 - - deformable_to_128 - - affine - - rigid - -All paths are relative to the directory containing the config file - - -Notes ------ -The inversion will only work well for labelmaps as the final interpolation order is set to 0 to prevent interpolation -of lable map values and to keep them as the correct integers - -Currently only inverts one elx_tform_params file per stage. Should be albe to do multple - -Inversion can fail if the registration resolutions are set incorrectly. -For example, if the non-linear step has 6 resolutions and a a final BSpline grid spacing of 8, the largest grid size -will be 256. It seems that if this is larger than the input image dimensions, the inversion will fail. - -""" - -IGNORE_FOLDER = 'resolution_images' - -from logzero import logger as logging -import tempfile -import os -import subprocess -import sys -from collections import defaultdict -from multiprocessing import Pool -from os.path import join, splitext, abspath, basename, isfile, isdir -import shutil - -import yaml -sys.path.insert(0, join(os.path.dirname(__file__), '..')) -import common -from img_processing.pad import unpad_roi -from paths import RegPaths - -ELX_TRANSFORM_PREFIX = 'TransformParameters.0.txt' -ELX_PARAM_PREFIX = 'elastix_params_' -ELX_INVERTED_POINTS_NAME = 'outputpoints.vtk' -FILE_FORMAT = '.nrrd' -LOG_FILE = 'inversion.log' - -INVERSION_DIR_NAME = 'Inverted_transform_parameters' -LABEL_INVERTED_TRANFORM = 'labelInvertedTransform.txt' -IMAGE_INVERTED_TRANSFORM = 'ImageInvertedTransform.txt' -VOLUME_CALCULATIONS_FILENAME = "organvolumes.csv" - - -def setup_logging(outdir, logname, debug): - """ - If this module is being run directly from command line (ie. not from run_lama.py) setup logging to a new file - - Parameters - ---------- - outdir: str - directory to save log file in - logname: str - name of log file - """ - - if __name__ == '__main__' or debug: - logpath = join(outdir, logname) - logging.basicConfig(filename=logpath, level=logging.DEBUG, - format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p') - - -def batch_invert_transform_parameters(config_file, invert_config_file, outdir, threads=None, noclobber=False, log=False): - """ - Create new elastix TransformParameter files that can then be used by transformix to invert labelmaps, stats etc - - Parameters - ---------- - config_file: str - path to original reg pipeline config file - - outdir: str - Absolute path to output dir - - invert_config_file: str - path to output inversion config to - - noclobber: bool - if True don't overwrite inverted parameters present - """ - common.test_installation('elastix') - - setup_logging(outdir, 'invert_transforms.log', log) - - with open(config_file, 'r') as yf: - config = yaml.load(yf) - config_dir = os.path.abspath(os.path.dirname(config_file)) - - reg_dirs = get_reg_dirs(config, config_dir) - - # Get the image basename from the first stage registration folder (rigid?) 
- first_stage = join(config_dir, reg_dirs[0]) - volume_names = [basename(x) for x in common.get_file_paths(first_stage, ignore_folder=IGNORE_FOLDER)] - - common.mkdir_if_not_exists(outdir) - stages_to_invert = defaultdict(list) - - jobs = [] - if not threads: - threads = 1 - else: - threads = int(threads) - - for i, vol_name in enumerate(volume_names): - - label_replacements ={ - 'FinalBSplineInterpolationOrder': '0', - 'FixedInternalImagePixelType': 'short', - 'MovingInternalImagePixelType': 'short', - 'ResultImagePixelType': 'unsigned char', - 'WriteTransformParametersEachResolution': 'false', - 'WriteResultImageAfterEachResolution': 'false' - } - - image_replacements = { - 'FinalBSplineInterpolationOrder': '3', - 'FixedInternalImagePixelType': 'float', - 'MovingInternalImagePixelType': 'float', - 'ResultImagePixelType': 'float', - 'WriteTransformParametersEachResolution': 'false', - 'WriteResultImageAfterEachResolution': 'false' - - } - - vol_id, vol_ext = splitext(vol_name) - - for r in reg_dirs: - reg_dir = join(config_dir, r) - - stage_out_dir = join(outdir, basename(reg_dir)) - - moving_dir = join(config_dir, reg_dir, vol_id) - invert_param_dir = join(stage_out_dir, vol_id) - - if not os.path.isdir(moving_dir): - logging.warning('cannot find {}'.format(moving_dir)) - continue - stage_vol_files = os.listdir(moving_dir) # All the files from the registration dir - stage_files = os.listdir(reg_dir) # The registration stage parent directory - parameter_file = next(join(reg_dir, i) for i in stage_files if i.startswith(ELX_PARAM_PREFIX)) - transform_file = next(join(moving_dir, i) for i in stage_vol_files if i.startswith(ELX_TRANSFORM_PREFIX)) - - if not isfile(parameter_file): - logging.error('elastix transform parameter file missing: {}'.fomrat(transform_file)) - continue - if not isfile(parameter_file): - logging.error('elastix registration paramter file missing: {}'.format(parameter_file)) - continue - - common.mkdir_if_not_exists(stage_out_dir) - - rel_inversion_path = os.path.basename(r) - if rel_inversion_path not in stages_to_invert['inversion_order']: - stages_to_invert['inversion_order'].insert(0, rel_inversion_path) - - if not noclobber: - common.mkdir_force(invert_param_dir) # Overwrite any inversion file that exist for a single specimen - reg_metadata = yaml.load(open(join(moving_dir, common.INDV_REG_METADATA))) - fixed_volume = abspath(join(moving_dir, reg_metadata['fixed_vol'])) # The original fixed volume used in the registration - - # Invert the Transform paramteres with options for normal image inversion - - job = { - 'invert_param_dir': invert_param_dir, - 'parameter_file': abspath(parameter_file), - 'transform_file': transform_file, - 'fixed_volume': fixed_volume, - 'param_file_output_name': 'inversion_parameters.txt', - 'image_replacements': image_replacements, - 'label_replacements': label_replacements, - 'image_transform_file': IMAGE_INVERTED_TRANSFORM, - 'label_transform_file': LABEL_INVERTED_TRANFORM, - 'noclobber': noclobber - } - - jobs.append(job) - - # with open('/home/neil/work/jobs.json', 'w') as fh: - # json.dump(jobs, fh, sort_keys=True, indent=4, separators=(',', ': ')) - # return - logging.info('inverting with {} threads: '.format(threads)) - pool = Pool(threads) - try: - pool.map(_invert_transform_parameters, jobs) - - except KeyboardInterrupt: - print('terminating inversion') - pool.terminate() - pool.join() - - reg_dir = os.path.relpath(reg_dir, outdir) - stages_to_invert['registration_directory'] = reg_dir - # Create a yaml config file so that 
inversions can be run seperatley - with open(invert_config_file, 'w') as yf: - yf.write(yaml.dump(dict(stages_to_invert), default_flow_style=False)) - - -def _invert_transform_parameters(args): - """ - Generate a single inverted elastix transform parameter file. This can then be used to invert labels, masks etc. - If any of the step faile, return as subsequent steps will also fail. The logging of failures is handled - within each function - """ - - # If we have both the image and label inverted transforms, don't do anything if noclobber is True - noclobber = args['noclobber'] - - image_transform_param_path = abspath(join(args['invert_param_dir'], args['image_transform_file'])) - label_transform_param_path = abspath(join(args['invert_param_dir'], args['label_transform_file'])) - - if noclobber and isfile(label_transform_param_path) and isfile(image_transform_param_path): - logging.info('skipping {} as noclobber is True and inverted parameter files exist') - return - - # Modify the elastix registration input parameter file to enable inversion (Change metric and don't write image results) - inversion_params = abspath(join(args['invert_param_dir'], args['param_file_output_name'])) # The elastix registration parameters used for inversion - _modify_param_file(abspath(args['parameter_file']), inversion_params, args['image_replacements']) # I don't think we need the replacements here!!!!!!!! - - # Do the inversion, making the inverted TransformParameters file - fixed_vol = args['fixed_volume'] - forward_tform_file = abspath(args['transform_file']) - invert_param_dir = args['invert_param_dir'] - if not _invert_tform(fixed_vol, forward_tform_file, inversion_params, invert_param_dir): - return - - # Get the resulting TransformParameters file, and create a transform file suitable for inverting normal volumes - image_inverted_tform = abspath(join(args['invert_param_dir'], 'TransformParameters.0.txt')) - - - if not _modify_inverted_tform_file(image_inverted_tform, image_transform_param_path): - return - - # Get the resulting TransformParameters file, and create a transform file suitable for inverting label volumes - - # replace the parameter in the image file with label-specific parameters and save in new file. No need to - # generate one from scratch - if not _modify_param_file(image_transform_param_path, label_transform_param_path, args['label_replacements']): - return - - _modify_inverted_tform_file(label_transform_param_path) - - - -def get_reg_dirs(config, config_dir): - """ - - """ - paths = RegPaths(config_dir, config) - reg_stages = [] - root_reg_dir = paths.get('root_reg_dir') - for i, reg_stage in enumerate(config['registration_stage_params']): - stage_id = reg_stage['stage_id'] - stage_dir = join(root_reg_dir, stage_id) - reg_stages.append(stage_dir) - return reg_stages - - -class Invert(object): - def __init__(self, config_path, invertable, outdir, threads=None, noclobber=False): - """ - Inverts a series of volumes. A yaml config file specifies the order of inverted transform parameters - to use. This config file should be in the root of the directory containing these inverted tform dirs. - - Also need to input a directory containing volumes/label maps etc to invert. These need to be in directories - named with the same name as the corresponding inverted tform file directories - - Parameters - ---------- - config_path: str - path to yaml config containing the oder of the inverted directories to use - threads: str/ None - number of threas to use. 
If None, use all available threads - invertable_volume: str - path to object to invert - invertable: str - dir or path. If dir, invert all objects within the subdirectories. - If path to object (eg. labelmap) invert that instead - noclobber: bool - if True do not overwrite already inverted labels - :return: - """ - - setup_logging(outdir, 'invert.log', True) - - self.noclobber = noclobber - - with open(config_path, 'r') as yf: - self.config = yaml.load(yf) - - self.invertables = invertable - self.config_dir = os.path.dirname(config_path) # The dir containing the inverted elx param files - - self.threads = threads - self.out_dir = outdir - common.mkdir_if_not_exists(self.out_dir) - - self.inverted_tform_stage_dirs = self.get_inversion_dirs() - self.forward_tform_stage_dirs = self.get_forward_tranforms() - - self.elx_param_prefix = ELX_PARAM_PREFIX - self.invert_transform_name = None # Set in subclasses - self.last_invert_dir = None - - def get_inversion_dirs(self): - - dirs = [] - for dir_name in self.config['inversion_order']: - dir_path = join(self.config_dir, dir_name) - dirs.append(dir_path) - return dirs - - def get_forward_tranforms(self): - dirs = [] - reg_dir = self.config.get('registration_directory') - for dir_name in self.config['inversion_order']: - dir_path = join(self.config_dir, reg_dir, dir_name) - dirs.append(dir_path) - return dirs - - - @staticmethod - def parse_yaml_config(config_path): - """ - Opens the yaml config file - - Parameters - ---------- - config_path: str - path to config file - - Returns - ------- - dict: - The config - """ - - try: - config = yaml.load(open(config_path, 'r')) - except Exception as e: - sys.exit("can't read the YAML config file - {}".format(e)) - return config - - def run(self): - """ - - """ - - inverting_names = os.listdir(self.inverted_tform_stage_dirs[0]) - - for i, vol_name in enumerate(inverting_names): - invertable = self.invertables - - for inversion_stage, forward_stage in zip(self.inverted_tform_stage_dirs, self.forward_tform_stage_dirs): - invert_stage_out = join(self.out_dir, basename(inversion_stage)) - if not os.path.isdir(invert_stage_out): - common.mkdir_if_not_exists(invert_stage_out) - - if self.type == 'forward': # temp bodge for mesh inversion problem - inv_tform_dir = join(forward_stage, vol_name) - transform_file = join(inv_tform_dir, self.invert_transform_name) - else: - inv_tform_dir = join(inversion_stage, vol_name) - transform_file = join(inv_tform_dir, self.invert_transform_name) - - invert_vol_out_dir = join(invert_stage_out, vol_name) - - # Do not try to invert volume if the output folder already exits - if self.noclobber and isdir(invert_vol_out_dir): - continue - - common.mkdir_if_not_exists(invert_vol_out_dir) - - print('inverting {}'.format(transform_file)) - - invertable = self._invert(invertable, transform_file, invert_vol_out_dir, self.threads) - - if not invertable: # If inversion failed or there is nocobber, will get None - continue # Move on to next volume to invert - - self.last_invert_dir = invert_stage_out - - def _invert(self): - raise NotImplementedError - - -class InvertLabelMap(Invert): - - def __init__(self, *args, **kwargs): - super(InvertLabelMap, self).__init__(*args, **kwargs) - self.invert_transform_name = LABEL_INVERTED_TRANFORM - self.type = 'normal' - - def run(self): - """ - Calls the parent run function to invert the labels. 
- Then optionally calculates organ volumes for the final inverted labels - """ - super(InvertLabelMap, self).run() - - def _invert(self, labelmap, tform, outdir, threads=None): - """ - Using the iverted elastix transform paramter file, invert a volume with transformix - - Parameters - ---------- - vol: str - path to volume to invert - tform: str - path to elastix transform parameter file - outdir: str - path to save transformix output - rename_output: str - rename the transformed volume to this - threads: str/None - number of threads for transformix to use. if None, use all available cpus - Returns - ------- - str/bool - path to new img if succesful else False - """ - #lm_basename = os.path.splitext(os.path.basename(labelmap))[0] - if not common.test_installation('transformix'): - raise OSError('Cannot find transformix. Is it installed') - - old_img = os.path.join(outdir, TRANSFORMIX_OUT) # where thetransformix-inverted labelmap will be - - path, base = os.path.split(os.path.normpath(outdir)) - new_output_name = os.path.join(outdir, '{}.nrrd'.format(base)) # Renamed transformix-inverted labelmap - - # if self.noclobber and isfile(new_output_name): - # Maybe need to do two types of noclobber - # 1: where if the folder does not exist, do not do it - # 2: where the folder exists but the final output file does not exist - # return None - - - cmd = [ - 'transformix', - '-in', labelmap, - '-tp', tform, - '-out', outdir - ] - - if threads: - cmd.extend(['-threads', str(threads)]) - try: - subprocess.check_output(cmd) - except Exception as e: - logging.exception('{}\ntransformix failed inverting labelmap: {}'.format(e, labelmap)) - # sys.exit() - logging.error('transformix failed with this command: {}\nerror message:'.format(cmd)) - - try: - shutil.move(old_img, new_output_name) - except IOError as e: - print - 'could not rename {}'.format(old_img) - return old_img - else: - return new_output_name - - -class InvertStats(InvertLabelMap): - """ - This class behaves almost the same as InvertLabelMap in that it inverts a single image file back onto multiple - inputs. It just uses a different elastix parameters - """ - def __init__(self, *args, **kwargs): - super(InvertStats, self).__init__(*args, **kwargs) - self.invert_transform_name = IMAGE_INVERTED_TRANSFORM - self.type = 'normal' - - -class InvertMeshes(Invert): - - def __init__(self, config_path, invertable, outdir, threads=None): - super(InvertMeshes, self).__init__(config_path, invertable, outdir, threads) - self.invert_transform_name = ELX_TRANSFORM_PREFIX - self.type = 'forward' - - def _invert(self, mesh, tform, outdir, threads=None): - """ - Using the iverted elastix transform paramter file, invert a volume with transformix - - Parameters - ---------- - vol: str - path to volume to invert - tform: str - path to elastix transform parameter file - outdir: str - path to save transformix output - rename_output: str - rename the transformed volume to this - threads: str/None - number of threads for transformix to use. 
if None, use all available cpus - Returns - ------- - str/bool - path to new img if succesful else False - """ - common.test_installation('transformix') - m_basename = os.path.splitext(os.path.basename(mesh))[0] - new_vtk_path = join(outdir, m_basename + '.vtk') - - cmd = [ - 'transformix', - '-def', mesh, - '-tp', tform, - '-out', outdir - ] - - if threads: - cmd.extend(['-threads', str(threads)]) - try: - subprocess.check_output(cmd) - except Exception as e: - print 'transformix failed inverting mesh: {}'.format(mesh) - logging.error('transformix failed with this command: {}\nerror message:'.format(cmd), exc_info=True) - print e - sys.exit(1) - try: - # rename the inverted points form this stage - old_vtk = os.path.join(outdir, ELX_INVERTED_POINTS_NAME) - os.rename(old_vtk, new_vtk_path) - except OSError: - - raise - else: - return new_vtk_path - - -class InvertRoi(InvertLabelMap): - def __init__(self, config_path, invertable, outdir, vol_info, voxel_size, threads=None): - super(InvertRoi, self).__init__(config_path, invertable, outdir, threads) - self.invert_transform_name = LABEL_INVERTED_TRANFORM - self.vol_info = vol_info - self.voxel_size = voxel_size - - def run(self): - super(InvertRoi, self).run() - # At this point we have a bunch of rois inverted onto the padded inputs - # We need to adjust the rois to account for the padding - out = join(self.out_dir, 'Extracted_roi') - unpad_roi(self.vol_info, self.last_invert_dir, self.voxel_size, out) - - -class InvertSingleVol(Invert): - """ - Invert volumes using the elastix inverted transform parameters. - This class is used for inverting statitistics overlays - """ - def __init__(self, *args, **kwargs): - super(InvertSingleVol, self).__init__(*args, **kwargs) - self.invert_transform_name = IMAGE_INVERTED_TRANSFORM - - def run(self, prefix=None): - """ - Parameters - ---------- - prefix: str - A prefix that is added to the stats volumes. To locate correct transform inversion files, look for files - with this prefix missing - """ - - # inverting_names = os.listdir(self.inverted_tform_stage_dirs[ # 0]) - - # for i, vol_name in enumerate(inverting_names): - # if self.batch_invert: - # invertable = self.invertables - # else: - # invertable = self.invertables[vol_name] - - volname, ext = splitext(basename(self.invertables)) - if prefix and volname.startswith(prefix): - original_vol_name = volname[len(prefix):] # remove the prfix to get the original vol name to find the tf - else: - original_vol_name = volname - invertable = self.invertables - - for inversion_stage in self.inverted_tform_stage_dirs: - invert_stage_out = join(self.out_dir, basename(inversion_stage)) - common.mkdir_if_not_exists(invert_stage_out) - - inv_tform_dir = join(inversion_stage, original_vol_name) - - transform_file = join(inv_tform_dir, IMAGE_INVERTED_TRANSFORM) - invert_vol_out_dir = join(invert_stage_out, volname) - common.mkdir_if_not_exists(invert_vol_out_dir) - - invertable = self._invert(invertable, transform_file, invert_vol_out_dir, self.threads) - - def _invert(self, volume, tform, outdir, threads=None): - """ - Using the iverted elastix transform paramter file, invert a volume with transformix - - Parameters - ---------- - vol: str - path to volume to invert - tform: str - path to elastix transform parameter file - outdir: str - path to save transformix output - rename_output: str - rename the transformed volume to this - threads: str/None - number of threads for transformix to use. 
if None, use all available cpus - Returns - ------- - str/bool - path to new img if succesful else False - """ - common.test_installation('transformix') - lm_basename = os.path.splitext(os.path.basename(volume))[0] - new_img_path = join(outdir, lm_basename + FILE_FORMAT) - - cmd = [ - 'transformix', - '-in', volume, - '-tp', tform, - '-out', outdir - ] - - if threads: - cmd.extend(['-threads', str(threads)]) - try: - subprocess.check_output(cmd) - except Exception as e: - print 'transformix failed inverting volume: {} Is transformix installed?. Error: {}'.format(volume, e) - print(e) - #logging.error('transformix failed with this command: {}\nerror message:'.format(cmd), exc_info=True) - sys.exit() - try: - old_img = os.path.join(outdir, TRANSFORMIX_OUT) - os.rename(old_img, new_img_path) - except OSError: - - return old_img - else: - return new_img_path - - -def _modify_param_file(elx_param_file, newfile_name, replacements): - """ - Modifies the elastix input parameter file that was used in the original transformation. - Adds DisplacementMagnitudePenalty (which is needed for inverting) - Turns off writing the image results at the end as we only need an inveterted output file. - Also changes interpolation order in the case of inverting labels - - Parameters - ---------- - elx_param_file: str - path to elastix input parameter file - newfile_name: str - path to save modified parameter file to - - """ - - try: - with open(elx_param_file) as old, open(newfile_name, "w") as new: - - for line in old: - if line.startswith("(Metric "): - line = '(Metric "DisplacementMagnitudePenalty")\n' - if line.startswith('(WriteResultImage '): - line = '(WriteResultImage "false")\n' - if line.startswith('WriteResultImageAfterEachResolution '): - continue - try: - param_name = line.split()[0][1:] - except IndexError: - continue # comment? - - if param_name in replacements: - value = replacements[param_name] - try: - int(value) - except ValueError: - # Not an int, neeed quotes - line = '({} "{}")\n'.format(param_name, value) - else: - # An int, no quotes - line = '({} {})\n'.format(param_name, value) - new.write(line) - except IOError as e: - logging.error("Error modifying the elastix parameter file: {}".format(e)) - return False - return True - - -def _invert_tform(fixed, tform_file, param, outdir): - """ - Invert the transform and get a new transform file - """ - if not common.test_installation('elastix'): - raise OSError('elastix not installed') - - - a = isfile(fixed) - b = isfile(tform_file) - c = isfile(param) - d = isdir(outdir) - - cmd = ['elastix', - '-t0', tform_file, - '-p', param, - '-f', fixed, - '-m', fixed, - '-out', outdir, - '-threads', '1' # Just use one thread within elastix as LAMA is dealing with the multithreading - ] - - - try: - subprocess.check_output(cmd) - except (Exception, subprocess.CalledProcessError) as e: - logging.exception('Inverting transform file failed. cmd: {}\n{}:'.format(cmd, str(e))) - return False - return True - - -def _modify_inverted_tform_file(elx_tform_file, newfile_name=None): - """ - Remove "NoInitialTransform" from the output transform parameter file - Set output image format to unsigned char. 
Writes out a modified elastix transform parameter file - that can be used for inverting volumes - - Parameters - ---------- - elx_tform_file: str - path to elastix transform file - newfile_mame: str - path to save modified transform file - """ - - if not newfile_name: # Write to temporary file before overwriting - new_file = tempfile.NamedTemporaryFile().name - else: - new_file = newfile_name - - try: - - with open(new_file, "w+") as new_tform_param_fh, open(elx_tform_file, "r") as tform_param_fh: - - for line in tform_param_fh: - if line.startswith('(InitialTransformParametersFileName'): - line = '(InitialTransformParametersFileName "NoInitialTransform")\n' - new_tform_param_fh.write(line) - new_tform_param_fh.close() - tform_param_fh.close() - - except IOError: - logging.warning("Error reading or writing transform files {}".format(elx_tform_file)) - return False - - return True - - -def is_euler_stage(tform_param): - """ - Return True if the registration used to create this param file was a Euler transform. Can't currently invert - Euler transforms with this method, and is usually not required - :param tform_param: - :return: - """ - with open(tform_param, 'r') as fh: - line = fh.readline() - if 'EulerTransform' in line: - return True - else: - return False - - -if __name__ == '__main__': - - # Log all uncaught exceptions - sys.excepthook = common.excepthook_overide - - def print_args_error(): - msg = ("\nOptions are\n\n" - "reg - make inverse transform parameter files for elastix\n" - "labels - invert a label image (including masks) using previously-generated inverse transform parameter files\n" - "vol - invert a grey scale image using previously-generated inverse transform parameter files\n" - "meshes - invert a itk mesh using previously-generated inverse transform parameter files\n" - "roi - trnsform roi coordinates using previously-generated inverse transform parameter files\n\n" - "Examples:\ninvert_volumes.py reg -c lama_config.yaml -o output/inverted_transforms, -t 8\n" - "invert_volumes.py lables -c inverted_transforms/invert.yaml -o output/inverted_lables -i label_to_invert -t 8\n") - sys.exit(msg) - - if len(sys.argv) < 2: - print_args_error() - - import argparse - - # this_script_path = os.path.dirname(os.path.realpath(__file__)) - # default_invert_parameter = join(this_script_path, ) - - if sys.argv[1] == 'labels': - parser = argparse.ArgumentParser("invert lablels and masks") - parser.add_argument('-c', '--config', dest='config', help='yaml config file. Usually root/output/inverted_transforms/invert.yaml', required=True) - parser.add_argument('-i', '--invertable', dest='invertable', help='label volume to invert', required=True) - parser.add_argument('-o', '--outdir', dest='outdir', help='output dir. 
Usually root/output/inverted_labels', required=True) - parser.add_argument('-t', '--threads', dest='threads', type=str, help='number of threads to use', required=False) - parser.add_argument('-noclobber', '--noclobber', dest='noclobber', default=False, action='store_true') - - args, _ = parser.parse_known_args() - inv = InvertLabelMap(args.config, args.invertable, args.outdir, threads=args.threads, noclobber=args.noclobber) - inv.run() - - elif sys.argv[1] == 'reg': - parser = argparse.ArgumentParser("invert elastix registrations to create elastix inversion parameter files") - parser.add_argument('-c', '--config', dest='config', help='Main LAMA config file with list of registration dirs', required=True) - parser.add_argument('-o', '--out', dest='outdir', help='where to put the output', required=True) - parser.add_argument('-t', '--threads', dest='threads', type=str, help='number of threads to use', required=False) - parser.add_argument('-noclobber', '--noclobber', dest='noclobber', default=False, action='store_true') - args, _ = parser.parse_known_args() - config_out = join(args.outdir, 'invert.yaml') - batch_invert_transform_parameters(args.config, config_out, args.outdir, args.threads, noclobber=args.noclobber) - - elif sys.argv[1] == 'vol': - parser = argparse.ArgumentParser("invert image volumes") - parser.add_argument('-c', '--config', dest='config', help='yaml config file. Usually root/output/inverted_transforms/invert.yaml', required=True) - parser.add_argument('-i', '--invertable', dest='invertable', help='volume to invert', required=True) - parser.add_argument('-o', '--outdir', dest='outdir', help='output dir. Usually root/output/inverted_labels', required=True) - parser.add_argument('-p', '--prefix', dest='prefix', help='A prefix added to the invertable, that is not present on the invert transform files', default=False) - - parser.add_argument('-t', '--threads', dest='threads', type=str, help='number of threads to use', required=False) - args, _ = parser.parse_known_args() - inv = InvertSingleVol(args.config, args.invertable, args.outdir) - inv.run(args.prefix) - - elif sys.argv[1] == 'meshes': - parser = argparse.ArgumentParser("invert meshes") - parser.add_argument('-c', '--config', dest='config', help='yaml config file', required=True) - parser.add_argument('-m', '--meshes', dest='mesh', help='mesh dir/mesh file', required=True) - parser.add_argument('-o', '--outdir', dest='outdir', help='output dir', required=True) - parser.add_argument('-t', '--threads', dest='threads', type=str, help='number of threads to use', required=False) - - args, _ = parser.parse_known_args() - if os.path.isdir(args.mesh): - for path in common.get_file_paths(args.mesh): - inv = InvertMeshes(args.config, path, args.outdir) - inv.run() - else: - inv = InvertMeshes(args.config, args.mesh, args.outdir) - inv.run() - - elif sys.argv[1] == 'roi': - parser = argparse.ArgumentParser("invert roi") - parser.add_argument('-c', '--config', dest='config', help='yaml config file', required=True) - parser.add_argument('-s', '--starts', dest='starts', help='roi starts (xyz)', required=True) - parser.add_argument('-e', '--ends', dest='ends', help='roi ends (xyz)', required=True, nargs=3, type=int) - parser.add_argument('-o', '--outdir', dest='outdir', help='output dir', required=True, nargs=3, type=int) - parser.add_argument('-t', '--threads', dest='threads', type=str, help='number of threads to use', required=False) - parser.add_argument('-i', '--info', dest='info', type=str, help='info on padding and full res 
locations, yaml', - required=False) - parser.add_argument('-v', '--voxel_size', dest='voxel_size', type=str, help='Voxel size of scaled images (um)', - required=False) - args, _ = parser.parse_known_args() - inv = InvertRoi(args.config, args.label, args.outdir, args.info, args.voxel_size, args.threads) - inv.run() - - else: - print_args_error() - diff --git a/lama/elastix/invert_transforms.py b/lama/elastix/invert_transforms.py index b8ee0047..f7180731 100644 --- a/lama/elastix/invert_transforms.py +++ b/lama/elastix/invert_transforms.py @@ -14,8 +14,8 @@ from lama.common import cfg_load from lama.registration_pipeline.validate_config import LamaConfig -from lama.elastix import (ELX_TRANSFORM_NAME, ELX_PARAM_PREFIX, LABEL_INVERTED_TRANFORM, - IMAGE_INVERTED_TRANSFORM, INVERT_CONFIG, RESOLUTION_IMGS_DIR, IMG_PYRAMID_DIR) +from lama.elastix import (ELX_TRANSFORM_NAME, ELX_PARAM_PREFIX, PROPAGATE_LABEL_TRANFORM, + PROPAGATE_IMAGE_TRANSFORM, PROPAGATE_CONFIG, RESOLUTION_IMGS_DIR, IMG_PYRAMID_DIR) LABEL_REPLACEMENTS = { 'FinalBSplineInterpolationOrder': '0', @@ -37,7 +37,7 @@ } -def batch_invert_transform_parameters(config: Union[str, LamaConfig], +def batch_invert_transform_parameters(config: Union[Path, LamaConfig], clobber=True, new_log:bool=False): """ Create new elastix TransformParameter files that can then be used by transformix to invert labelmaps, stats etc @@ -97,8 +97,8 @@ def batch_invert_transform_parameters(config: Union[str, LamaConfig], inv_stage_dir.mkdir(exist_ok=True) # Add the stage to the inversion order config (in reverse order), if not already. - if reg_stage_dir.name not in stages_to_invert['inversion_order']: - stages_to_invert['inversion_order'].insert(0, reg_stage_dir.name) + if reg_stage_dir.name not in stages_to_invert['label_propagation_order']: + stages_to_invert['label_propagation_order'].insert(0, reg_stage_dir.name) if clobber: common.mkdir_force(specimen_stage_inversion_dir) # Overwrite any inversion file that exist for a single specimen @@ -117,19 +117,17 @@ def batch_invert_transform_parameters(config: Union[str, LamaConfig], 'param_file_output_name': 'inversion_parameters.txt', 'image_replacements': IMAGE_REPLACEMENTS, 'label_replacements': LABEL_REPLACEMENTS, - 'image_transform_file': IMAGE_INVERTED_TRANSFORM, - 'label_transform_file': LABEL_INVERTED_TRANFORM, + 'image_transform_file': PROPAGATE_IMAGE_TRANSFORM, + 'label_transform_file': PROPAGATE_LABEL_TRANFORM, 'clobber': clobber, 'threads': threads } jobs.append(job) - # Run the inversion jobs. 
Currently using only one thread as it seems that elastix now uses multiple threads on the - # Inversions - - logging.info('inverting with {} threads: '.format(threads)) - pool = Pool(1) # 17/09/18 If we can get multithreded inversion in elastix 4.9 we can remove the python multithreading + # By running each inversion job (a single job per registration stage) in its own process we can speed things up a bit + # If we can get multithreaded inversion in elastix we can remove this Python multiprocessing + pool = Pool(8) try: pool.map(_invert_transform_parameters, jobs) @@ -142,7 +140,7 @@ batch_invert_transform_parameters(config: Union[str, LamaConfig], reg_dir = Path(os.path.relpath(reg_stage_dir, inv_outdir)) stages_to_invert['registration_directory'] = str(reg_dir) # Doc why we need this # Create a yaml config file so that inversions can be run seperatley - invert_config = config['inverted_transforms'] / INVERT_CONFIG + invert_config = config['inverted_transforms'] / PROPAGATE_CONFIG with open(invert_config, 'w') as yf: yf.write(yaml.dump(dict(stages_to_invert), default_flow_style=False)) diff --git a/lama/elastix/invert_volumes.py b/lama/elastix/propagate_volumes.py similarity index 64% rename from lama/elastix/invert_volumes.py rename to lama/elastix/propagate_volumes.py index 2010a856..eb581b7d 100755 --- a/lama/elastix/invert_volumes.py +++ b/lama/elastix/propagate_volumes.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- +"""propagate_volumes.py -"""invert_volumes.py +This module propagates labels using transforms made from either 'invert_transforms.py' or 'reverse_registration.py'. -This module inverts registrations performed with elastix +Note the word 'invert' used throughout. Read as propagate. It's this way as initially the only way to propagate +labels was to use the transform inversion method. But now we can use the reverse registration method i.e. register the +population average to the specimen image, then propagate the atlas using the given transform. Example ------- @@ -12,14 +14,13 @@ InvertLabelMap(invert_config, label_map_path, labels_inverion_dir, threads=32).run() example config file: - - labelmap: padded_target/labelmap.nrrd - voxel_size: 28 - stage_dirs: + label_propagation_order: + - rigid + - affine - deformable_to_8 - deformable_to_128 - - affine - - rigid + +Note: The order will be reversed if doing label propagation using the inverted transform method All paths are relative to the directory containing the config file @@ -37,21 +38,23 @@ """ from pathlib import Path +from typing import List import os import subprocess from os.path import join import shutil from logzero import logger as logging +import yaml from lama import common from lama.common import cfg_load -from lama.elastix import (LABEL_INVERTED_TRANFORM, IMAGE_INVERTED_TRANSFORM, ELX_PARAM_PREFIX, TRANSFORMIX_OUT, - ELX_TRANSFORM_NAME, ELX_INVERTED_POINTS_NAME) +from lama.elastix import (PROPAGATE_LABEL_TRANFORM, PROPAGATE_IMAGE_TRANSFORM, ELX_PARAM_PREFIX, TRANSFORMIX_OUT, + ELX_TRANSFORM_NAME, ELX_INVERTED_POINTS_NAME, PROPAGATE_CONFIG) -class Invert(object): - def __init__(self, config_path: Path, invertable, outdir, threads=None, noclobber=False, ): +class Propagate(object): + def __init__(self, config_path: Path, invertable, outdir, threads=None, noclobber=False): """ Inverts a series of volumes. A yaml config file specifies the order of inverted transform parameters to use. This config file should be in the root of the directory containing these inverted tform dirs. 
@@ -62,11 +65,14 @@ def __init__(self, config_path: Path, invertable, outdir, threads=None, noclobbe Parameters ---------- config_path - path to yaml config containing the oder of the inverted directories to use + path to yaml config containing the order of the inverted directories to use. The directories containing + propagation transform files should be in the same directory threads: str/ None number of threas to use. If None, use all available threads - invertable_volume: str - path to object to invert + invertable: str + path to object to invert (raw image, mask, label map etc) + outdir + where to store inverted volumes invertable: str dir or path. If dir, invert all objects within the subdirectories. If path to object (eg. labelmap) invert that instead @@ -89,67 +95,63 @@ def __init__(self, config_path: Path, invertable, outdir, threads=None, noclobbe common.mkdir_if_not_exists(self.out_dir) self.elx_param_prefix = ELX_PARAM_PREFIX - self.invert_transform_name = None # Set in subclasses - self.last_invert_dir = None + self.PROPAGATION_TFORM_NAME = None # Set in subclasses + self.last_invert_dir = None # I think this is used as a way to find volumes to do organ vol calculation on def run(self): """ """ - done_file = self.out_dir / 'invert.done' + # temp_tform_dir = Path('/home/neil/Desktop/t/test_propagarte/temp') + done_file = self.out_dir / 'propagation.done' common.touch(done_file) vol_ids = os.listdir(self._transform_dirs()[0]) - logging.info('inverting volumes') + logging.info('propagating volumes') for i, id_ in enumerate(vol_ids): - invertable = self.invertables + prop_out_dir: Path = self.out_dir / id_ # create a folder with vol_id in case we have multiple vols to do + prop_out_dir.mkdir(exist_ok=True) - for inversion_stage in self._transform_dirs(): - invert_stage_out = self.out_dir / inversion_stage.name + tform_root = self.config_dir + init_tform = chain_tforms(tform_root, prop_out_dir, self.PROPAGATION_TFORM_NAME, self.config) - common.mkdir_if_not_exists(invert_stage_out) + propagated = self._propagate(self.invertables, init_tform, prop_out_dir, self.threads) - invert_vol_out_dir = invert_stage_out / id_ + if not propagated: # If propagation failed or there is noclobber, will get None + continue - common.mkdir_if_not_exists(invert_vol_out_dir) + self.last_invert_dir = self.out_dir - transform_file = inversion_stage / id_ / self.invert_transform_name - - # logging.info('inverting {}'.format(transform_file)) - - invertable = self._invert(invertable, transform_file, invert_vol_out_dir, self.threads) - - if not invertable: # If inversion failed or there is nocobber, will get None - continue - - self.last_invert_dir = invert_stage_out - - def _invert(self): + def _propagate(self): raise NotImplementedError - def _transform_dirs(self): + def _transform_dirs(self) -> List[Path]: + """ + When implemented, this method returns a list of directories each containing a transform parameter file for a + label propagation stage. 
+ """ raise NotImplementedError -class InvertLabelMap(Invert): +class PropagateLabelMap(Propagate): def __init__(self, *args, **kwargs): - super(InvertLabelMap, self).__init__(*args, **kwargs) - self.invert_transform_name = LABEL_INVERTED_TRANFORM + super(PropagateLabelMap, self).__init__(*args, **kwargs) + self.PROPAGATION_TFORM_NAME = PROPAGATE_LABEL_TRANFORM def _transform_dirs(self): dirs = [] - for dir_name in self.config['inversion_order']: + for dir_name in self.config['label_propagation_order']: dir_path = self.config_dir / dir_name dirs.append(dir_path) return dirs - def _invert(self, labelmap, tform, outdir: Path, threads=None): + def _propagate(self, labelmap, tform, outdir: Path, threads=None): """ Using the iverted elastix transform paramter file, invert a volume with transformix @@ -194,7 +196,7 @@ def _invert(self, labelmap, tform, outdir: Path, threads=None): try: subprocess.check_output(cmd) except Exception as e: - logging.exception('{}\ntransformix failed inverting labelmap: {}'.format(e, labelmap)) + logging.exception('{}\ntransformix failed propagating labelmap: {}'.format(e, labelmap)) raise try: @@ -207,19 +209,19 @@ def _invert(self, labelmap, tform, outdir: Path, threads=None): return new_output_name -class InvertHeatmap(InvertLabelMap): +class PropagateHeatmap(PropagateLabelMap): """ This class behaves the same as InvertLabelap but uses a different transform parameter file """ def __init__(self, *args, **kwargs): - super(InvertHeatmap, self).__init__(*args, **kwargs) - self.invert_transform_name = IMAGE_INVERTED_TRANSFORM + super(PropagateHeatmap, self).__init__(*args, **kwargs) + self.PROPAGATION_TFORM_NAME = PROPAGATE_IMAGE_TRANSFORM -class InvertMeshes(Invert): +class PropagateMeshes(Propagate): def __init__(self, config_path, invertable, outdir, threads=None): - super(InvertMeshes, self).__init__(config_path, invertable, outdir, threads) + super(PropagateMeshes, self).__init__(config_path, invertable, outdir, threads) self.invert_transform_name = ELX_TRANSFORM_NAME def _transform_dirs(self): @@ -232,12 +234,12 @@ def _transform_dirs(self): """ dirs = [] reg_dir = self.config.get('registration_directory') - for dir_name in self.config['inversion_order']: + for dir_name in self.config['label_propagation_order']: dir_path = join(self.config_dir, reg_dir, dir_name) dirs.append(dir_path) return dirs - def _invert(self, mesh, tform, outdir, threads=None): + def _propagate(self, mesh, tform, outdir, threads=None): """ Using the iverted elastix transform paramter file, invert a volume with transformix @@ -287,14 +289,14 @@ def _invert(self, mesh, tform, outdir, threads=None): return new_vtk_path -class InvertSingleVol(Invert): +class PropagateSingleVol(Propagate): """ Invert volumes using the elastix inverted transform parameters. 
This class is used for inverting statitistics overlays """ def __init__(self, *args, **kwargs): - super(InvertSingleVol, self).__init__(*args, **kwargs) - self.invert_transform_name = IMAGE_INVERTED_TRANSFORM + super(PropagateSingleVol, self).__init__(*args, **kwargs) + self.invert_transform_name = PROPAGATE_IMAGE_TRANSFORM def run(self, prefix=None): """ @@ -326,13 +328,13 @@ def run(self, prefix=None): inv_tform_dir = join(inversion_stage, original_vol_name) - transform_file = join(inv_tform_dir, IMAGE_INVERTED_TRANSFORM) + transform_file = join(inv_tform_dir, PROPAGATE_IMAGE_TRANSFORM) invert_vol_out_dir = join(invert_stage_out, volname) common.mkdir_if_not_exists(invert_vol_out_dir) - invertable = self._invert(invertable, transform_file, invert_vol_out_dir, self.threads) + invertable = self._propagate(invertable, transform_file, invert_vol_out_dir, self.threads) - def _invert(self, volume, tform, outdir, threads=None): + def _propagate(self, volume, tform, outdir, threads=None): """ Using the iverted elastix transform paramter file, invert a volume with transformix @@ -369,7 +371,7 @@ def _invert(self, volume, tform, outdir, threads=None): try: subprocess.check_output(cmd) except Exception as e: - logging.exception('transformix failed inverting volume: {} Is transformix installed?. Error: {}'.format(volume, e)) + logging.exception('transformix failed propagating volume: {} Is transformix installed?. Error: {}'.format(volume, e)) raise try: old_img = os.path.join(outdir, TRANSFORMIX_OUT) @@ -381,4 +383,49 @@ def _invert(self, volume, tform, outdir, threads=None): return new_img_path +def chain_tforms(root_dir: Path, new_tform_dir, tform_name, config): + + label_replacements = { + 'FinalBSplineInterpolationOrder': '0', + 'FixedInternalImagePixelType': 'short', + 'MovingInternalImagePixelType': 'short', + 'ResultImagePixelType': '"unsigned char"', + 'WriteTransformParametersEachResolution': 'false', + 'WriteResultImageAfterEachResolution': 'false' + } + + stages = config['label_propagation_order'] + + # stages = stages[::-1] + for i, stage in enumerate(stages): + stage_dir = root_dir / stage + tform_file = next(stage_dir.glob(f'**/{tform_name}')) + new_tform_file = new_tform_dir / f'{stage}_{tform_file.name}' + shutil.copyfile(tform_file, new_tform_file) + + if i + 1 < len(stages): + init_tform = new_tform_dir / f'{stages[i+1]}_{tform_file.name}' + # init_tform = f'{stages[i+1]}.txt' + + else: + init_tform = None + + new_tform_path = new_tform_dir / f'{stage}_{tform_file.name}' + + if i == 0: + file_for_transformix = new_tform_path + + with open(tform_file, 'r') as fh, open(new_tform_path, 'w') as nfh: + + for line in fh: + # + if line.startswith('(InitialTransformParametersFileName') and init_tform: + line = f'(InitialTransformParametersFileName "{str(init_tform)}")\n' + else: + for param in label_replacements: + if line.startswith(f'({param}'): + line = f'({param} {label_replacements[param]})\n' + + nfh.write(line) + return file_for_transformix \ No newline at end of file diff --git a/lama/elastix/reverse_registration.py b/lama/elastix/reverse_registration.py index 65a4eb64..6056a0bc 100644 --- a/lama/elastix/reverse_registration.py +++ b/lama/elastix/reverse_registration.py @@ -47,6 +47,7 @@ from os.path import join from typing import Union, Dict import os +import shutil from logzero import logger as logging import yaml @@ -57,8 +58,8 @@ from lama.paths import LamaSpecimenData from lama.elastix.elastix_registration import move_intemediate_volumes -from lama.elastix import 
(ELX_TRANSFORM_NAME, ELX_PARAM_PREFIX, LABEL_INVERTED_TRANFORM, - IMAGE_INVERTED_TRANSFORM) +from lama.elastix import (ELX_TRANSFORM_NAME, ELX_PARAM_PREFIX, PROPAGATE_LABEL_TRANFORM, + PROPAGATE_IMAGE_TRANSFORM, PROPAGATE_CONFIG) from lama.elastix.invert_transforms import (LABEL_REPLACEMENTS, IMAGE_REPLACEMENTS, ) from lama.elastix.elastix_registration import TargetBasedRegistration @@ -96,12 +97,16 @@ def run_registration_schedule(config: LamaConfig, fixed_vol, moving_vol: Path, o ------- The path to the final registrered images """ + # egp = {'WriteResultImage': 'false', # We only need the tform files not the images + # 'WriteResultImageAfterEachResolution': 'true'} elastix_stage_parameters = run_lama.generate_elx_parameters(config) # Set the moving volume dir and the fixed image for the first stage # Set the fixed volume up for the first stage. This will checnge each stage if doing population average stage_ids = [] + stage_spec_dirs = [] + for i, reg_stage in enumerate(config['registration_stage_params']): # Make the stage output dir @@ -133,33 +138,30 @@ def run_registration_schedule(config: LamaConfig, fixed_vol, moving_vol: Path, o registrator.set_target(fixed_vol) - if reg_stage['elastix_parameters']['Transform'] == 'BSplineTransform': - if config['fix_folding']: - logging.info(f'Folding correction for stage {stage_id} set') - registrator.fix_folding = True + # issues/133: switch off fix folding for label propagation until fixed + registrator.fix_folding = False + + # if reg_stage['elastix_parameters']['Transform'] == 'BSplineTransform': + # if config['fix_folding']: + # logging.info(f'Folding correction for stage {stage_id} set') + # registrator.fix_folding = True registrator.run() # Do the registrations for a single stage os.remove(elxparam_path) - # As the stage output diretory is named as the moving image, but in the case we want it named the same as the + # As the stage output diretory is named as the moving image, but in this case we want it named the same as the # fixed image stage_spec_dir = next(stage_dir.glob(f'*{moving_vol.stem}')) new_stage_spec_dir = stage_dir / fixed_vol.stem stage_spec_dir.rename(new_stage_spec_dir) - # Now delete everything we don't need - to_keep = [ELX_TRANSFORM_NAME, 'elastix.log', moving_vol.name] + stage_spec_dirs.append(new_stage_spec_dir) + moving_vol = new_stage_spec_dir / moving_vol.name - # for f in new_stage_spec_dir.iterdir(): - # if f.name not in to_keep: - # try: - # shutil.rmtree(f) - # except NotADirectoryError: - # f.unlink() src_tform_file = stage_dir / fixed_vol.stem / ELX_TRANSFORM_NAME - label_tform_file = stage_dir / fixed_vol.stem / LABEL_INVERTED_TRANFORM - image_tform_file = stage_dir / fixed_vol.stem / IMAGE_INVERTED_TRANSFORM + label_tform_file = stage_dir / fixed_vol.stem / PROPAGATE_LABEL_TRANFORM + image_tform_file = stage_dir / fixed_vol.stem / PROPAGATE_IMAGE_TRANSFORM modify_elx_parameter_file(src_tform_file, label_tform_file, LABEL_REPLACEMENTS) modify_elx_parameter_file(src_tform_file, image_tform_file, IMAGE_REPLACEMENTS) @@ -167,15 +169,25 @@ def run_registration_schedule(config: LamaConfig, fixed_vol, moving_vol: Path, o logging.info("### Reverse registration finished ###") - d = {'inversion_order': stage_ids} - with open(outdir / 'invert.yaml', 'w') as fh: + # Now delete everything we don't need + to_keep = [PROPAGATE_LABEL_TRANFORM, 'elastix.log', PROPAGATE_IMAGE_TRANSFORM] + + for s in stage_spec_dirs: + for f in s.iterdir(): + if f.name not in to_keep: + try: + shutil.rmtree(f) + except NotADirectoryError: + 
f.unlink() + + d = {'label_propagation_order': stage_ids} + with open(outdir / PROPAGATE_CONFIG, 'w') as fh: yaml.dump(d, fh) def modify_elx_parameter_file(elx_param_file: Path, newfile_name: str, replacements: Dict): """ Modifies the elastix input parameter file that was used in the original transformation. - Adds DisplacementMagnitudePenalty (which is needed for inverting) Turns off writing the image results at the end as we only need an inverted output file. Also changes interpolation order in the case of inverting labels diff --git a/lama/example_configs/ark_pop_avg.toml b/lama/example_configs/ark_pop_avg.toml new file mode 100644 index 00000000..7e826099 --- /dev/null +++ b/lama/example_configs/ark_pop_avg.toml @@ -0,0 +1,146 @@ +target_folder = "target" +threads = 96 +filetype = "nrrd" + +# This is the fixed volume for the initial rigid stage only +fixed_volume = "210602_C3H_avg_n18.nrrd" + + +fixed_mask = "fixed_mask.nrrd" +stats_mask = "stats_mask.nrrd" + +# Set to 'true' for population average creation +generate_new_target_each_stage = true +fix_folding = true +label_map = "210713_C3H_atlas_n18.nrrd" +label_info = "E14_5_atlas_v24_43_label_info.csv" + +skip_transform_inversion = false +staging = "embryo_volume" +label_propagation = "reverse_registration" + +[[registration_stage_params]] +stage_id = "rigid" + +[registration_stage_params.elastix_parameters] +Metric = "AdvancedMattesMutualInformation" +Registration = "MultiResolutionRegistration" +MaximumNumberOfIterations = 400 +NumberOfResolutions = 4 +NumberOfSpatialSamples = 20000 +Transform = "EulerTransform" +SP_a = [ 1000.0, 1000.0, 500.0, 500.0,] +SP_alpha = 0.602 +SP_A = 50.0 +UseDifferentiableOverlap = "false" + +[[registration_stage_params]] +stage_id = "similarity" +[registration_stage_params.elastix_parameters] +Registration = "MultiResolutionRegistration" +NumberOfResolutions = 4 +Transform = "SimilarityTransform" +Metric = "AdvancedMattesMutualInformation" +MaximumNumberOfIterations = 500 +NumberOfSpatialSamples = 20000 + +[[registration_stage_params]] +stage_id = "affine" +[registration_stage_params.elastix_parameters] +Registration = "MultiResolutionRegistration" +NumberOfResolutions = 4 +Transform = "AffineTransform" +Metric = "AdvancedMattesMutualInformation" +MaximumNumberOfIterations = 500 +NumberOfSpatialSamples = 20000 + + +##################### Deformable registration stages +[[registration_stage_params]] +stage_id = "deformable_128" +[registration_stage_params.elastix_parameters] +Registration = "MultiResolutionRegistration" +NumberOfResolutions = 1 +NumberOfSpatialSamples = 200000 +MaximumStepLength = 3.0 +NumberOfGradientMeasurements = 10 +NumberOfSamplesForExactGradient = 20000 +NumberOfJacobianMeasurements = 4000 +MaximumNumberOfIterations = 250 +AutomaticParameterEstimation = "true" +UseAdaptiveStepSizes = "true" +ASGDParameterEstimationMethod = "DisplacementDistribution" +Transform = "BSplineTransform" +Metric = "AdvancedMattesMutualInformation" +FinalGridSpacingInVoxels = 128 + + +[[registration_stage_params]] +stage_id = "deformable_64" +inherit_elx_params = "deformable_128" +[registration_stage_params.elastix_parameters] +FinalGridSpacingInVoxels = 64 + + + +[[registration_stage_params]] +stage_id = "deformable_32" +inherit_elx_params = "deformable_128" +[registration_stage_params.elastix_parameters] +MaximumStepLength = 2.0 +FinalGridSpacingInVoxels = 32 + + +[[registration_stage_params]] +stage_id = "deformable_16" +inherit_elx_params = "deformable_32" 
+[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +Metric = [ "AdvancedMattesMutualInformation", "TransformBendingEnergyPenalty",] +Registration = "MultiMetricMultiResolutionRegistration" +FinalGridSpacingInVoxels = 16 +MaximumStepLength = 1.0 +Metric0Weight = 1.0 +Metric1Weight = 50 + + +[[registration_stage_params]] +stage_id = "deformable_8" +inherit_elx_params = "deformable_16" +[registration_stage_params.elastix_parameters] +MaximumStepLength = 1.0 +FinalGridSpacingInVoxels = 8 + + +######## End deforamble registration + + +[global_elastix_params] +FixedInternalImagePixelType = "float" +MovingInternalImagePixelType = "float" +FixedImageDimension = 3 +MovingImageDimension = 3 +UseDirectionCosines = "true" +FixedImagePyramid = "FixedSmoothingImagePyramid" +MovingImagePyramid = "MovingSmoothingImagePyramid" +ResultImagePixelType = "short" +ResultImageFormat = "nrrd" +CompressResultImage = "true" +Interpolator = "BSplineInterpolator" +ResampleInterpolator = "FinalBSplineInterpolator" +Resampler = "DefaultResampler" +NumberOfHistogramBins = 32 +HowToCombineTransforms = "Compose" +NewSamplesEveryIteration = "true" +ImageSampler = "RandomCoordinate" +FinalBSplineInterpolationOrder = 3 +BSplineInterpolationOrder = 3 +DefaultPixelValue = 0 +WriteTransformParametersEachIteration = "false" +WriteTransformParametersEachResolution = "true" +WriteResultImage = "true" +WriteResultImageAfterEachResolution = "true" +AutomaticScalesEstimation = "true" +AutomaticTransformInitialization = "true" +Optimizer = "AdaptiveStochasticGradientDescent" +UseRandomSampleRegion = "false" diff --git a/lama/example_configs/arkell_generate_data.toml b/lama/example_configs/arkell_generate_data.toml new file mode 100644 index 00000000..ab70f44e --- /dev/null +++ b/lama/example_configs/arkell_generate_data.toml @@ -0,0 +1,167 @@ +###This dumb config is the average of the configs +target_folder = "../target" +threads = 96 +filetype = "nrrd" +fixed_volume = "210602_C3H_avg_n18.nrrd" +fixed_mask = "fixed_mask.nrrd" +stats_mask = "stats_mask.nrrd" +label_map = "210713_C3H_atlas_n18.nrrd" +label_info = "E14_5_atlas_v24_43_label_info.csv" +generate_new_target_each_stage = false +skip_transform_inversion = false +staging = "embryo_volume" +label_propagation = 'reverse_registration' +skip_forward_registration = false +fix_folding = true + +[generate_deformation_fields] +160_to_6 = ["deformable_160", "deformable_80", "deformable_40", "deformable_20", "deformable_12", "deformable_8", "deformable_6"] + + +[[registration_stage_params]] +stage_id = "rigid" + +[registration_stage_params.elastix_parameters] +Metric = "AdvancedNormalizedCorrelation" +Registration = "MultiResolutionRegistration" +MaximumNumberOfIterations = 400 +NumberOfResolutions = 2 +NumberOfSpatialSamples = 100000 +Transform = "EulerTransform" +SP_a = [ 1000.0, 1000.0, 500.0, 500.0,] +SP_alpha = 0.602 +SP_A = 50.0 +FixedLimitRangeRatio = 0.0 +MovingLimitRangeRatio = 0.0 +FixedKernelBSplineOrder = 1 +MovingKernelBSplineOrder = 3 +UseDifferentiableOverlap = "false" +[[registration_stage_params]] +stage_id = "affine" + +[registration_stage_params.elastix_parameters] +Registration = "MultiResolutionRegistration" +NumberOfResolutions = 2 +Transform = "AffineTransform" +Metric = "AdvancedNormalizedCorrelation" +MaximumNumberOfIterations = 500 +NumberOfSpatialSamples = 1000000 + +### def1 +[[registration_stage_params]] +stage_id = "deformable_160" +[registration_stage_params.elastix_parameters] +Registration = "MultiResolutionRegistration" 
+NumberOfResolutions = 1 +NumberOfSpatialSamples = 20000 +MaximumStepLength = 3.0 +NumberOfGradientMeasurements = 10 +NumberOfSamplesForExactGradient = 10000 +NumberOfJacobianMeasurements = 4000 +MaximumNumberOfIterations = 1000 +AutomaticParameterEstimation = "true" +UseAdaptiveStepSizes = "true" +#ASGDParameterEstimationMethod = "DisplacementDistribution" +Transform = "BSplineTransform" +Metric = "AdvancedMattesMutualInformation" +FinalGridSpacingInVoxels = 160 +FixedImagePyramidSchedule = [6] +MovingImagePyramidSchedule = [6] + +##def3 +[[registration_stage_params]] +stage_id = "deformable_80" +inherit_elx_params = "deformable_160" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +FinalGridSpacingInVoxels = 80 +FixedImagePyramidSchedule = [5] +MovingImagePyramidSchedule = [5] + +##def4 +[[registration_stage_params]] +stage_id = "deformable_40" +inherit_elx_params = "deformable_80" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +MaximumStepLength = 2.0 +FinalGridSpacingInVoxels = 40 +FixedImagePyramidSchedule = [4] +MovingImagePyramidSchedule = [4] + +##def5 +[[registration_stage_params]] +stage_id = "deformable_20" +inherit_elx_params = "deformable_40" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +Metric = "AdvancedMattesMutualInformation" +Registration = "MultiResolutionRegistration" +FinalGridSpacingInVoxels = 20 +MaximumStepLength = 1.0 +FixedImagePyramidSchedule = [3] +MovingImagePyramidSchedule = [3] + + +##def6 +[[registration_stage_params]] +stage_id = "deformable_12" +inherit_elx_params = "deformable_20" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +MaximumStepLength = 1.0 +FinalGridSpacingInVoxels = 12 +FixedImagePyramidSchedule = [2] +MovingImagePyramidSchedule = [2] + +##def7 +[[registration_stage_params]] +stage_id = "deformable_8" +inherit_elx_params = "deformable_12" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +MaximumStepLength = 1.0 +FinalGridSpacingInVoxels = 8 +FixedImagePyramidSchedule = [1] +MovingImagePyramidSchedule = [1] + +##def8 +[[registration_stage_params]] +stage_id = "deformable_6" +inherit_elx_params = "deformable_12" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +MaximumStepLength = 0.8 +FinalGridSpacingInVoxels = 6 +FixedImagePyramidSchedule = [1] +MovingImagePyramidSchedule = [1] + +[global_elastix_params] +FixedInternalImagePixelType = "float" +MovingInternalImagePixelType = "float" +FixedImageDimension = 3 +MovingImageDimension = 3 +UseDirectionCosines = "true" +FixedImagePyramid = "FixedSmoothingImagePyramid" +MovingImagePyramid = "MovingSmoothingImagePyramid" +ResultImagePixelType = "float" +ResultImageFormat = "nrrd" +CompressResultImage = "true" +Interpolator = "BSplineInterpolator" +ResampleInterpolator = "FinalBSplineInterpolator" +Resampler = "DefaultResampler" +NumberOfHistogramBins = 32 +HowToCombineTransforms = "Compose" +NewSamplesEveryIteration = "true" +ImageSampler = "RandomCoordinate" +FinalBSplineInterpolationOrder = 3 +BSplineInterpolationOrder = 3 +DefaultPixelValue = 0 +WriteTransformParametersEachIteration = "false" +WriteResultImage = "false" +WriteResultImageAfterEachResolution = "false" +AutomaticScalesEstimation = "true" +AutomaticTransformInitialization = "true" +Optimizer = "AdaptiveStochasticGradientDescent" +UseRandomSampleRegion = "true" +MaximumNumberOfSamplingAttempts = 10 diff --git a/lama/example_configs/arkell_stats.toml 
b/lama/example_configs/arkell_stats.toml new file mode 100644 index 00000000..582e9b73 --- /dev/null +++ b/lama/example_configs/arkell_stats.toml @@ -0,0 +1,29 @@ +# This is the new (281118) stats config for the standard stats pipeline + +stats_types = [ +'intensity', +'jacobians', +'organ_volumes' +] + +# This is the final folder in the registration sheme +reg_folder = 'deformable_6' +# The final Jacobian determinat folder +jac_folder = '160_to_6' + +# Tight mask for restricting the analysis to +mask = 'stats_mask.nrrd' +label_info = 'E14_5_atlas_v24_43_label_info.csv' +label_map = '210713_C3H_atlas_n18.nrrd' +blur_fwhm = 100 +voxel_size = 40.0 #this may need changing +invert_stats = false + +# Linearly normalise intensity data to th mean intensity withing the mask +normalise = 'histogram' + +# Have whole embryo volume in the linear model to account for developmental substage +use_staging = true + +# Enable Two-way study for interaction effects +two_way = true diff --git a/lama/img_processing/dicom_to_nrrd.sh b/lama/img_processing/dicom_to_nrrd.sh deleted file mode 100644 index c6f1672b..00000000 --- a/lama/img_processing/dicom_to_nrrd.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -#Bash script to convert DICOM files to NRRD format in batches. -#Currenty converts DICOMs to 16-bit utype (and as such may need modifiying). - -#Dependencies: -# Slicer -# dcm2niix module for Slicer (path to executable module depends on the host) - - -#TODO: add genotype detection by identifying 'wt' in the parent directory -#(uCT scans are performed before genotyping to reduce bias and therefore -#a method of simplified labelling post scanning will be required) - -#TODO: double check headers are compatible with LAMA. - -#Make directory -mkdir nrrd_out - -#loops through folders -for directory in */; -do - #Go within the specific DICOM directory: - dir_name=${directory%/*// /_} - cd ${dir_name} - - #Error trap for spaces in dicom filenames from previous PhD student - for f in *\ *; do mv "$f" "${f// /_}"; done - - #Perform the conversion - #TODO: find code to identify where dcm2niix is located. - cd ../ - /home/minc/.config/NA-MIC/Extensions-28257/SlicerDcm2nii/lib/Slicer-4.10/qt-scripted-modules/Resources/bin/dcm2niix -1 -d 0 -f ${dir_name%/} -o nrrd_out -e y -z n ${dir_name} -done - - - - - - - - - - - - - - - - - - - - diff --git a/lama/img_processing/full_img_processor.sh b/lama/img_processing/full_img_processor.sh new file mode 100644 index 00000000..6a9c047c --- /dev/null +++ b/lama/img_processing/full_img_processor.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +#DICOM to NRRD file bash script + +#TODO: add genotype detection by identifying 'wt' in the parent directory +#(uCT scans are performed before genotyping to reduce bias and therefore +#a method of simplified labelling post scanning will be required) +#May be able to intergrate code from tiff_to_minc.sh + +# get the latest dcm to niix and unzip - ToDO: decide whether to just download the zip or proper install... + +curl -fLO https://github.com/rordenlab/dcm2niix/releases/latest/download/dcm2niix_lnx.zip +unzip dcm2niix_lnx.zip + +#loops through folders +mkdir nrrd_out +for directory in */; +do + #Go within the specific DICOM directory: + dir_name=${directory%/*// /_} + cd ${dir_name} + + #Error trap for spaces in dicom filenames from previous PhD student + for f in *\ *; do mv "$f" "${f// /_}"; done + + #Make directory and perform the conversion + cd ../ + + #TODO: check if -l o/n is better!!!! 
+ ./dcm2niix -1 -d 0 -f ${dir_name%/} -o nrrd_out -e y -z n ${dir_name} + + #Do some basic clean-up - we don't care about .json files + rm nrrd_out/*.json +done + + +# img processor.sh +mv cropper.py nrrd_out/ + + +cd nrrd_out + +mkdir cropped masked + +python3 cropper.py + +# 16-bit to 8-bit conversion + +mkdir converted + +lama_convert_16_to_8 -i cropped -o converted + +# flipping scans to matc the population average + +mv ../flipper.py converted/ + +cd converted + +python3 flipper.py + +# padding +cd ../ + +mkdir needs_padding +mkdir padded +cp -r ../target needs_padding/ + +mv converted needs_padding/ + +lama_pad_volumes -i needs_padding -o padded + +mv padded ../ + + +cd padded + +# make folders +for cond in baseline, mutants, treatment, mut_treatment; +do + mkdir ${cond} + mkdir ${cond}/inputs + mkdir ${cond}/inputs/${cond} +done + +#autosort folders +for f in *; +do + # baselines should always be C3H + # will onnly use hets for Zic2 + if grep -q "wt" <<< "$f" && grep -q "C3H"; then + mv f baseline/inputs/baseline/ + elif grep -q "het" <<< "$f" && grep -q "C3H"; then + mv f mutants/inputs/mutants/ + elif grep -q "wt" <<< "$f" ; then + mv f teatment/inputs/treatment/ + else + mv f mut_treat/inputs/mut_treat/ + fi +done + + + + + + + + + + + + + + + + + + diff --git a/lama/img_processing/normalise.py b/lama/img_processing/normalise.py index a2b0e7dc..db5edd43 100755 --- a/lama/img_processing/normalise.py +++ b/lama/img_processing/normalise.py @@ -10,10 +10,11 @@ from logzero import logger as logging import numpy as np - +import SimpleITK as sitk from lama.paths import specimen_iterator from lama import common from lama.registration_pipeline.validate_config import LamaConfig +from scipy import ndimage try: from skimage.draw import line_aa @@ -31,16 +32,18 @@ def __init__(self): def factory(type_, data_type: str): if data_type == 'intensity': - # If passing an ROI as as list - if isinstance(type_ ,(list, )): # Not working at the moment + if isinstance(type_, (list,)): # Not working at the moment if len(type_) != 3: return None - return None # RoiNormalise + return None # RoiNormalise elif type_ == 'mask': return IntensityMaskNormalise() - + elif type_ == 'histogram': + return IntensityHistogramMatch() + elif type_ == 'N4biascorrection': + return IntensityN4Normalise() else: return None @@ -59,8 +62,8 @@ def memorymap_data(self, lama_root_dir: Path) -> Dict[str, np.memmap]: imgs = OrderedDict() for line_dir, spec_dir in specimen_iterator(lama_root_dir): - config_file = common.getfile_endswith('.toml') # Get the Lama config from the specimen directory - config = LamaConfig(config_file ) + config_file = common.getfile_endswith('.toml') # Get the Lama config from the specimen directory + config = LamaConfig(config_file) reg_dir = config['root_reg_dir'] basename = os.path.basename(imgpath) loader = common.LoadImage(imgpath) @@ -88,15 +91,324 @@ def normalise(self) -> np.ndarray: raise NotImplementedError +class NonRegMaskNormalise(Normaliser): + """ + Normalise a set of volumes to the mean of voxel included in a mask. + In this case each volume needs its mask + as its not deformed. 
+ + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, *kwargs) + self.reference_mean = None + + def get_all_wt_vols_and_masks(self, _dir): + baseline_dir = Path(_dir).parent / "baseline" + vol_paths = [path for path in common.get_file_paths(baseline_dir) if "rigid" in str(path)] + mask_paths = [path for path in common.get_file_paths(baseline_dir) if "inverted_stats_mask" in str(path)] + + vol_paths.sort(key=lambda x: os.path.basename(x)) + mask_paths.sort(key=lambda x: os.path.basename(x)) + + vols = [common.LoadImage(_path).img for _path in vol_paths] + masks = [common.LoadImage(_path).img for _path in mask_paths] + + return vols, masks + + + + def gen_otsu_masks(self, volumes: List[np.ndarray], file_names: List[Path]=None): + ''' + Creates an otsu for each scan + Parameters + ---------- + volumes - list of volumes + + Returns + ------- + + ''' + logging.info("Creating_otsu_masks") + o_masks = [None]* len(volumes) + if ~isinstance(volumes, list): + # stops code from breaking in radiomics runner + volumes = [volumes] + for i, vol in enumerate(volumes): + + Otsu = sitk.OtsuThresholdImageFilter() + + inv_mask = Otsu.Execute(vol) + o_mask = sitk.InvertIntensity(inv_mask, 1) + + o_mask = sitk.ConnectedComponent(o_mask != o_mask[0, 0, 0]) + + # sitk.WriteImage(seg, os.path.join(output, name + "_all_connected.nrrd")) + o_mask = sitk.RelabelComponent(o_mask) + o_mask = o_mask == 1 + # sitk.WriteImage(seg, os.path.join(output, name + "_largest_connected.nrrd")) + + # lets see if dilate with a tight kernal fixes getting stupid dots everywhere. + dilate = sitk.BinaryDilateImageFilter() + dilate.SetKernelRadius([1, 1, 1]) + dilate.SetKernelType(sitk.sitkBall) + o_masks[i] = dilate.Execute(o_mask) + o_masks[i].CopyInformation(vol) + + return o_masks + + def add_reference(self, ref: sitk.SimpleITK.Image, ref_mask: sitk.SimpleITK.Image): + """ + Add the + + Parameters + ---------- + + Returns + ------- + """ + logging.info('normalising intensity data to mean of the mask') + + # so when we add the reference, we're not storing the image + # so we can slice it to make computation time quicker + means = [] + def do_norm(vol, ref_mask): + img = sitk.GetArrayFromImage(vol) + mask = sitk.GetArrayFromImage(ref_mask) + + s = ndimage.find_objects(mask)[0] + + mask = mask[s[0].start:s[0].stop, + s[1].start:s[1].stop, + s[2].start:s[2].stop] + img = img[s[0].start:s[0].stop, + s[1].start:s[1].stop, + s[2].start:s[2].stop] + + # test if this improves speed + + # ignore vals outside of mask + img = img[mask == 1] + + means.append(np.mean(img)) + + if isinstance(ref, list): + for i, vol in enumerate(ref): + do_norm(vol, ref_mask[i]) + + else: + do_norm(ref, ref_mask) + + self.reference_mean = np.mean(means) + + def normalise(self, volumes: List[np.ndarray], masks: List[np.ndarray], + fold: bool = False, temp_dir: Path = None): + """ + given paths to registered images, apply linear normalisation so that the mean of the roi across all images are + the same. 
+ + Create new diretories and place the normalised images in + + Parameters + ---------- + volumes : list of imgs + masks: list of masks + fold : performs fold difference if true + + Returns + ------- + None + Data is normalised in-place + """ + + logging.info('Normalising images to mask') + + for i, vol in enumerate(volumes): + if isinstance(vol, sitk.SimpleITK.Image): + img_a = sitk.GetArrayFromImage(vol) + else: + img_a = sitk.GetArrayFromImage(vol.img) + if isinstance(masks[i], sitk.SimpleITK.Image): + mask_a = sitk.GetArrayFromImage(masks[i]) + else: + mask_a = sitk.GetArrayFromImage(masks[i].img) + t = tempfile.TemporaryFile(dir=temp_dir) + img_a = img_a[mask_a == 1] + arr_for_mean = np.memmap(t, dtype=img_a.dtype, mode='w+', shape=img_a.shape) + arr_for_mean[:] = img_a + + try: + # get all values inside mask to calculate mean + # self.reference_mean = np.mean(img) why is this here anyway + if fold: + # this looks stupid but it stops division by zeroes + multi = sitk.MultiplyImageFilter() + vol = multi.Execute(vol, self.reference_mean) + divis = sitk.DivideImageFilter() + volumes[i] = divis.Execute(vol, np.mean(arr_for_mean)) + #arr = fold_difference * arr # imagarr = 16bit meandiff = 64bit + #tmp = sitk.GetImageFromArray(arr) + #tmp.CopyInformation(vol) + #volumes[i] = tmp + else: + mean_difference = np.mean(arr_for_mean) - self.reference_mean + subtract = sitk.SubtractImageFilter() + volumes[i] = subtract.Execute(vol, float(mean_difference)) + + except TypeError: # Could be caused by imgarr being a short + # fold difference should not be here + mean_difference = np.mean(arr_for_mean) - self.reference_mean + img_a -= int(np.round(mean_difference)) + tmp = sitk.GetImageFromArray(img_a) + tmp.CopyInformation(vol) + volumes[i] = tmp + + +class IntensityHistogramMatch(Normaliser): + """ + Normalise a set of volumes to the mean of voxel included in a mask. + In this case each volume needs its mask + as its not deformed. + + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, *kwargs) + + #try: + # ref_vol_path = Path(config.config_dir / config['reference_vol']) + # self.ref_vol = common.LoadImage(ref_vol_path) + #except KeyError: + # self.ref_vol = None + + def normalise(self, volumes: List[np.ndarray], ref_vol: np.ndarray = None): + """ + Normalises via bin matching to a reference image. + ThresholdAtMeanIntensityOn() makes + + Parameters + ---------- + + Returns + ------- + None + Data is normalised in-place + """ + + logging.info('Using Histogram Matching') + + # Get the Population average as the ref vol if not provided. + #ref_vol = self.ref_vol if self.ref_vol else ref_vol + + # Only need to load the ref volume once + matcher = sitk.HistogramMatchingImageFilter() + matcher.SetNumberOfHistogramLevels(65536) + matcher.SetNumberOfMatchPoints(20000) + #matcher.SetThresholdAtMeanIntensity(True) + + for i, img in enumerate(volumes): + try: + volumes[i] = matcher.Execute(img, ref_vol) + except RuntimeError: # needs casting + img = sitk.Cast(img, sitk.sitkFloat32) + ref_vol = sitk.Cast(ref_vol, sitk.sitkFloat32) + volumes[i] = matcher.Execute(img, ref_vol) + + +class IntensityN4Normalise(Normaliser): + """ + Use N4 normalisation to normalise images - needs to have a mask specified or else all values > 1 + are masked. 
+ + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, *kwargs) + + #try: + # ref_vol_path = Path(config.config_dir / config['reference_vol']) + # self.ref_vol = common.LoadImage(ref_vol_path) + #except KeyError: + # self.ref_vol = None + def gen_otsu_masks(self, vol: List[np.ndarray], file_names: List[Path]=None): + ''' + Creates an otsu for each scan + Parameters + ---------- + volumes - list of volumes + + Returns + ------- + + ''' + logging.info("Creating_otsu_masks") + + Otsu = sitk.OtsuThresholdImageFilter() + + inv_mask = Otsu.Execute(vol) + o_mask = sitk.InvertIntensity(inv_mask, 1) + + o_mask = sitk.ConnectedComponent(o_mask != o_mask[0, 0, 0]) + + # sitk.WriteImage(seg, os.path.join(output, name + "_all_connected.nrrd")) + o_mask = sitk.RelabelComponent(o_mask) + o_mask = o_mask == 1 + # sitk.WriteImage(seg, os.path.join(output, name + "_largest_connected.nrrd")) + + # lets see if dilate with a tight kernal fixes getting stupid dots everywhere. + dilate = sitk.BinaryDilateImageFilter() + dilate.SetKernelRadius([1, 1, 1]) + dilate.SetKernelType(sitk.sitkBall) + o_mask = dilate.Execute(o_mask) + o_mask.CopyInformation(vol) + + #o_dir = file_names[0].parent.parent / "otsu_thresholds" + #os.makedirs(o_dir, exist_ok=True) + #sitk.WriteImage(o_mask, str(Path(o_dir) / os.path.basename(file_names[i]))) + return o_mask + + def normalise(self, img: List[np.ndarray], mask=List[np.ndarray]): + """ + Normalises via bin matching to a reference image. + ThresholdAtMeanIntensityOn() makes + + Parameters + ---------- + + Returns + ------- + None + Data is normalised in-place + """ + + logging.info('Using N4 bias correction') + + # downsample images + downsampler = sitk.ShrinkImageFilter() + + down_sampled_img = downsampler.Execute(img) + + down_sampled_mask = downsampler.Execute(mask) + + N4 = sitk.N4BiasFieldCorrectionImageFilter() + + N4_vol = N4.Execute(down_sampled_img, down_sampled_mask) + + log_bias_field = N4.GetLogBiasFieldAsImage(img) + img = img / sitk.Exp(log_bias_field) + sitk.WriteImage(img, "E:/220607_two_way/radiomics_output/test_b4.nrrd") + + + class IntensityMaskNormalise(Normaliser): """ Normalise a set of volumes to the mean of voxe included in a mask. 
""" + def __init__(self, *args, **kwargs): super().__init__(*args, *kwargs) self.reference_mean = None - def add_reference(self, ref: np.ndarray): """ Add the @@ -136,10 +448,9 @@ def normalise(self, volumes: List[np.ndarray]): vol -= int(np.round(mean_difference)) - if __name__ == '__main__': - import argparse + raise SystemExit('This CLI interafce needs updating') parser = argparse.ArgumentParser() diff --git a/lama/img_processing/organ_vol_calculation.py b/lama/img_processing/organ_vol_calculation.py index 1994a0dd..183c7c86 100755 --- a/lama/img_processing/organ_vol_calculation.py +++ b/lama/img_processing/organ_vol_calculation.py @@ -65,12 +65,10 @@ def _get_label_sizes(paths: List[Path]) ->pd.DataFrame: """ label_volumes = addict.Dict() - to_do = len(paths) n = 1 for label_path in paths: - print(("{} of {}".format(n, to_do))) n += 1 # Get the name of the volume volname = os.path.split(split(label_path)[0])[1] diff --git a/lama/img_processing/read_minc.py b/lama/img_processing/read_minc.py index 1a3e1742..5efe6117 100755 --- a/lama/img_processing/read_minc.py +++ b/lama/img_processing/read_minc.py @@ -19,7 +19,8 @@ import numpy as np import re from tempfile import NamedTemporaryFile - +from lama import common +from pathlib import Path minc_dtypes = {'unsigned': {'byte': np.uint8, 'short': np.uint16, 'float': np.float32}, 'signed': {'byte': np.int8, 'short': np.int16, 'float': np.float32}} diff --git a/lama/lama_radiomics/__init__.py b/lama/lama_radiomics/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/lama/lama_radiomics/__init__.py @@ -0,0 +1 @@ + diff --git a/lama/lama_radiomics/feature_reduction.py b/lama/lama_radiomics/feature_reduction.py new file mode 100644 index 00000000..30f6ab98 --- /dev/null +++ b/lama/lama_radiomics/feature_reduction.py @@ -0,0 +1,418 @@ +from logzero import logger as logging +import os +from catboost import CatBoostClassifier, Pool, sum_models, cv +import matplotlib.pyplot as plt +# import time +import shap +# from mlxtend.feature_selection import SequentialFeatureSelector as SFS +# import pickle +# from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs + +# from sklearn.ensemble import RandomForestClassifier +import numpy as np +# from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel +# from sklearn.model_selection import LeaveOneOut +from pathlib import Path +# from itertools import product +import pandas as pd +from sklearn.model_selection import train_test_split, GridSearchCV +# from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, f1_score, recall_score, matthews_corrcoef, make_scorer +from sklearn.pipeline import Pipeline +from imblearn.over_sampling import SMOTE +from imblearn.under_sampling import OneSidedSelection +from imblearn.pipeline import Pipeline +# import statistics +import seaborn as sns +from collections import Counter + +from sklearn.model_selection import KFold + + + +def correlation(dataset: pd.DataFrame, _dir: Path = None, threshold: float = 0.9, org=None): + """ + identifies correlated features in a + + Parameters + ---------- + dataset: pandas dataframem + """ + if org: + _dir = _dir / str(org) + os.makedirs(_dir, exist_ok=True) + + col_corr = set() # Set of all the names of correlated columns + corr_matrix = dataset.corr(method="spearman") + + logging.info("saving corr matrix at {}".format(_dir)) + fig, ax = plt.subplots(figsize=[50, 50]) + # cm = sns.diverging_palette(250, 15, s=100, as_cmap=True) + sns.heatmap(corr_matrix, ax=ax, 
+ cbar_kws={'label': "Absolute value of Spearman's correlation"}, + square=True) + ax.figure.axes[-1].yaxis.label.set_size(22) + cbar = ax.collections[0].colorbar + cbar.ax.tick_params(labelsize=20) + plt.tight_layout() + plt.xticks(fontsize=12) + plt.yticks(fontsize=12) + + plt.savefig(str(_dir) + "/corr_matix.png") + plt.close() + # do the removal + for i in range(len(corr_matrix.columns)): + for j in range(i): + if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value + colname = corr_matrix.columns[i] # getting the name of column + col_corr.add(colname) + return col_corr + + +def shap_feature_ranking(data, shap_values, columns=[]): + """ + From stack-overflow, return columns + """ + if not columns: columns = data.columns.tolist() # If columns are not given, take all columns + + c_idxs = [] + for column in columns: c_idxs.append( + data.columns.get_loc(column)) # Get column locations for desired columns in given dataframe + if isinstance(shap_values, list): # If shap values is a list of arrays (i.e., several classes) + means = [np.abs(shap_values[class_][:, c_idxs]).mean(axis=0) for class_ in + range(len(shap_values))] # Compute mean shap values per class + shap_means = np.sum(np.column_stack(means), 1) # Sum of shap values over all classes + + else: # Else there is only one 2D array of shap values + assert len(shap_values.shape) == 2, 'Expected two-dimensional shap values array.' + shap_means = np.abs(shap_values).mean(axis=0) + + # Put into dataframe along with columns and sort by shap_means, reset index to get ranking + df_ranking = pd.DataFrame({'feature': columns, 'mean_shap_value': shap_means}).sort_values(by='mean_shap_value',ascending=False).reset_index( + drop=True) + df_ranking.index += 1 + return df_ranking + + +def shap_feat_select(X, shap_importance, _dir, n_feats: list, cut_off: float = -1, n_feat_cutoff: float = None, org: int = None): + """ + + """ + # m = RandomForestClassifier(n_jobs=-1, n_estimators=100, verbose=0, oob_score=True) + + org_dir = _dir / str(org) + + os.makedirs(org_dir, exist_ok=True) + + # print("plotting intrinsic RF rank") + # importances = m.feature_importances_ + # indices = np.argsort(importances) + # features = X.columns + # plt.title('Feature Importances') + # plt.figure(figsize=(15,200)) + # plt.rc('ytick', labelsize=6) + # plt.barh(range(len(indices)), importances[indices], color='b', align='center') + # plt.yticks(range(len(indices)), [features[i] for i in indices]) + # plt.xlabel('Relative Importance') + # plt.tight_layout() + # plt.savefig(str(cut_off_dir)+"/rf_rank.png") + # plt.close() + + # explainer = shap.KernelExplainer(m.predict, X, verbose=False) + + # Get the top N featues (n_feat_cutoff and plot) + if n_feat_cutoff: + shap_importance = shap_importance[0:n_feat_cutoff] + X = X[shap_importance['feature']] + return X + + # Using cut off value, select all with shap value above > cut-off and plot + if cut_off >= 0: + shap_importance = shap_importance[shap_importance['mean_shap_value'] > cut_off] + X = X[shap_importance['feature']] + + # make sure you're not adding duplicates + if X.shape[1] not in n_feats: + n_feats.append(X.shape[1]) + return X + + +def smote_oversampling(X, k: int = 6, max_non_targets: int = 300): + # gets the ratio of target to baseline + non_targets = Counter(X.index)[0] + targets = Counter(X.index)[1] + obs_ratio = targets / non_targets + logging.info("Original ratio of targets : non-targets = {}".format(obs_ratio)) + if (non_targets > max_non_targets) & (obs_ratio < 0.2): + 
required_ratio = 150 / non_targets + logging.info("Undersampling to 150 targets to improve SHAP speed, followed by oversampling") + steps = [('u', OneSidedSelection(sampling_strategy=required_ratio)), + ('o', SMOTE(n_jobs=-1, k_neighbors=k - 1))] + pipeline = Pipeline(steps=steps) + x_train_std_os, y_train_os = pipeline.fit_resample(X, X.index) + elif 0.9 <= obs_ratio <= 1.1: + logging.info("dataset is relatively balanced, returning original data") + x_train_std_os = X + y_train_os = X.index + else: + sm = SMOTE(n_jobs=-1, k_neighbors=k - 1) + x_train_std_os, y_train_os = sm.fit_resample(X, X.index) + + x_train_std_os.set_index(y_train_os, inplace=True) + logging.info("Rebalanced dataset: {}".format(Counter(x_train_std_os.index))) + + return x_train_std_os + + + + +def run_feat_red(X, org, rad_file_path, batch_test=None, complete_dataset: pd.DataFrame = None, test_size: float = 0.2): + logging.info("Doing org: {}".format(org)) + + + logging.info("Starting") + + # X = X[X['org']== org] + + if org: + #X['condition']= X['genotype'] + "_" + X['background'] + + #X['condition'] = X['condition'].map({'WT_C57BL6N': 0,'WT_C3HHEH': 0, 'HET_C3HHEH': 0,'HET_C57BL6N': 1, 'WT_F1': 0, 'HET_F1': 0}) + + #X.set_index('condition', inplace=True) + + X['HPE'] = X['HPE'].map({'normal': 0, 'abnormal': 1}).astype(int) + X.set_index('HPE', inplace=True) + #X = X[X['background'] == 'C3HHEH'] + #X['genotype'] = X['genotype'].map({'WT': 0, 'HET': 1}).astype(int) + #X.set_index('genotype', inplace=True) + + elif batch_test: + X = X[(X['Age'] == 'D14') & (X['Tumour_Model'] == '4T1R')] + X['Exp'] = X['Exp'].map({'MPTLVo4': 0, 'MPTLVo7': 1}) + X.set_index('Exp', inplace=True) + X.drop(['Date', 'Animal_No.'], axis=1, inplace=True) + + else: + logging.info("Tumour Time!") + X['Tumour_Model'] = X['Tumour_Model'].map({'4T1R': 0, 'CT26R': 1}).astype(int) + X.set_index('Tumour_Model', inplace=True) + X.drop(['Date', 'Animal_No.'], axis=1, inplace=True) + + X = X.select_dtypes(include=np.number) + + #do the same stuff for the complete dataset + + + # lets remove correlated variables + + corr_feats = correlation(X, rad_file_path.parent, 0.9, org=org) + + logging.info('{}: {}'.format("Number of features removed due to correlation", len(set(corr_feats)))) + + X.drop(corr_feats, axis=1, inplace=True) + + # clone X for final test + X_to_test = X + + + org_dir = rad_file_path.parent / str(org) + + + + # balancing clsses via SMOTE + logging.info("oversampling via smote") + + n_test = X[X.index == 1].shape[0] + + X = smote_oversampling(X, n_test) if n_test < 5 else smote_oversampling(X) + + logging.info("fitting model to training data") + m = CatBoostClassifier(iterations=1000, task_type='GPU', verbose=250, train_dir=org_dir) + m.fit(X, X.index.to_numpy()) + logging.info("doing feature selection using SHAP") + + shap_values = m.get_feature_importance(Pool(X, X.index.to_numpy()), type='ShapValues', )[:, :-1] + + shap_importance = shap_feature_ranking(X, shap_values) + + + if org: + n_feats = [] + shap_cut_offs = list(np.arange(0.000, 2.5, 0.025)) + full_X = [shap_feat_select(X, shap_importance,rad_file_path.parent, n_feats=n_feats, cut_off=cut_off, org=org) for cut_off in + shap_cut_offs] + full_X = [X for X in full_X if X is not None] + full_X = [X for X in full_X if (X.shape[1] > 0 & X.shape[1] < 200)] + else: + n_feats = list(np.arange(1, 29, 1)) + full_X = [shap_feat_select(X, shap_importance,rad_file_path.parent, n_feats=n, n_feat_cutoff=n, org=org) for n in n_feats] + + + # should be a better way but she'll do + + n_feats = 
[X.shape[1] for X in full_X] + + + # So the best way of doing this is just by limiting n_feats in max display + for i, n in enumerate(n_feats): + os.makedirs(org_dir / str(n), exist_ok=True) + shap.summary_plot(shap_values, X, show=False, max_display=n) + + plt.tight_layout() + + plt.savefig(str(org_dir / str(n)) + "/shap_feat_rank_plot.png") + + plt.close() + + + + + logging.info("n_feats: {}".format(n_feats)) + + for i, x in enumerate(full_X): + # make different models for different feature nums + models = [] + model_dir = org_dir / str(x.shape[1]) + os.makedirs(model_dir, exist_ok=True) + + # so lets do some cross validation first, as its sampling the data and doesn't care about partitioning + all_x = Pool(data=x, label=x.index.to_numpy()) + + # create a CPU or GPU model + m = CatBoostClassifier(iterations=1000, task_type="CPU", loss_function='Logloss', train_dir=str(model_dir), + custom_loss=['AUC', 'Accuracy', 'Precision', 'F1', 'Recall'], + verbose=500) + + m2 = CatBoostClassifier(iterations=1000, task_type="GPU", train_dir=str(model_dir), + custom_loss=['Accuracy', 'Precision', 'F1', 'Recall'], + verbose=500) + + # optimise via grid search + params = { + 'depth': [4, 6, 10], + 'l2_leaf_reg': [3, 5, 7], + } + + #m.grid_search(params, all_x, cv=30, verbose=1000) + + + #logging.info("grid search: Number of trees {}, best_scores {}".format(m.tree_count_, m.get_best_score())) + + loo = KFold(n_splits=all_x.num_row(), shuffle=True, random_state=42) + + cv_data = cv(params=m.get_params(), + pool=all_x, + fold_count=int(loo.get_n_splits()/2), + shuffle=True, + stratified=True, + verbose=500, + plot=False, + as_pandas=True, + return_models=False) + + cv_filename = str(model_dir) + "/" + "cross_fold_results.csv" + + logging.info("saving cv results to {}".format(cv_filename)) + cv_data.to_csv(cv_filename) + # sample 20 different train-test partitions (train size of 0.2) and create an average model + + m_results = pd.DataFrame(columns=['branch_count', 'results']) + m2_results = pd.DataFrame(columns=['branch_count', 'results']) + + + for j in range(10): + train_dir = model_dir / str(j) + os.makedirs(train_dir, exist_ok=True) + + # now train with optimised parameters on split + # if we're doing training with reduced samples, evaluate using complete_dataset + if isinstance(complete_dataset, pd.DataFrame): + full_dataset = complete_dataset[complete_dataset.columns & X.columns] + x_train = X + y_train = X.index.to_numpy() + x_test = full_dataset + y_test = full_dataset.index.to_numpy() + train_dir = model_dir + else: + x_train, x_test, y_train, y_test = train_test_split(x, x.index.to_numpy(), test_size=test_size) + + x_train_file_name = train_dir / "x_train.csv" + + logging.info(f"Saving training dataset to {x_train_file_name}") + + pd.DataFrame(x_train).to_csv(x_train_file_name) + + train_pool = Pool(data=x_train, label=y_train) + validation_pool = Pool(data=x_test, label=y_test) + + m.fit(train_pool, eval_set=validation_pool, verbose=False) + + logging.info("Eval CPU: Number of trees {}, best_scores {}".format(m.tree_count_, m.get_best_score()['validation'])) + m_results.loc[j] = [m.tree_count_, m.get_best_score()['validation']] + # perform 30 fold cross-validation + + # tests GPU training + # TODO: remove if useless + + m2.fit(train_pool, eval_set=validation_pool, verbose=False) + logging.info("Eval GPU: Number of trees {}, best_scores {}".format(m2.tree_count_, m2.get_best_score())) + + m2_results.loc[j] = [m2.tree_count_, m2.get_best_score()['validation']] + + logging.info("Saving models") + 
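The .cbm files written out just below can be reloaded later to score new specimens. A minimal sketch, assuming a saved model file and a feature table with the same columns used for training; both file names here are hypothetical and are not produced elsewhere in this changeset:

from pathlib import Path
import pandas as pd
from catboost import CatBoostClassifier

model_path = Path("CPU_12_0.cbm")        # hypothetical file written by save_model below
feature_csv = Path("x_train.csv")        # hypothetical table with the training feature columns

clf = CatBoostClassifier()
clf.load_model(str(model_path))          # restore a classifier saved with save_model
X_new = pd.read_csv(feature_csv, index_col=0)
probs = clf.predict_proba(X_new)         # per-class probabilities for each specimen
labels = clf.predict(X_new)              # hard class predictions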
m_filename = str(rad_file_path.parent) + "/" + str(org) + "/CPU_" + str(x.shape[1]) + "_" + str(j) + ".cbm" + + m2_filename = str(rad_file_path.parent) + "/" + str(org) + "/GPU_" + str(x.shape[1]) + "_" + str(j) + ".cbm" + + m.save_model(m_filename) + m2.save_model(m2_filename) + + models.append(m) + models.append(m2) + if isinstance(complete_dataset, pd.DataFrame): + break + + + logging.info("Combining model predictions into one mega model") + + m_results.to_csv(str(rad_file_path.parent) + "/" + str(org) + "/CPU_results_" + str(x.shape[1]) + ".csv") + m_avg = sum_models(models, weights=[1.0 / len(models)] * len(models)) + + avrg_filename = str(rad_file_path.parent) + "/" + str(org) + '/GPU_results_' + str(x.shape[1]) + ".cbm" + + m_avg.save_model(avrg_filename) + + logging.info("Mega_Model: Number of trees {}, best_scores {}".format(m_avg.tree_count_, m_avg.get_best_score())) + + + + +def main(X, org, rad_file_path, batch_test=None, n_sampler: bool= False): + if n_sampler: + n_fractions = list(np.arange(0.2, 1.2, 0.2)) + + # remove comments to turn on a + # complete_dataset = X.copy() + # complete_dataset['Tumour_Model'] = complete_dataset['Tumour_Model'].map({'4T1R': 0, 'CT26R': 1}).astype(int) + # complete_dataset.set_index('Tumour_Model', inplace=True) + # complete_dataset.drop(['Date', 'Animal_No.'], axis=1, inplace=True) + # complete_dataset = complete_dataset.select_dtypes(include=np.number) + # + # sample_sizes = [np.round(X.groupby('Tumour_Model').count().to_numpy().min() * n, 0) for n in n_fractions] + + for i, n in enumerate(n_fractions): + n_dir = rad_file_path.parent / ("test_size_" + str(n)) + os.makedirs(n_dir, exist_ok=True) + + + #we just need to offer a fake file path so all files are created under n_dir + n_path = n_dir / "fake_file.csv" + + #X_sub = X.groupby('Tumour_Model').apply(lambda x: x.sample(int(n))) + #X_sub.to_csv(str(n_dir/ "sampled_dataset.csv")) + #TODO see if this needs parallelising + run_feat_red(X, org=None, rad_file_path=n_path, batch_test=batch_test, test_size = n) + + else: + run_feat_red(X, org=org, rad_file_path=rad_file_path, batch_test=batch_test) + diff --git a/lama/lama_radiomics/rad_plotting.py b/lama/lama_radiomics/rad_plotting.py new file mode 100644 index 00000000..63218c6f --- /dev/null +++ b/lama/lama_radiomics/rad_plotting.py @@ -0,0 +1,293 @@ + +from pathlib import Path + +from lama import common +import os +import seaborn as sns +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import pacmap + +def PacMapper(data, _file_name): + fig, ax = plt.subplots(figsize=[56, 60]) + # data = data[data['condition'] == 'WT_C3HHEH'] + + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=data, + # col_order=['normal', 'abnormal'], + col='condition', + col_wrap=2, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_all_cond_v2.png") + plt.close() + + + +def dimensionality_reduction_plots(_dir: Path, abnormal_embs: list=[], ): + file_names = [spec for spec in common.get_file_paths(folder=_dir, extension_tuple=".csv")] + file_names.sort() + + data = [pd.read_csv(spec, index_col=0).dropna(axis=1) for spec in file_names] + + + for i, df in enumerate(data): + df.index.name = 'org' + df.name = str(file_names[i]).split(".")[0].split("/")[-1] + df['genotype'] = 'HET' 
if 'het' in str(file_names[i]) else 'WT' + df['background'] = 'C57BL6N' if (('b6ku' in str(file_names[i])) | ('BL6' in str(file_names[i]))) else \ + 'F1' if ('F1' in str(file_names[i])) else 'C3HHEH' + + df['HPE'] = 'abnormal' if any(map(str(file_names[i]).__contains__, abnormal_embs)) else 'normal' + + data = pd.concat( + data, + ignore_index=False, keys=[os.path.splitext(os.path.basename(spec))[0] for spec in file_names], + names=['specimen', 'org']) + + line_file = _dir.parent / "full_results.csv" + + org_dir = _dir.parent / "organs" + + os.makedirs(org_dir, exist_ok=True) + print(data.columns) + + for org in data.index.get_level_values('org').unique(): + data[data.index.get_level_values('org') == org].to_csv(str(org_dir) + "/results_" + str(org) + ".csv") + + data.to_csv(line_file) + + data_subset = data.select_dtypes(include=np.number) + + data_subset = data_subset.apply(lambda x: (x - x.mean()) / x.std(), axis=0) + data_subset = data_subset.apply(lambda x: (x - x.mean()) / x.std(), axis=1) + + embedding = pacmap.PaCMAP(n_components=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0, num_iters=20000, verbose=1) + + # print(data_subset.dropna(axis='columns')) + + results = embedding.fit_transform(data_subset.dropna(axis='columns')) + + color_class = data.index.get_level_values('org') + + # fig, ax = plt.subplots(figsize=[55, 60]) + # cluster.tsneplot(score=tsne_results, show=True, theme='dark', colorlist=color_class) + + data['PaCMAP-2d-one'] = results[:, 0] + data['PaCMAP-2d-two'] = results[:, 1] + data['org'] = data.index.get_level_values('org') + data['specimen'] = data.index.get_level_values('specimen') + data['condition'] = data['genotype'] + "_" + data['background'] + + + + +def main(_dir): + abnormal_embs = ['22300_e8', '22300_e6', '50_e5'] + dimensionality_reduction_plots(_dir, abnormal_embs=abnormal_embs) + + +if __name__ == '__main__': + main() + + +# scoring = {'AUC': 'roc_auc', + # 'Accuracy': make_scorer(accuracy_score), + # 'Precision': make_scorer(precision_score), + # 'F1': make_scorer(f1_score), + # 'Recall': make_scorer(recall_score), + # 'MCC': make_scorer(matthews_corrcoef)} + + # results = [None] * len(n_feats) + # results_v2 = [None] * len(n_feats) + # X_axises = [None] * len(n_feats) + # parameters = {'iterations': list(range(200, 1000, 400))} + +# split data??? 
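Separate from the commented-out grid-search experiments kept here, this is a minimal, self-contained version of the PaCMAP embedding performed by dimensionality_reduction_plots above; full_results.csv is the table that function writes, the directory prefix is illustrative, and the double z-scoring mirrors the same steps:

import numpy as np
import pandas as pd
import pacmap

# one row per (specimen, organ), numeric radiomics features only
feats = pd.read_csv("radiomics_output/full_results.csv", index_col=[0, 1]).select_dtypes(include=np.number)

# z-score columns, then rows, as in dimensionality_reduction_plots
feats = feats.apply(lambda x: (x - x.mean()) / x.std(), axis=0)
feats = feats.apply(lambda x: (x - x.mean()) / x.std(), axis=1)

embedder = pacmap.PaCMAP(n_components=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0)
coords = embedder.fit_transform(feats.dropna(axis="columns").to_numpy())  # shape (n_samples, 2)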
+ +# gs = GridSearchCV(CatBoostClassifier(task_type="CPU"), +# param_grid=parameters, +# n_jobs=-1, +# scoring=scoring, +# cv=10, +# refit='Accuracy', verbose=0) +# gs = CatBoostClassifier(iterations=1000, task_type="CPU") +# gs_result = gs.grid_search(parameters, x, x.index.to_numpy(), plot=True) + + +# gs.fit(x, x.index.to_numpy()) +# results[i] = gs.cv_results_ +# print("iterations results: ", gs.cv_results_) + +# X_axises[i] = np.array(results[i]['param_iterations'].data, dtype=float) +# +# parameters = {'iterations': [1000]} +# +# for i, x in enumerate(full_X): +# +# gs = GridSearchCV(CatBoostClassifier(task_type="GPU"), +# param_grid=parameters, +# n_jobs=-1, +# scoring=scoring, +# cv=5, +# refit='AUC', verbose=0) +# +# gs.fit(x, x.index.to_numpy()) +# print(gs.cv_results_) +# print(gs.best_score_) +# results_v2[i] = gs.cv_results_ +# +# +# +# colours = ['k', 'b', 'c', 'm', 'r', 'y', 'g'] +# +# n_trees_lst = [] +# # so I want to make a different plot per metric +# for scorer in scoring.keys(): +# +# plt.figure(figsize=(20, 20)) +# plt.title("GridSearchCV evaluating using multiple scorers simultaneously", +# fontsize=16) +# +# plt.xlabel("Number of Trees") +# plt.ylabel("Score") +# plt.grid() +# +# ax = plt.axes() +# ax.set_xlim(0, 1000) +# ax.set_ylim(0, 1.05) +# +# for i, (x_axis, colour) in enumerate(zip(X_axises, colours)): +# +# sample_score_mean = results[i]['mean_test_%s' % (scorer)] +# print("iterations sample score", sample_score_mean) +# sample_score_std = results[i]['std_test_%s' % (scorer)] +# +# ax.fill_between(x_axis, sample_score_mean - sample_score_std, +# sample_score_mean + sample_score_std, +# alpha=0.1, color=colour) +# +# ax.plot(x_axis, sample_score_mean, '--', color=colour, +# alpha=1, +# label="%s number of feats %s" % (scorer, n_feats[i])) +# best_index = np.nonzero(results[i]['rank_test_%s' % scorer] == 1)[0][0] +# best_score = results[i]['mean_test_%s' % scorer][best_index] +# +# # Plot a dotted vertical line at the best score for that scorer marked by x +# ax.plot([x_axis[best_index], ] * 2, [0, best_score], +# linestyle='-.', color=colour, marker='x', markeredgewidth=3, ms=8) +# +# ax.annotate("%0.2f" % best_score, +# (x_axis[best_index], best_score + 0.005)) +# n_trees_lst.append(x_axis[best_index]) +# +# +# plt.legend(loc="best") +# if org: +# plt.savefig(str(org_dir) + "/" + str(scorer) + "_curve.png") +# else: +# plt.savefig(str(rad_file_path.parent) + "/" + str(scorer) + "_curve.png") +# plt.close() +# +# best_ntrees = statistics.mode(n_trees_lst) +# +# print("best_ntrees: ", best_ntrees) +# x_axis = n_feats +# +# results = pd.DataFrame(results_v2).drop(columns='params').applymap(lambda x: float(x)) +# +# plt.figure(figsize=(20, 20)) +# plt.title("GridSearchCV evaluating using multiple scorers simultaneously", +# fontsize=16) +# +# plt.xlabel("Number of Features") +# plt.ylabel("Score") +# plt.grid() +# ax = plt.axes() +# ax.set_xlim(0, 150) +# ax.set_ylim(0, 1.05) +# +# +# best_cutoff_lst = [] +# print("x_axis", x_axis) +# for scorer, colour in zip(scoring, colours): +# +# sample_score_mean = results['mean_test_%s' % (scorer)] +# +# sample_score_std = results['std_test_%s' % (scorer)] +# +# ax.fill_between(x_axis, sample_score_mean - sample_score_std, +# sample_score_mean + sample_score_std, +# alpha=0.1, color=colour) +# +# ax.plot(x_axis, sample_score_mean, '--', color=colour, +# alpha=1, +# label="%s" % (scorer)) +# best_index = np.argmax(sample_score_mean) +# +# print("sample_score_mean", sample_score_mean) +# print(best_index) +# +# best_score = 
results['mean_test_%s' % (scorer)][best_index] +# +# logging.info("best score: {} {}".format(scorer, best_score)) +# +# +# # Plot a dotted vertical line at the best score for that scorer marked by x +# ax.plot([x_axis[best_index], ] * 2, [0, best_score], +# linestyle='-.', color=colour, marker='x', markeredgewidth=3, ms=8) +# +# ax.annotate("%0.2f" % best_score, +# (x_axis[best_index], best_score + 0.005)) +# +# +# best_cutoff_lst.append(x_axis[best_index]) +# +# plt.legend(loc="best") +# if org: +# plt.savefig(str(org_dir) + "/feat_test_curve.png") +# else: +# plt.savefig(str(rad_file_path.parent) + "/feat_test_curve.png") +# plt.close() +# +# best_cut_off = statistics.mode(best_cutoff_lst) +# +# logging.info("best num of feats: {}".format(n_feats[n_feats.index(best_cut_off)])) +# +# best_X = full_X[n_feats.index(best_cut_off)] +# +# logging.info("doing final SHAP") +# X_to_tes t= shap_feat_select(X_to_test, _dir=rad_file_path.parent, n_feat_cutoff=n_feats[n_feats.index(best_cut_off)], +# org=org) +# +# logging.info("Splitting data into 80-20 split") +# x_train, x_test, y_train, y_test = train_test_split(X_to_test, X_to_test.index, test_size=0.2, random_state=0) +# +# +# fig, ax = plt.subplots(figsize=[50, 50]) +# sns.pairplot(x_train) +# +# ax.figure.axes[-1].yaxis.label.set_size(22) +# +# plt.tight_layout() +# plt.xticks(fontsize=12) +# plt.yticks(fontsize=12) +# +# plt.savefig(str(rad_file_path.parent) + "/pair_plot.png") +# plt.close() +# +# for x in range(20): +# model = CatBoostClassifier(iterations=best_ntrees, random_state=x) +# model.fit(x_train, y_train) +# model_file_name = str(org_dir / "finalised_model.sav") +# pickle.dump(model, open(model_file_name, 'wb')) +# # logging.info("final model score: {}". format(model.score(x_test, y_test))) +# logging.info("final ROC AUC score: {}".format(accuracy_score(y_test, model.predict(x_test)))) +# # logging.info("decision path: {}".format(model.decision_path(x_test))) +# # logging.info("leaf indices: {}".format(model.apply(x_test))) \ No newline at end of file diff --git a/lama/lama_radiomics/radiomics.py b/lama/lama_radiomics/radiomics.py new file mode 100644 index 00000000..92e0a070 --- /dev/null +++ b/lama/lama_radiomics/radiomics.py @@ -0,0 +1,515 @@ +import nrrd +from lama import common +import os +import numpy as np +from radiomics import featureextractor +import SimpleITK as sitk +import pandas as pd +from logzero import logger as logging +from pathlib import Path +from filelock import SoftFileLock, Timeout +import socket +from datetime import datetime +import sys +import signal +import tempfile +from lama.monitor_memory import MonitorMemory +from lama.img_processing import normalise +from scipy import ndimage +import raster_geometry as rg +import subprocess + +# test push +# lets try this +JOBFILE_NAME = 'radiomics_jobs.csv' + + +def extract_registrations(root_dir, labs_of_interest=None, outdir_name=None, norm_label=None, fnames = None, stats_mask: bool=False): + ''' + + either extracts the rigid registrations (i.e. 
the volumes) + or specific labels (if the labels are specified) + + Parameters + ---------- + root_dir + labels - label of interest + + Returns + ------- + list of either sitk images (rigid regs) or direct labels as arrays (labels) + + ''' + rad_dir = root_dir / "radiomics_output" + os.makedirs(rad_dir, exist_ok=True) + + if labs_of_interest: + # save to label folder + + outdir_string = "stage_labels" if norm_label else "stats_mask" if stats_mask else "inverted_labels" + query_string = outdir_name if outdir_name else 'inverted_stats_mask' if stats_mask else outdir_string + + outdir = rad_dir / outdir_string + os.makedirs(outdir, exist_ok=True) + + # extract the inverted labels of interest + file_paths = [spec_path for spec_path in common.get_file_paths(root_dir) if + (query_string in str(spec_path))] + + logging.info(rad_dir) + + #file_paths.sort(key=lambda x: os.path.basename(x)) + + # empty list + with tempfile.NamedTemporaryFile() as ntf: + temp_name = ntf.name + extracts = np.memmap(ntf,shape=(len(file_paths),), dtype=object) + + + for i, path in enumerate(file_paths): + # clean label_files to only contain orgs of interest + label = common.LoadImage(path).img + + label_arr = sitk.GetArrayFromImage(label) + t = tempfile.TemporaryFile() + m = np.memmap(t, dtype=label_arr.dtype, mode='w+', shape=label_arr.shape) + m[:] = label_arr + + # I think its better to just grab the single files for all orgs + # then separate the labels during radiomics calculations + if not stats_mask: + m[~np.isin(label_arr, labs_of_interest)] = 0 + extracts[i] = sitk.GetImageFromArray(m) + extracts[i].CopyInformation(label) + + else: + # extract the rigid + outdir = rad_dir / "rigids" + os.mkdir(outdir) + if outdir_name: + file_paths = [spec_path for spec_path in common.get_file_paths(root_dir) if (outdir_name in str(spec_path))] + else: + reg_paths = [spec_path for spec_path in common.get_file_paths(root_dir) if ('registrations' in str(spec_path))] + file_paths = [spec_path for spec_path in reg_paths if ('rigid' in str(spec_path))] + + + # just an easy way to load the images + extracts = [common.LoadImage(path).img for path in file_paths] + + #sort file paths + #file_paths.sort(key=lambda x: os.path.basename(x)) + # write to new_folder for job file / increase debugging speed + for i, vol in enumerate(extracts): + + file_name = str(Path(outdir / os.path.basename(file_paths[i]))) + + #print("vol.img", vol.img) + logging.info("Writing : {}". format(file_name)) + sitk.WriteImage(vol, file_name, useCompression=True) + + + return extracts, file_paths + + +def make_rad_jobs_file(jobs_file: Path, file_paths: list): + """ + Creates a joblist csv file for use with the radiomics pipeline. 
+ Searches for all images paths and creates a job file + + Parameters + ---------- + jobfile_path: Path to save job file to + root_dir: the root project directory + is_mutants: if True search the folder for individual line sub folders + + """ + # output_dir = root_dir / 'radiomics_output' + # output_dir.mkdir(exist_ok=True) + + jobs_entries = [] + # get each file path + for i, vol_path in enumerate(file_paths): + + rel_path_to_specimen_input = str(Path(vol_path).relative_to(jobs_file.parent / "rigids")) + jobs_entries.append([rel_path_to_specimen_input,'to_run', '_', '_', '_']) + + jobs_df = pd.DataFrame.from_records(jobs_entries, columns=['job', 'status', 'host', 'start_time', 'end_time']) + + jobs_df.to_csv(jobs_file) + return True + + +def denoise(images): + ''''Lets just try out a denoiser''' + + #out_dir = _dir / "Patched_denoised" + + + denoise = sitk.PatchBasedDenoisingImageFilter() + + for i, img in enumerate(images): + + + img_arr = sitk.GetArrayFromImage(img) + + img_to_denoise = sitk.GetImageFromArray(img_arr) + + #cropped_arr = img_arr[100:300, 100:300, 100:300] + + + + #cropped_img.CopyInformation(img) + logging.info("PIZZA TIME!") + images[i] = denoise.Execute(img_to_denoise).CopyInformation(img) + return images + + + +def pyr_normaliser(_dir, _normaliser, scans_imgs, masks: list = None, fold: bool = False, ref_vol_path: Path = None, stage_for_ref: bool = False): + # create a copy so orginal files aren't overwritten + + # Do the normalisation + if isinstance(_normaliser, normalise.NonRegMaskNormalise): + + # checks if a ref mean has been calculated and then creates if missing + if not _normaliser.reference_mean: + #if you passed a non-normal label for reference + if (ref_vol_path and stage_for_ref): + ref_vol = common.LoadImage(ref_vol_path).img + + ref_mask_dir = ref_vol_path.parent.parent / "stage_labels" + ref_mask_path = ref_mask_dir / os.path.basename(ref_vol_path) + ref_mask = common.LoadImage(ref_mask_path).img + + + elif (ref_vol_path): + ref_vol = common.LoadImage(ref_vol_path).img + ref_mask = _normaliser.gen_otsu_masks(ref_vol) + + else: + # this one has multiple labels and volums but its singular to cut lines of code + ref_vol, ref_mask = _normaliser.get_all_wt_vols_and_masks(_dir) + + _normaliser.add_reference(ref_vol, ref_mask) + _normaliser.normalise(scans_imgs, masks, fold=fold, temp_dir=_dir) + + elif isinstance(_normaliser, normalise.IntensityHistogramMatch): + if ref_vol_path: + logging.info(f"Using {ref_vol_path} as the reference image") + ref_vol = common.LoadImage(ref_vol_path).img + _normaliser.normalise(scans_imgs, ref_vol) + + else: + _normaliser.normalise(scans_imgs, scans_imgs[0]) + + elif isinstance(_normaliser, normalise.IntensityN4Normalise): + otsu_masks = _normaliser.gen_otsu_masks(scans_imgs) + _normaliser.normalise(scans_imgs, otsu_masks) + + return scans_imgs + + +def pyr_calc_all_features(img, lab, name, labs_of_int, spherify=None): + full_results = pd.Series([]) + lab.CopyInformation(img) + + arr = sitk.GetArrayFromImage(lab) + + if spherify: # can be used as a control - makes label a sphere: + logging.info("Spherifying") + s = ndimage.find_objects(arr)[-1] + if spherify == 0: + sphere_dir = Path(name).parent.parent / "colat_tumours" + os.makedirs(sphere_dir, exist_ok=True) + logging.info("Placing tumour as colateral control") + lab = sitk.Flip(lab, [False, False, True]) + sphere_fname = sphere_dir / os.path.basename(name) + sitk.WriteImage(lab, str(sphere_fname)) + + elif spherify == 1: + sphere_dir = Path(name).parent.parent / "spheres" 
+ os.makedirs(sphere_dir, exist_ok=True) + logging.info("Spherifying in centre of tumour") + midpoint = [np.round(np.mean([s[0].start, s[0].stop])) / 512, + np.round((np.mean([s[1].start, s[1].stop]))) / 512, + np.round((np.mean([s[2].start, s[2].stop]))) / 512] + arr = rg.sphere(512, 10, midpoint, smoothing=True).astype(np.int_) + mask = sitk.GetImageFromArray(arr) + sphere_fname = sphere_dir / os.path.basename(name) + sitk.WriteImage(mask, str(sphere_fname)) + + else: + sphere_dir = Path(name).parent.parent / "lateral_spheres" + os.makedirs(sphere_dir, exist_ok=True) + logging.info("Spherifying as colateral control") + midpoint = [np.round(np.mean([s[0].start, s[0].stop])) / 512, + np.round((np.mean([s[1].start, s[1].stop]))) / 512, + np.round(482 - (np.mean([s[2].start, s[2].stop]))) / 512] + arr = rg.sphere(512, 10, midpoint, smoothing=True).astype(np.int_) + mask = sitk.GetImageFromArray(arr) + sphere_fname = sphere_dir / os.path.basename(name) + sitk.WriteImage(mask, str(sphere_fname)) + + extractor = featureextractor.RadiomicsFeatureExtractor() + extractor.enableAllImageTypes() + extractor.enableAllFeatures() + + results_list =[] + # TODO: reduce dimensionality? + for i, org in enumerate(labs_of_int): + # remove other labels + + arr_spec = np.where(arr == org, 1, 0) + + if np.count_nonzero(arr_spec) < 1000: + print("null label") + continue + + mask = sitk.GetImageFromArray(arr_spec) + + + # make sure its in the same orientation as the image + mask.CopyInformation(lab) + + + result = extractor.execute(img, mask) + + features = pd.DataFrame.from_dict(result, orient='index', + columns=[org]).transpose() + + #features = features. + + features = features.drop(columns=[col for col in features.columns if 'diagnostics' in col]) + #features = features.T.rename(columns={0: org}) + results_list.append(features) + + full_results = pd.concat(results_list, axis=0) + + return full_results + + +def run_radiomics(rad_dir, rigids, labels, name, labs_of_int, + norm_method, norm_label=None, spherify=None, ref_vol_path=None): + """ + + Parameters + ---------- + norm_label : object + """ + logging.info(common.git_log()) + signal.signal(signal.SIGINT, common.service_shutdown) + mem_monitor = MonitorMemory(Path(rad_dir).absolute()) + + features = pyr_calc_all_features(rigids, labels, name, labs_of_int, spherify=spherify) + + feature_dir = rad_dir / "features" + + os.makedirs(feature_dir, exist_ok=True) + + file_name = feature_dir / str(str(os.path.splitext(os.path.basename(name))[0]) + '.csv') + + features.to_csv(file_name) + + mem_monitor.stop() + return True + + +def radiomics_job_runner(target_dir, labs_of_int=None, + norm_method=normalise.IntensityN4Normalise(), + norm_label=None, spherify=None, + ref_vol_path=None, + make_job_file: bool=False, fold: bool=False, scan_dir=None, tumour_dir=None, stage_dir=None): + ''' + Performs the pyradiomic calculations + + + Parameters + ---------- + target_dir + + labs_of_int + + Returns + ------- + + ''' + # fix label input + if labs_of_int != None: + labs_of_int = [float(i) for i in labs_of_int.split(",")] if "," in labs_of_int else [float(labs_of_int)] + else: + labs_of_int = list(range(1, 210)) + # create files if they don't exist + rad_dir = target_dir / 'radiomics_output' + + jobs_file_path = rad_dir / JOBFILE_NAME + lock_file = jobs_file_path.with_suffix('.lock') + lock = SoftFileLock(lock_file) + + if make_job_file: + # extract the registrations if the job file doesn't exist and normalise + if not os.path.exists(str(rad_dir)): + + os.makedirs(rad_dir, 
exist_ok=True) + logging.info("Extracting Rigids") + rigids = extract_registrations(target_dir, outdir_name=scan_dir) + logging.info("Extracting Inverted Labels") + labels = extract_registrations(target_dir, labs_of_int, outdir_name=tumour_dir) + + if norm_label: + logging.info("Extracting Stage labels") + stage_labels = extract_registrations(target_dir, labs_of_int, outdir_name=stage_dir, norm_label=True) + logging.info("Correcting Metadata") + + if scan_dir and norm_label: + #so if the user is providing different directories - it's likely BQ lab and th + for i, lab in enumerate(stage_labels): + lab.CopyInformation(rigids[i]) + labels[i].CopyInformation(rigids[i]) + + + else: + logging.info("Extracting Inverted Stats Masks") + inv_stats_masks = extract_registrations(target_dir, labs_of_int, stats_mask=True) + + else: # good for debugging if normalisation stuffs up + logging.info("loading rigids") + rigids = [common.LoadImage(path).img for path in common.get_file_paths(str(rad_dir / "rigids"))] + labels = [common.LoadImage(path).img for path in common.get_file_paths(str(rad_dir / "inverted_labels"))] + logging.info("loading stats masks") + inv_stats_masks = [common.LoadImage(path).img for path in common.get_file_paths(str(rad_dir / "stats_mask"))] + stage_labels = [common.LoadImage(path).img for path in common.get_file_paths(str(rad_dir / "stage_labels"))] + + if scan_dir and norm_label: + # so if the user is providing different directories - it's likely BQ lab and th + for i, lab in enumerate(stage_labels): + lab.CopyInformation(rigids[i]) + labels[i].CopyInformation(rigids[i]) + + + #logging.info("Denoising") + + #denoise(rigids) + + #logging.info("Writing Denoised Rigids") + #rigid_paths = [common.LoadImage(path).img_path for path in common.get_file_paths(str(rad_dir / "rigids"))] + # sort should be identical: + # rigid_paths.sort(key=lambda x: os.path.basename(x)) + + #for i, vol in enumerate(rigids): + # logging.info("Writing: {}".format(rigid_paths[i])) + # sitk.WriteImage(vol, rigid_paths[i], useCompression=True) + + + + # Normalisation should be here!!!! 
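+    # A minimal sketch of the normaliser options that pyr_normaliser() handles below; these are
+    # the three classes this module already uses from lama.img_processing.normalise, and the
+    # call mirrors how the job runner invokes it:
+    #
+    #     norm_method = normalise.IntensityHistogramMatch()  # match each scan to a reference scan
+    #     norm_method = normalise.IntensityN4Normalise()     # bias-field style correction using Otsu masks
+    #     norm_method = normalise.NonRegMaskNormalise()      # normalise to a reference mean under a mask
+    #     rigids = pyr_normaliser(rad_dir, norm_method, scans_imgs=rigids, ref_vol_path=ref_vol_path)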
+ logging.info("Normalising Intensities") + + def prepare_norm(meth, rigids): + if norm_label: + rigids = pyr_normaliser(rad_dir, norm_method, scans_imgs=rigids, masks=stage_labels, + ref_vol_path=ref_vol_path, + stage_for_ref=True, fold=fold) + else: + if isinstance(meth, normalise.NonRegMaskNormalise): + logging.info("Normalising based on inverted stats masks") + rigids = pyr_normaliser(rad_dir, meth, scans_imgs=rigids, masks=inv_stats_masks) + else: + rigids = pyr_normaliser(rad_dir, meth, scans_imgs=rigids, ref_vol_path=ref_vol_path) + + + if isinstance(norm_method, list): + for meth in norm_method: + prepare_norm(meth, rigids) + else: + prepare_norm(norm_method, rigids) + + + + logging.info("Writing Normalised Rigids") + rigid_paths = [common.LoadImage(path).img_path for path in common.get_file_paths(str(rad_dir / "rigids"))] + # sort should be identical: + #rigid_paths.sort(key=lambda x: os.path.basename(x)) + + for i, vol in enumerate(rigids): + logging.info("Writing: {}".format(rigid_paths[i])) + sitk.WriteImage(vol, rigid_paths[i], useCompression=True) + + logging.info("Creating a job-file for radiomics") + make_rad_jobs_file(jobs_file_path, rigid_paths) + logging.info("Job_file_created") + return True + + #df_jobs = pd.read_csv(jobs_file_path, index_col=0) + + # execute parallelisation: + while True: + try: + with lock.acquire(timeout=60): + + df_jobs = pd.read_csv(jobs_file_path, index_col=0) + # Get an unfinished job + jobs_to_do = df_jobs[df_jobs['status'] == 'to_run'] + if len(jobs_to_do) < 1: + logging.info("No more jobs left on jobs list") + + # error trap for processes that hung + logging.info("checking for hung jobs") + fin_jobs = df_jobs[df_jobs['status'] == 'complete'] + running_jobs = df_jobs[df_jobs['status'] == 'running'] + fin_indx = fin_jobs.index[-1] + fin_t = fin_jobs.at[fin_indx, 'start_time'] + fin_time = datetime.strptime(fin_t, '%Y-%m-%d %H:%M:%S') + run_t = running_jobs['start_time'] + run_times = [datetime.strptime(t, '%Y-%m-%d %H:%M:%S') < fin_time for t in run_t] + hung_jobs = running_jobs[run_times] + + if len(hung_jobs) > 0: + logging.info("Hung jobs found - rerunning") + jobs_to_do = hung_jobs + else: + break + indx = jobs_to_do.index[0] + + img_path = Path(rad_dir / 'rigids') / (jobs_to_do.at[indx, 'job']) + lab_path = Path(rad_dir / 'inverted_labels') / (jobs_to_do.at[indx, 'job']) + + img = common.LoadImage(img_path) + + lab = common.LoadImage(lab_path) + + df_jobs.at[indx, 'status'] = 'running' + df_jobs.at[indx, 'start_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + df_jobs.at[indx, 'host'] = socket.gethostname() + df_jobs.to_csv(jobs_file_path) + + except Timeout: + sys.exit('Timed out' + socket.gethostname()) + + # try: + try: + logging.info(f'trying {img.img_path} and {lab_path}') + run_radiomics(rad_dir, img.img, lab.img, img.img_path, + labs_of_int, norm_method, norm_label=norm_label, spherify=spherify) + + except Exception as e: + if e.__class__.__name__ == 'KeyboardInterrupt': + logging.info('terminating') + sys.exit('Exiting') + + status = 'failed' + print(e) + logging.exception(e) + + + else: + status = 'complete' + + finally: + with lock: + df_jobs = pd.read_csv(jobs_file_path, index_col=0) + df_jobs.at[indx, 'status'] = status + df_jobs.at[indx, 'end_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + df_jobs.to_csv(jobs_file_path) + + logging.info('Exiting job_runner') + return True diff --git a/lama/paths.py b/lama/paths.py index 49409d5a..082f52df 100755 --- a/lama/paths.py +++ b/lama/paths.py @@ -8,7 +8,8 @@ # import 
lama import os import yaml -from lama.elastix import REG_DIR_ORDER_CFG, INVERT_CONFIG +from lama.elastix import REG_DIR_ORDER_CFG, PROPAGATE_CONFIG +from lama.common import cfg_load # TODO: Link up this code with where the folders are cerated during a LAMA run. Then when changes to folder names occur @@ -16,6 +17,7 @@ # they are replfected in this iterator + def specimen_iterator(reg_out_dir: Path) -> Iterator[Tuple[Path, Path]]: """ Given a registration output root folder , iterate over the speciemns of each line in the subfolders @@ -102,12 +104,12 @@ def __init__(self, specimen_root: Path, line='', specimen='', input_dir=None): def setup(self): # TODO: update this. I just moved this out of the constructor as it was failing there - self.reg_order, self.inversion_order = self._get_reg_order(self.specimen_root) + self.reg_order, self.label_propagation_order = self._get_reg_order(self.specimen_root) self.outroot = self.specimen_root / 'output' self.reg_dirs: Path = self.get_multistage_data(self.outroot / 'registrations') self.jacobians_dirs = self.get_multistage_data(self.outroot / 'jacobians') # Possible to have more than one self.deformations_dirs = self.get_multistage_data(self.outroot / 'deformations') - self.inverted_labels_dirs: Path = self.get_multistage_data(self.outroot / 'inverted_labels', self.inversion_order) + self.inverted_labels_dir: Path = self.outroot / 'inverted_labels' self.qc = self.specimen_root / 'output' / 'qc' self.qc_red_cyan_dirs = self.qc / 'red_cyan_overlays' self.qc_inverted_labels = self.qc / 'inverted_label_overlay' @@ -148,10 +150,10 @@ def _get_reg_order(self, spec_root): if line.strip(): reg_order.append(line.strip()) try: - with open((spec_root / 'output' / 'inverted_transforms' / INVERT_CONFIG), 'r') as fh: - c = yaml.load(fh) - for stage in c['inversion_order']: - inv_order.append(stage) + inv_order_cfg = spec_root / 'output' / 'inverted_transforms' / PROPAGATE_CONFIG + c = cfg_load(inv_order_cfg) + for stage in c['label_propagation_order']: + inv_order.append(stage) except FileNotFoundError: inv_order = None return reg_order, inv_order @@ -218,13 +220,14 @@ def __len__(self): return len(self.spec_it) -def get_specimen_dirs(root: Path, depth=4) -> List[LamaSpecimenData]: +def get_specimen_dirs(root: Path, depth=5, getn=None) -> List[LamaSpecimenData]: # Identify all lama directoris by getting the log files # lama_logs = root.rglob('**/LAMA.log') + # getn: get only n specimen dirs (good for speeding up debug) specimen_dirs = [] - for log in [x for x in walk(root, depth) if x.name == 'LAMA.log']: + for i, log in enumerate([x for x in walk(root, depth) if x.name == 'LAMA.log']): root = log.parent # Take a guess at the line, probably the name of the spec dir parent line = root.parent.name @@ -232,6 +235,9 @@ def get_specimen_dirs(root: Path, depth=4) -> List[LamaSpecimenData]: s.setup() specimen_dirs.append(s) + if getn and i >= getn - 1: + break + return specimen_dirs diff --git a/lama/qc/common.py b/lama/qc/common.py index 2c8818c6..5f6dae5c 100644 --- a/lama/qc/common.py +++ b/lama/qc/common.py @@ -18,4 +18,4 @@ def final_red_cyan_iterator(root, orientation='sagittal') -> Tuple[Path, str, st 1: line_id 2: specimen id """ - for line_dir in root.it \ No newline at end of file + # for line_dir in root.it diff --git a/lama/qc/organ_vol_plots.py b/lama/qc/organ_vol_plots.py index 75b0067d..9d104f7e 100644 --- a/lama/qc/organ_vol_plots.py +++ b/lama/qc/organ_vol_plots.py @@ -24,12 +24,15 @@ from lama.common import getfile_startswith_endswith from lama.qc import 
formatting +from lama.paths import specimen_iterator +from tqdm import tqdm -organ_vol = 'organ volume' # Y label -wev = 'whole embryo volume' # x label for scatter plots +ORGAN_VOL_LABEL = 'organ volume' # Y label +WEV_LABEL = 'whole embryo volume' # x label for scatter plots -def pvalue_dist_plots(null: pd.DataFrame, alt: pd.DataFrame, thresholds: pd.DataFrame, outdir: Path): +def pvalue_dist_plots(null: pd.DataFrame, alt: pd.DataFrame, thresholds: pd.DataFrame, outdir: Path, + two_way: bool = False, main_of_two_way: bool = False): """ Generate a series of histograms containing null and alternative distribution overlaid. Create a vertical line where the p-value threshold was set @@ -45,7 +48,9 @@ def pvalue_dist_plots(null: pd.DataFrame, alt: pd.DataFrame, thresholds: pd.Data where to put the plots """ - def hist(values): + def hist(values: pd.Series): + # Drop NA values as they may exist if they have been QC'd out + values.dropna(inplace=True) hist, bins = np.histogram(values, 100) hist = hist / np.sum(hist) width = 1.0 * (bins[1] - bins[0]) @@ -53,48 +58,73 @@ def hist(values): plt.bar(center, hist, align='center', width=width, alpha=0.5) x_label = 'log(p)' - alt = np.log(alt) - null = np.log(null) + + # if two_way: + # alt = alt.applymap(lambda x: np.array([float(i) for i in x])) + # print("alt: ", alt) + + # alt = alt.applymap(lambda x: x)) + + # you need to perform log different depending on the input + alt = alt.applymap(lambda x: np.log(x.astype(float))) if two_way else alt.applymap( + lambda x: float(x)) if main_of_two_way else np.log(alt) + + null = null.applymap(lambda x: np.log(x.astype(float))) if two_way else null.applymap( + lambda x: float(x)) if main_of_two_way else np.log(null) for col in alt: try: - thresh = thresholds.loc[int(col), 'p_thresh'] + thresh = thresholds.loc[col, 'p_thresh'] log_thresh = np.log(thresh) - hist(alt[col]) - hist(null[col]) + if two_way: + + p_number = 1 if np.squeeze(alt[col]).ndim == 1 else 3 + for i in range(p_number): + hist(pd.Series(np.vstack(alt[col].values).transpose()[:, i])) + hist(pd.Series(np.vstack(null[col].values)[:, i])) + plt.axvline(log_thresh.iloc[i], 0, 1, alpha=0.4) + plt.legend(labels=['p threshold = {}'.format(format(thresh.iloc[i], '.3g')), 'alt', 'null']) + # just has the one column + else: + hist(alt[col]) + hist(null[col]) + plt.xlabel(x_label) + plt.legend(labels=['p threshold = {}'.format(format(thresh, '.3g')), 'alt', 'null']) + plt.axvline(log_thresh, 0, 1, alpha=0.4, color='g') + plt.xlabel(x_label) outpath = outdir / f'{col}.png' - plt.axvline(log_thresh, 0, 1, alpha=0.4, color='g') - - plt.legend(labels=['p threshold = {}'.format(format(thresh, '.3g')), 'alt', 'null']) plt.ylabel('Density') plt.title(col) plt.savefig(outpath) plt.close() - except ValueError: + except ValueError as e: + print(e) logging.warn(f'Skipping pvalue dist plot for {col}') continue -def make_plots(mut_lines_dir: Path, - wt_organ_vols: pd.DataFrame, - wt_staging: pd.DataFrame, +def make_plots(organ_vols: pd.DataFrame, label_meta_file: Path, stats_root_dir: Path, - skip_no_analysis= False, + skip_no_analysis=False, organ_subset: List = [], - extra_dir: Path = Path('')): + extra_dir: Path = Path(''), + voxel_size: float = 27.0, + two_way: bool = False): """ Parameters ---------- mut_lines_dir - Lama registration root. eg: mutants/output with each sibdir containing a line - wt_organ_vols - Aggregated organ volumes for each baseline + Lama registration root. eg: mutants/output with each subdir containing a line + organ_vols + All organ volume. 
+ index=spec_id, + cols = label_nums, + staging and line wt_staging Aggregated staging info for each baseline label_meta_file @@ -108,86 +138,79 @@ def make_plots(mut_lines_dir: Path, extra_dir Bit of a bodge, but if doing the non-permutation-based stats, the organ vol csv is in a directory below. Give the name here (currently 'organ_volumes') + voxel_size + For calculating correct organ volumes """ - label_meta = pd.read_csv(label_meta_file, index_col=0) - - wt_staging.rename(columns={'line': 'genotype'}, inplace=True) + if label_meta_file: + label_meta = pd.read_csv(label_meta_file, index_col=0).replace({np.nan: None}) + # Kyle - this should fix the skip_no_analysis problem + skip_no_analysis = True if 'no_analysis' in label_meta else skip_no_analysis - for mut_line_dir in mut_lines_dir.iterdir(): + else: + label_meta = None - if not mut_line_dir.is_dir(): - continue + organ_vols.rename(columns={'staging': WEV_LABEL}, inplace=True) - print(mut_line_dir.name) + # organ vols to to mm3 + um3_conv_factor = voxel_size ** 3 # To convert voxels to um3 + um3_to_mm3_conv_factor = 1e9 - stats_line_dir = stats_root_dir / mut_line_dir.name / extra_dir # extra_dir does nothing if == '' + for col in organ_vols.columns: + if col.isdigit() or col == WEV_LABEL: + organ_vols[col] = (organ_vols[col] * um3_conv_factor) / um3_to_mm3_conv_factor - line = mut_line_dir.name - - #TODO: Get file by startswith line name and endswith extension (Could be date of analysis in middle) - # Rather tan just getting any CSVs in there + if two_way: # there is no lines + lines = ['two_way'] - stats_result_file = getfile_startswith_endswith(stats_line_dir, line, '.csv') + else: + lines = organ_vols['line'].unique() + lines = lines[lines != 'baseline'] - # Get mutant staging and organ volumes - line_vols = [] - line_stage = [] + for mut_line in sorted(lines): - for spec_dir in mut_line_dir.iterdir(): - if str(spec_dir).endswith('_'): - continue + stats_line_dir = stats_root_dir / mut_line / extra_dir # extra_dir does nothing if == '' - staging = pd.read_csv(spec_dir / 'output' / 'staging_info_volume.csv', index_col=0) - organ_vols = pd.read_csv(spec_dir / 'output' / 'organ_volumes.csv', index_col=0) + # TODO: Get file by startswith line name and endswith extension (Could be date of analysis in middle) + # Rather tan just getting any CSVs in there - line_vols.append(organ_vols) - line_stage.append(staging) + stats_result_file = getfile_startswith_endswith(stats_line_dir, mut_line, '.csv') - df_stage_mut = pd.concat(line_stage, axis=0) - df_stage_mut['genotype'] = 'mutant' - df_stage_mut.rename(columns={'value': 'staging'}, inplace=True) # Get rid of this - df_vol_mut = pd.concat(line_vols, axis=0) df_hits = pd.read_csv(stats_result_file, index_col=0) - staging_df = pd.concat([wt_staging, df_stage_mut]) - staging_df.rename(columns={'staging': wev}, inplace=True) - vol_df = pd.concat([wt_organ_vols, df_vol_mut]) - - - - if 'significant_cal_p' in df_hits: # 'permutation stats hits: pd.DataFrame = df_hits[df_hits['significant_cal_p'] == True] - elif 'significant_bh_q_5' in df_hits: + elif 'significant_bh_q_5' in df_hits: # Standard stats hits: pd.DataFrame = df_hits[df_hits['significant_bh_q_5'] == True] + elif two_way: + # TODO: make this better + hits: pd.DataFrame = df_hits[ + (df_hits['significant_cal_p_geno'] == True) | (df_hits['significant_cal_p_treat'] == True) | ( + df_hits['significant_cal_p_inter'] == True)] else: - logging.error("Plots not made: Stats output file must have 'significant_cal_p' or 'significant_bh_q_5' 
column") + logging.error( + "Plots not made: Stats output file must have 'significant_cal_p' or 'significant_bh_q_5' column") - if ('organ_system_name' in label_meta.columns) and ('organ_system_name' not in hits): + if label_meta is not None and 'organ_system_name' in label_meta.columns and 'organ_system_name' not in hits: # Sort by organ system if present in atlas metadata hits = hits.merge(label_meta[['organ_system_name']], how='left', left_index=True, right_index=True) hits.sort_values(by='organ_system_name', inplace=True) if skip_no_analysis: # Skip organ that are flagged with no_analysis in the atlas metadata file + # Kyle - so I don't know why I have this line and it's stupid but it uses the label metadata column instead if 'no_analysis' not in hits: - hits = hits[hits['no_analysis'] != True] + hits = hits[label_meta['no_analysis'] != True] if len(hits) < 1: - logging.info(f'No hits, so Skipping organ vol plots for: {mut_line_dir.name}') + logging.info(f'No hits, so Skipping organ vol plots for: {mut_line}') continue - st = wt_staging['staging'] - normed_wt = wt_organ_vols.div(st, axis=0) - - normed_mut = df_vol_mut.div(df_stage_mut['staging'], axis=0) - numcol = 6 if len(hits) > 5 else len(hits) numrows = math.ceil(len(hits) / numcol) - figsize_y = 5 * numrows - figsize_x = 5 * numcol + figsize_y = 7 * numrows + figsize_x = 7 * numcol fig = Figure(figsize=(figsize_x, figsize_y)) FigureCanvas(fig) @@ -204,75 +227,65 @@ def make_plots(mut_lines_dir: Path, else: labels_to_plot = hits.index + # organ_vols[organ_vol] = (scattter_df[organ_vol] * um3_conv_factor) / um3_to_mm3_conv_factor + # scattter_df[wev] = (scattter_df[wev] * um3_conv_factor) / um3_to_mm3_conv_factor + # for i, (label, row) in enumerate(hits.iterrows()): - for i, label in enumerate(labels_to_plot): - label_name: str = hits.loc[label, 'label_name'] + for i, label in enumerate(tqdm(labels_to_plot)): + if 'label_name' in hits: + label_name: str = hits.loc[label, 'label_name'] + else: + label_name: str = label axes = fig.add_subplot(numrows, numcol, i + 1) axes.tick_params(labelsize=18) axes.set_yticklabels([]) label = str(label) - wt = normed_wt[[label]] - wt['genotype'] = 'baseline' - - mut = normed_mut[[label]] - mut['genotype'] = line - df = pd.concat([wt, mut]) - df.rename(columns={label: organ_vol}, inplace=True) + try: + # Check if we have a label metadata file, whether it has a short_name col, + # and whether the current label as a short_name entry + if label_meta is not None and 'short_name' in label_meta and label_meta.at[int(label), 'short_name']: + label_name = label_meta.at[int(label), 'short_name'] + else: + label_name = str(label_name) + # title is now dependent on if its rad data or not + title = str(label.split("__")[0] + " " + label_name).replace('_', ' ') if label.__contains__("__") else label_name.replace('_', ' ') - min_ = df[organ_vol].min() - (df[organ_vol].min() * 0.1) - max_ = df[organ_vol].max() + (df[organ_vol].max() * 0.1) - - - # sns.boxplot(x="genotype", y="organ volume", data=df, orient='v', - # ax=axes, boxprops=boxprops) - - axes.tick_params(labelsize=18) - - ax = sns.swarmplot(x="genotype", y='organ volume', data=df, orient='v', - ax=axes) - - ax.set_ylim(min_, max_) - - for patch in ax.artists: - r, g, b, a = patch.get_facecolor() - patch.set_facecolor((r, g, b, 0.0)) - - if 'short_name' in label_meta: - label_name = label_meta.at[int(label), 'short_name'] - title = label_name.replace('_', ' ') - title = title.capitalize() - ax.set_ylabel('') - ax.set_xlabel('') - - ax.set_title(title, 
fontsize=20) - - ###Scatter + except Exception: + print('p') + # Scatterplot s_axes = fig_scat.add_subplot(numrows, numcol, i + 1) s_axes.tick_params(labelsize=18) - # Get rid of hard-coding - voxel_size = 27.0 - um3_conv_factor = voxel_size ** 3 # To convert voxels to um3 - um3_to_mm3_conv_factor = 1e9 - - scattter_df = staging_df.join(vol_df[[label]]).rename(columns={label: organ_vol}) - scattter_df[organ_vol] = (scattter_df[organ_vol] * um3_conv_factor) / um3_to_mm3_conv_factor - scattter_df[wev] = (scattter_df[wev] * um3_conv_factor) / um3_to_mm3_conv_factor - - scattter_df['normalised_organ_vol'] = scattter_df[organ_vol] / scattter_df[wev] - - sax = sns.scatterplot(y=organ_vol, x=wev, ax=s_axes, hue='genotype', - data=scattter_df) + if two_way: + scatter_df = organ_vols + scatter_df = scatter_df[[label, WEV_LABEL, 'line']] + + # replace the label if organ data, ignore it if its radiomics data (index contains __) + if scatter_df.columns[0].__contains__("__"): + scatter_df.rename(columns={'line': 'condition'}, inplace=True) + sax = sns.scatterplot(y=label, x=WEV_LABEL, ax=s_axes, hue='condition', + data=scatter_df) + else: + scatter_df.rename(columns={label: label_name, 'line': 'condition'}, inplace=True) + sax = sns.scatterplot(y=label_name, x=WEV_LABEL, ax=s_axes, hue='condition', + data=scatter_df) + + else: + scatter_df = organ_vols.loc[(organ_vols.line == 'baseline') | (organ_vols.line == mut_line)] + scatter_df = scatter_df[[label, WEV_LABEL, 'line']] + scatter_df.rename(columns={label: label_name, 'line': 'genotype'}, inplace=True) + sax = sns.scatterplot(y=label_name, x=WEV_LABEL, ax=s_axes, hue='genotype', + data=scatter_df) sax.set(xlabel='Whole embryo volume (mm^3)') sax.set(ylabel='Organ volume (mm^3)') sax.set_title(title, fontsize=16) - sax.ticklabel_format(style='sci',scilimits=(0, 0)) + sax.ticklabel_format(style='sci', scilimits=(0, 0)) # x 10^7 instead of 1e7 sax.xaxis.major.formatter._useMathText = True @@ -281,28 +294,23 @@ def make_plots(mut_lines_dir: Path, formatting.label_offset(sax) fig.subplots_adjust(top=0.8) # TODO fix this for larger plot - fig.suptitle(line, fontsize=30, y=0.98) + fig.suptitle(mut_line, fontsize=30, y=0.98) # fig.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) if skip_no_analysis: - box_name = f'{line}_boxplots_no_analysis.png' + box_name = f'{mut_line}_boxplots_no_analysis.png' else: - box_name = f'{line}_boxplots.png' + box_name = f'{mut_line}_boxplots.png' # TODO: Fix the boxplot or swarm plot output # fig.savefig(stats_line_dir / box_name) fig_scat.subplots_adjust(top=0.8, wspace=0.35, hspace=0.4) # TODO fix this for larger plot - fig_scat.suptitle(line, fontsize=30, y=0.98) + fig_scat.suptitle(mut_line, fontsize=30, y=0.98) fig_scat.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) 
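+        # Worked example of the unit conversion applied to the volume columns at the top of
+        # make_plots(): with the default voxel_size of 27.0 um, one voxel is 27^3 = 19,683 um^3
+        # and 1 mm^3 = 1e9 um^3, so an organ covering 1,000,000 voxels is plotted as
+        # (1_000_000 * 19_683) / 1e9 ~= 19.7 mm^3 on the scatter axes above.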
if skip_no_analysis: - scatter_name = f'{line}_scatter_plots_no_analysis_normalised.png' + scatter_name = f'{mut_line}_scatter_plots_no_analysis_normalised.png' else: - scatter_name = f'{line}_scatter_plots.png' + scatter_name = f'{mut_line}_scatter_plots.png' fig_scat.savefig(stats_line_dir / scatter_name) - - - - - diff --git a/lama/qc/qc_images.py b/lama/qc/qc_images.py index 668660f2..9c3de099 100644 --- a/lama/qc/qc_images.py +++ b/lama/qc/qc_images.py @@ -9,9 +9,7 @@ import SimpleITK as sitk from logzero import logger as logging import numpy as np -from skimage.exposure import rescale_intensity -from skimage.transform import match_histograms -from skimage import exposure +from skimage.exposure import rescale_intensity, match_histograms from skimage.io import imsave from skimage.measure import regionprops @@ -70,7 +68,7 @@ def make_qc_images(lama_specimen_dir: Path, img = common.LoadImage(img_path).array _make_red_cyan_qc_images(target, img, red_cyan_dir, greyscale_dir, img_path.stem, i, stage) - if paths.inverted_labels_dirs: + if paths.inverted_labels_dir: # First reg img will the rigid-registered image first_reg_dir = paths.reg_dirs[0] @@ -79,13 +77,13 @@ def make_qc_images(lama_specimen_dir: Path, # We have a reverse registration method of label propagation so we overlay the labels that were transformed # using the reverse registrtion transform (the final defoemable stage) as the target will have been the # Rigid input - inverted_label_dir = paths.inverted_labels_dirs[-1] + inverted_label_dir = paths.inverted_labels_dir else: # The labels were propagated using the inverse transfrom method. Therefore we overlay the labels transformed # using the tforms up to the inverted affine stage onto the rigid input. # (could do inverted rigid labels overalid on orginal input, but on rigid allllows us to compare specimens # more easily using this method) - inverted_label_dir = paths.inverted_labels_dirs[-2] # -2 should be the step after rigid + inverted_label_dir = paths.inverted_labels_dir inverted_label_overlays_dir = outdir / 'inverted_label_overlay' inverted_label_overlays_dir.mkdir(exist_ok=True) diff --git a/lama/registration_pipeline/run_lama.py b/lama/registration_pipeline/run_lama.py index e14fe432..9ff42a6c 100755 --- a/lama/registration_pipeline/run_lama.py +++ b/lama/registration_pipeline/run_lama.py @@ -112,7 +112,7 @@ import signal import shutil -from lama.elastix.invert_volumes import InvertLabelMap, InvertMeshes +from lama.elastix.propagate_volumes import PropagateLabelMap, PropagateMeshes from lama.elastix.invert_transforms import batch_invert_transform_parameters from lama.elastix.reverse_registration import reverse_registration from lama.img_processing.organ_vol_calculation import label_sizes @@ -125,7 +125,7 @@ from lama.qc.qc_images import make_qc_images from lama.qc.folding import folding_report from lama.stats.standard_stats.data_loaders import DEFAULT_FWHM, DEFAULT_VOXEL_SIZE -from lama.elastix import INVERT_CONFIG, REG_DIR_ORDER_CFG +from lama.elastix import PROPAGATE_CONFIG, REG_DIR_ORDER_CFG from lama.monitor_memory import MonitorMemory from lama.common import cfg_load from lama.segmentation_plugins import plugin_interface @@ -134,8 +134,6 @@ ELX_PARAM_PREFIX = 'elastix_params_' # Prefix the generated elastix parameter files PAD_INFO_FILE = 'pad_info.yaml' -temp_debug_fix_folding = True - # Set the spacing and origins before registration SPACING = (1.0, 1.0, 1.0) @@ -221,7 +219,7 @@ def run(configfile: Path): else: # invert_transform method is the default 
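+        # The two label-propagation routes handled by this if/else (as described in the
+        # segmentation-plugin and QC docstrings): 'reverse_registration' leaves the propagated
+        # labels aligned to the rigidly registered image, while 'invert_transform' (this branch,
+        # the default) inverts the forward transforms so the labels map back onto the original
+        # input image.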
batch_invert_transform_parameters(config) - logging.info('inverting volumes') + logging.info('propagating volumes') invert_volumes(config) # Now that labels have been inverted, should we delete the transorm files? @@ -248,12 +246,6 @@ def run(configfile: Path): return True -# def get_whole_embryo_mask(config: LamaConfig): -# mask_root = config['out_dir'] / 'inverted_stats_masks' -# if config['label_propagation'] == 'reverse_registration': -# mask ='c' -# elif config['label_propagation'] == 'invert_transform': -# mask = config['inverted_stats_masks'] / 'rigid' def generate_staging_data(config: LamaConfig): """ @@ -280,11 +272,10 @@ def generate_staging_data(config: LamaConfig): return logging.info('Generating whole embryo volume staging data') - # Get the root of the inverted masks for thie current specimen - inv_mask_root = config['inverted_stats_masks'] - inv_mask_stage_dir = inv_mask_root / stage_to_get_volumes.name - staging_metric_maker.whole_volume_staging(inv_mask_stage_dir, config['output_dir']) + propagated_mask_dir = config['inverted_stats_masks'] + + staging_metric_maker.whole_volume_staging(propagated_mask_dir, config['output_dir']) return True @@ -316,25 +307,20 @@ def invert_volumes(config: LamaConfig): """ - invert_config = config['inverted_transforms'] / INVERT_CONFIG + invert_config = config['inverted_transforms'] / PROPAGATE_CONFIG if config['stats_mask']: mask_inversion_dir = config.mkdir('inverted_stats_masks') - InvertLabelMap(invert_config, config['stats_mask'], mask_inversion_dir, threads=config['threads']).run() + PropagateLabelMap(invert_config, config['stats_mask'], mask_inversion_dir, threads=config['threads']).run() if config['label_map']: labels_inverion_dir = config.mkdir('inverted_labels') - InvertLabelMap(invert_config, config['label_map'], labels_inverion_dir, threads=config['threads']).run() + PropagateLabelMap(invert_config, config['label_map'], labels_inverion_dir, threads=config['threads']).run() def generate_organ_volumes(config: LamaConfig): - # Get the final inversion stage - invert_config = config['inverted_transforms'] / INVERT_CONFIG - - first_stage = cfg_load(invert_config)['inversion_order'][-1] - - inverted_label_dir = config['inverted_labels'] / first_stage + inverted_label_dir = config['inverted_labels'] out_path = config['organ_vol_result_csv'] @@ -383,7 +369,7 @@ def run_registration_schedule(config: LamaConfig, first_stage_only=False) -> Pat st = config['stage_targets'] if st: with open(st, 'r') as stfh: - stage_targets = yaml.load(stfh)['targets'] + stage_targets = cfg_load(stfh)['targets'] if len(config['registration_stage_params']) != len(stage_targets): logging.error(f'Len stage targets: {len(stage_targets)}') logging.error(f'Len reg stages: {len(config["registration_stage_params"])}') @@ -470,7 +456,8 @@ def run_registration_schedule(config: LamaConfig, first_stage_only=False) -> Pat registrator.set_target(fixed_vol) if reg_stage['elastix_parameters']['Transform'] == 'BSplineTransform': - logging.info(f'Folding correction for stage {stage_id} set') + if config['fix_folding']: + logging.info(f'Folding correction for stage {stage_id} set') registrator.fix_folding = config['fix_folding'] # Curently only works for TargetBasedRegistration registrator.run() # Do the registrations for a single stage @@ -524,7 +511,9 @@ def create_glcms(config: LamaConfig, final_reg_dir): logging.info("Finished creating GLCMs") -def generate_elx_parameters(config: LamaConfig, do_pairwise: bool = False) -> OrderedDict: +def generate_elx_parameters(config: 
LamaConfig, + do_pairwise: bool = False, + ) -> OrderedDict: """ Generate an ordered dictionary of elastix parameters for each stage. Merge global parameters into each stage. @@ -537,6 +526,7 @@ def generate_elx_parameters(config: LamaConfig, do_pairwise: bool = False) -> Or the main config do_pairwise: Bool if True set elestix parameters to write result image + Returns ------- dict: @@ -548,7 +538,9 @@ def generate_elx_parameters(config: LamaConfig, do_pairwise: bool = False) -> Or stage_id = reg_stage['stage_id'] elxparams_formated = [] - param_vals = [] # The parameters to format + # param_vals = [] # The parameters to format + params = {} + inherit_stage = reg_stage.get('inherit_elx_params') if inherit_stage: # Use another stage's params as a starting point @@ -579,25 +571,28 @@ def generate_elx_parameters(config: LamaConfig, do_pairwise: bool = False) -> Or else: reg_stage['elastix_parameters']['WriteResultImage'] = 'false' global_params.pop('WriteResultImage', None) - param_vals.extend([global_params, reg_stage['elastix_parameters']]) - - for p in param_vals: - for param_name, param_value in p.items(): - if isinstance(param_value, list): # parameter with multiple values for each resolution - # check whether we have didgets or strings, the latter need quoting - if all(common.is_number(v) for v in param_value): - val_list = [str(s).format() for s in param_value] - val_string = ' '.join(val_list) - else: - val_list = [str(s).format() for s in param_value] - val_quoted = ['"{}"'.format(s) for s in val_list] - val_string = ' '.join(val_quoted) - elxparams_formated.append('({0} {1})\n'.format(param_name, val_string)) - else: # Find a nicer way to do this - if common.is_number(param_value): - elxparams_formated.append('({0} {1})\n'.format(param_name, param_value)) - else: - elxparams_formated.append('({0} "{1}")\n'.format(param_name, param_value)) + + # global_params.update(extra_global_params) + params.update(reg_stage['elastix_parameters']) + params.update(global_params) + + # for p in param_vals: + for param_name, param_value in params.items(): + if isinstance(param_value, list): # parameter with multiple values for each resolution + # check whether we have didgets or strings, the latter need quoting + if all(common.is_number(v) for v in param_value): + val_list = [str(s).format() for s in param_value] + val_string = ' '.join(val_list) + else: + val_list = [str(s).format() for s in param_value] + val_quoted = ['"{}"'.format(s) for s in val_list] + val_string = ' '.join(val_quoted) + elxparams_formated.append('({0} {1})\n'.format(param_name, val_string)) + else: # Find a nicer way to do this + if common.is_number(param_value): + elxparams_formated.append('({0} {1})\n'.format(param_name, param_value)) + else: + elxparams_formated.append('({0} "{1}")\n'.format(param_name, param_value)) stage_params[stage_id] = ''.join(elxparams_formated) return stage_params diff --git a/lama/registration_pipeline/validate_config.py b/lama/registration_pipeline/validate_config.py index 0a66bf48..91f5eb5e 100755 --- a/lama/registration_pipeline/validate_config.py +++ b/lama/registration_pipeline/validate_config.py @@ -128,7 +128,7 @@ def __init__(self, config: Union[Path, Dict], cfg_path: Path=None, no_validate=F # The following options are used for saving dsk space 'write_deformation_vectors': (bool, False), 'delete_inverted_transforms': (bool, False), - 'write_raw_jacobians': (bool, False), + 'write_raw_jacobians': (bool, True), 'write_log_jacobians': (bool, True), } @@ -163,6 +163,10 @@ def __init__(self, 
config: Union[Path, Dict], cfg_path: Path=None, no_validate=F self.check_stages() + self.check_propagation_options() + + self.check_problematic_elx_params() + def __getitem__(self, item): return self.options[item] @@ -171,6 +175,15 @@ def __setitem__(self, key, value): # For debugging self.options[key] = value + def check_propagation_options(self): + if self.options['skip_forward_registration'] and self.options['label_propagation'] == 'invert_transform': + raise LamaConfigError("'skip_forward_registration' is only abailble when 'label_propagation " + "= 'reverse_registration'") + # # Temp until a fix is made + # if self.options['label_propagation'] == 'invert_transform' and self.options['fix_folding']: + # raise LamaConfigError('invert_transfrom method of label propagation is not currently workign with the' + # ' fix_folding option. There will be a fix soon') + def resolve_output_paths(self): """ Make absolute paths from the self.output_path_names dict. @@ -352,6 +365,7 @@ def check_stages(self): path = self.options['root_reg_dir'] / stage['stage_id'] self.stage_dirs[stage['stage_id']] = path + # Check that the inherit value makes sense inherit_id = stage.get('inherit_elx_params') if inherit_id: found_id = False @@ -506,6 +520,22 @@ def pairwise_check(self): gep['WriteResultImage'] = 'false' gep['WriteResultImageAfterEachResolution'] = 'false' + def check_problematic_elx_params(self): + + if self.options.get('label_propagation') == 'invert_transform': + + all_stage_params = [x['elastix_parameters'] for x in self.config['registration_stage_params']] + all_stage_params.append(self.config['global_elastix_params']) + + for d in all_stage_params: + + if 'UseRandomSampleRegion' in d and d['UseRandomSampleRegion'] == 'true': + + raise LamaConfigError('UseRandomSampleRegion is not currently compatible with inverting transforms' + '\n"try: label_propagation" == "invert_transform" instead') + + + def convert_image_pyramid(self): """ The elastix image pyramid needs to be specified for each dimension for each resolution. diff --git a/lama/scripts/lama_ark_img_pro.py b/lama/scripts/lama_ark_img_pro.py new file mode 100644 index 00000000..1bf86525 --- /dev/null +++ b/lama/scripts/lama_ark_img_pro.py @@ -0,0 +1,42 @@ +from lama.utilities import cropper, flipper, lama_convert_16_to_8, lama_pad_volumes +import logging +from pathlib import Path + +def run(indir, target_dir): + # firstly crop volumes + indir = Path(indir) + logging.info("Cropping scans") + cropper.main(indir) + + # cropped files are in a new location + cropped_dir = indir / "cropped" + + # convert to 8-bit in place (i.e with clobber) + logging.info("Converting to 8 bit") + lama_convert_16_to_8.convert_16_bit_to_8bit(cropped_dir,'', clobber=True) + + # flip images - flipper using + logging.info("flipping") + flipper.main(cropped_dir) + + # finally pad the volumes with clobber + logging.info("Padding Volumes") + lama_pad_volumes.pad_volumes(cropped_dir, '', clobber=True) + + +def main(): + import argparse + + parser = argparse.ArgumentParser("Arkell Image Processing") + parser.add_argument('-i', '--input_folder', dest='indirs', help='Raw NRRD directory', required=True, + type=str) + parser.add_argument('-t', '--target_folder', dest='target', help='directory of LAMA target (i.e. 
pop avg)', + required=True, + type=str) + + args = parser.parse_args() + run(args.indirs, args.target) + + +if __name__ == '__main__': + main() diff --git a/lama/scripts/lama_get_walkthrough_data.py b/lama/scripts/lama_get_walkthrough_data.py index ae5b58b5..09df89b5 100644 --- a/lama/scripts/lama_get_walkthrough_data.py +++ b/lama/scripts/lama_get_walkthrough_data.py @@ -14,7 +14,6 @@ def main(): # Unzip into cwd unzip_dir = Path().cwd() - download_and_extract_zip(url, unzip_dir) diff --git a/lama/scripts/lama_job_runner.py b/lama/scripts/lama_job_runner.py index ccc23bc1..80601e8c 100755 --- a/lama/scripts/lama_job_runner.py +++ b/lama/scripts/lama_job_runner.py @@ -9,6 +9,7 @@ import sys import os from pathlib import Path +import logzero # Bodge until I get imports working in Docker @@ -80,7 +81,8 @@ def make_jobs_file(jobs_file: Path, root_dir: Path): def lama_job_runner(config_path: Path, root_directory: Path, - make_job_file: bool=False): + make_job_file: bool=False, + log_level=None): """ @@ -104,6 +106,8 @@ def lama_job_runner(config_path: Path, If this script terminates unexpectedly while it has a lock on the file, it will not be released and the file remains. Therefore before running this script, ensure no previous lock file is hanging around. """ + if log_level: + logzero.loglevel(log_level) if not config_path.is_file(): raise FileNotFoundError(f"can't find config file {config_path}") @@ -134,7 +138,7 @@ def lama_job_runner(config_path: Path, return except Timeout: - print(f"Make sure lock file: {lock_file} is not present on running first instance") + logging.error(f"Make sure lock file: {lock_file} is not present on running first instance") sys.exit() config_name = config_path.name @@ -152,7 +156,20 @@ def lama_job_runner(config_path: Path, if len(jobs_to_do) < 1: logging.info("No more jobs left on jobs list") - break + logging.info("checking for hung jobs") + fin_jobs = df_jobs[df_jobs['status'] == 'complete'] + running_jobs = df_jobs[df_jobs['status'] == 'running'] + fin_indx = fin_jobs.index[-1] + fin_t = fin_jobs.at[fin_indx, 'start_time'] + fin_time = datetime.strptime(fin_t, '%Y-%m-%d %H:%M:%S') + run_t = running_jobs['start_time'] + run_times = [datetime.strptime(t, '%Y-%m-%d %H:%M:%S') < fin_time for t in run_t] + hung_jobs = running_jobs[run_times] + if len(hung_jobs) < 1: + logging.info("Hung jobs found - rerunning") + jobs_to_do = hung_jobs + else: + break indx = jobs_to_do.index[0] @@ -199,8 +216,7 @@ def lama_job_runner(config_path: Path, sys.exit('Timed out' + socket.gethostname()) try: - print(f'debug {HN}, {linenum()}') - print(f'trying {vol.name}') + logging.info(f'trying {vol.name}') run_lama.run(dest_config_path) except LamaConfigError as lce: @@ -225,7 +241,7 @@ def lama_job_runner(config_path: Path, df_jobs.at[indx, 'status'] = status df_jobs.at[indx, 'end_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') df_jobs.to_csv(job_file) - print('Exiting job_runner') + logging.info('Exiting job_runner') return True @@ -245,7 +261,7 @@ def main(): try: lama_job_runner(Path(args.config), Path(args.root_dir), args.make_job_file) except pd.errors.EmptyDataError as e: - logging.exception(f'poandas read failure {e}') + logging.exception(f'pandas read failure {e}') if __name__ == '__main__': diff --git a/lama/scripts/lama_machine_learning.py b/lama/scripts/lama_machine_learning.py new file mode 100644 index 00000000..65a3c81f --- /dev/null +++ b/lama/scripts/lama_machine_learning.py @@ -0,0 +1,242 @@ + + +from lama.lama_radiomics import feature_reduction +from lama 
import common + +from filelock import SoftFileLock, Timeout +import socket +from datetime import datetime +import sys +import signal + +from sklearn.ensemble import RandomForestClassifier, StackingClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.metrics import accuracy_score, auc, roc_auc_score, f1_score, matthews_corrcoef, precision_score, recall_score + +from logzero import logger as logging +from pathlib import Path +import pandas as pd +import os + +#from mpi4py import MPI +import sys + +from joblib import Parallel, delayed + +JOBFILE_NAME = 'ml_jobs.csv' + +def establish_model(X, stack: bool=False): + # do feature selection: + X = feat_select(X, 'accuracy') + + # set up training and test_data + x_train, x_test, y_train, y_test = train_test_split(X, X.index, stratify=y, test_size=0.2, random_state=0) + + #train models + + + rf = RandomForestClassifier() + + + #for i, mod in enumerate(estimators): + # mod.fit(x_train, y_train) + if stack: #LAMA radiomics + knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1) + mlp_adam = MLPClassifier(solver='adam') + mlp_lbfgs = MLPClassifier(solver='lbfgs') + estimators = [('knn', knn), + ('mlp_adam', mlp_adam), + ('mlp_lbfgs', mlp_lbfgs), + ('rf', rf)] + + stack_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression, n_jobs=-1) + else: #BQ Radiomics + stack_model = rf + + stack_model.fit(x_train, y_train) + y_train_predict = stack_model.predict(x_train) + y_test_predict = stack_model.predict(x_test) + + + metrics = [accuracy_score, + auc, + roc_auc_score, + f1_score, + matthews_corrcoef, + precision_score, + recall_score] + for i, met in enumerate(metrics): + train_acc = met(y_train, y_train_predict) + test_acc = met(y_test, y_test_predict) + print(met, train_acc, test_acc) + + + return stack_model + + +def make_ml_jobs_file(jobs_file: Path, file_paths: list): + """ + Creates a joblist csv file for use with the radiomics pipeline. 
+ Searches for all images paths and creates a job file + + Parameters + ---------- + jobfile_path: Path to save job file to + root_dir: the root project directory + is_mutants: if True search the folder for individual line sub folders + + """ + # output_dir = root_dir / 'radiomics_output' + # output_dir.mkdir(exist_ok=True) + + jobs_entries = [] + # get each file path + for i, csv_path in enumerate(file_paths): + + rel_path_to_org_input = str(csv_path.relative_to(jobs_file.parent)) + jobs_entries.append([rel_path_to_org_input, 'to_run', '_', '_', '_']) + + jobs_df = pd.DataFrame.from_records(jobs_entries, columns=['job', 'status', 'host', 'start_time', 'end_time']) + + jobs_df.to_csv(jobs_file) + return True + + + +def ml_job_runner(org_dir, n_sample: bool=True): + + '''i + Performs the pyradiomic calculations + + + Parameters + ---------- + target_dir + + + + Returns + ------- + + ''' + + + # get org csv files + + org_dir = Path(org_dir) + names = common.get_file_paths(org_dir, extension_tuple=".csv") + + jobs_file_path = org_dir / JOBFILE_NAME + lock_file = jobs_file_path.with_suffix('.lock') + lock = SoftFileLock(lock_file) + + if not os.path.exists(jobs_file_path): + logging.info("Creating a job-file for ml") + make_ml_jobs_file(jobs_file_path, names) + logging.info("Job_file_created") + + + + # execute parallelisation: + while True: + try: + with lock.acquire(timeout=60): + + df_jobs = pd.read_csv(jobs_file_path, index_col=0) + # Get an unfinished job + jobs_to_do = df_jobs[df_jobs['status'] == 'to_run'] + if len(jobs_to_do) < 1: + logging.info("No more jobs left on jobs list") + + logging.info("checking for hung jobs") + + # get last job and check start-time + fin_jobs = df_jobs[df_jobs['status'] == 'complete'] + running_jobs = df_jobs[df_jobs['status'] == 'running'] + fin_indx = fin_jobs.index[-1] + fin_t = fin_jobs.at[fin_indx, 'start_time'] + fin_time = datetime.strptime(fin_t, '%Y-%m-%d %H:%M:%S') + run_t = running_jobs['start_time'] + run_times = [datetime.strptime(t, '%Y-%m-%d %H:%M:%S') < fin_time for t in run_t] + hung_jobs = running_jobs[run_times] + + + if len(hung_jobs) > 0: + logging.info("Hung jobs found - rerunning") + jobs_to_do = hung_jobs + else: + break + indx = jobs_to_do.index[0] + + org_csv_path = Path(org_dir) / (jobs_to_do.at[indx, 'job']) + + + + + df_jobs.at[indx, 'status'] = 'running' + df_jobs.at[indx, 'start_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + df_jobs.at[indx, 'host'] = socket.gethostname() + + df_jobs.to_csv(jobs_file_path) + except Timeout: + sys.exit('Timed out' + socket.gethostname()) + + # try: + logging.info(f'trying {org_csv_path}') + # get the organ file and number + org_df = pd.read_csv(org_csv_path) + try: + org = org_df['org'][0] + feature_reduction.main(org_df, org, org_dir) + except KeyError: + # BQ data should have no 'org' info + features = org_df + features = features[features.columns.drop(list(features.filter(regex="diagnostics")))] + features.drop(["scanID"], axis=1, inplace=True) + feature_reduction.main(features, org=None, rad_file_path=Path(org_dir.parent / "full_results.csv"), n_sampler=True) + + # perform feature reduction on a single organ + + # except Exception as e: + # if e.__class__.__name__ == 'KeyboardInterrupt': + # logging.info('terminating') + # sys.exit('Exiting') + + # status = 'failed' + # print(e) + # logging.exception(e) + + status = 'complete' + + with lock: + df_jobs = pd.read_csv(jobs_file_path, index_col=0) + df_jobs.at[indx, 'status'] = status + df_jobs.at[indx, 'end_time'] = 
datetime.now().strftime('%Y-%m-%d %H:%M:%S') + df_jobs.to_csv(jobs_file_path) + + logging.info('Exiting job_runner') + return True + + +def main(): + import argparse + parser = argparse.ArgumentParser("Run RF models for prediction") + parser.add_argument('-i', '--input_file', dest='indirs', help='radiomics file', required=True, + type=str) + parser.add_argument('-m', '--make_org_files', dest='make_org_files', + help='Run with this option to split the full into organs', + action='store_true', default=False) + parser.add_argument('-a', '--abnormal_embs', dest='abnormal_embs', help='Run to specify abnormal embryos', + default=False) + args = parser.parse_args() + _dir = Path(args.indirs) + if args.make_org_files: + common.gather_rad_data(_dir) + + else: + ml_job_runner(_dir) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/scripts/lama_permutation_stats.py b/lama/scripts/lama_permutation_stats.py index 80950739..c48a3080 100755 --- a/lama/scripts/lama_permutation_stats.py +++ b/lama/scripts/lama_permutation_stats.py @@ -11,36 +11,102 @@ $ scripts/lama_stats.py -c -w """ from pathlib import Path +import sys -from lama.stats.permutation_stats.run_permutation_stats import run +import yaml + +from lama.stats.permutation_stats import run_permutation_stats + + +allowed_cfg_keys = [ + 'wildtype_dir', + 'mutant_dir', + 'output_dir', + 'treatment_dir', + 'interaction_dir', + 'n_permutations', + 'label_metadata', + 'label_map', + 'norm_to_whole_embryo_vol', + 'qc_file', + 'voxel_size', + 'two_way', + 'rad_dir', + 'spec_fdr' +] def main(): import argparse - import numpy as np parser = argparse.ArgumentParser("Permutation-based stats") - - # Required args - parser.add_argument('-w', '--wt_dir', dest='wt_dir', help='wildtype registration directory', required=True, type=Path) - parser.add_argument('-m', '--mut_dir', dest='mut_dir', help='mutant registration directory', required=True, type=Path) - parser.add_argument('-o', '--out_dir', dest='out_dir', help='permutation results directory', required=True, type=Path) - - # optional args - parser.add_argument('-i', '--label_info', dest='label_info', help='path to label info csv file', required=False, default=None, type=Path) - parser.add_argument('-l', '--label_map', dest='label_map', help='path to label maps image file', required=False, default=None, type=Path) - parser.add_argument('-n', '--num_perm', dest='num_perm', help='number of permutations to do', type=np.int, - required=False, default=1000) - parser.add_argument('-norm', '--normalise', dest='norm', help='normalise organ volume to whole embryo volume', - required=False, default=False, action='store_true') - parser.add_argument('-q', '--qc_file', dest='qc_file', help='QCd organs to exclude', - required=False, default=None, type=Path) + parser.add_argument('-c', '--config', dest='cfg_path', help='wildtype registration directory', required=True, + type=str) args = parser.parse_args() + run(args.cfg_path) + + +def run(cfg_path): + + def p(path): + if path is None: + return + + cfg_dir = Path(cfg_path).parent + + resolved = (cfg_dir / path).resolve() + + if not resolved.exists(): + raise FileNotFoundError(f'Cannot find: {resolved}') + return resolved + + with open(cfg_path, 'r') as fh: + cfg = yaml.load(fh, Loader=yaml.FullLoader) + + for key in cfg.keys(): + if key not in allowed_cfg_keys: + raise ValueError(f'Config key "{key}" is not in the allowed keys: {", ".join(allowed_cfg_keys)} ') + + # required parameters + try: + wt_dir = p(cfg['wildtype_dir']) + mut_dir = 
p(cfg['mutant_dir']) + except KeyError: + raise KeyError("'wildtype_dir', 'mutant_dir' are required parameters") + + out_dir = p(cfg.get('output_dir', Path(cfg_path).parent)) + + # Optional parameters + + n_perm = int(cfg.get('n_permutations', 1000)) + label_meta = p(cfg.get('label_metadata')) + label_map = p(cfg.get('label_map')) + wev_norm = bool(cfg.get('norm_to_whole_embryo_vol', True)) + qc_file = p(cfg.get('qc_file')) + voxel_size = float(cfg.get('voxel_size', 1.0)) + + treat_dir = p(cfg['treatment_dir']) + inter_dir = p(cfg['interaction_dir']) + two_way = bool(cfg.get('two_way', False)) + + spec_fdr = float(cfg.get('spec_fdr', 0.2)) - run(args.wt_dir, args.mut_dir, args.out_dir, args.num_perm, - label_info=args.label_info, label_map_path=args.label_map, normalise_to_whole_embryo=args.norm, - qc_file=args.qc_file) + rad_dir = p(cfg.get('rad_dir')) + run_permutation_stats.run(wt_dir=wt_dir, + mut_dir=mut_dir, + out_dir=out_dir, + num_perms=n_perm, + label_info=label_meta, + specimen_fdr=spec_fdr, + label_map_path=label_map, + normalise_to_whole_embryo=wev_norm, qc_file=qc_file, + voxel_size=voxel_size, + two_way=two_way, + treat_dir=treat_dir, + inter_dir=inter_dir, + rad_dir=rad_dir + ) if __name__ == '__main__': diff --git a/lama/scripts/lama_radiomics_runner.py b/lama/scripts/lama_radiomics_runner.py new file mode 100644 index 00000000..273c1ad7 --- /dev/null +++ b/lama/scripts/lama_radiomics_runner.py @@ -0,0 +1,77 @@ +from lama.lama_radiomics.radiomics import radiomics_job_runner +import pandas as pd +import logging +from pathlib import Path +from lama.common import cfg_load +from lama.img_processing import normalise + + +def main(): + import argparse + + parser = argparse.ArgumentParser("Schedule LAMA jobs") + + parser.add_argument('-c', '--config', dest='config', help='lama.yaml config file', + required=True) + + parser.add_argument('-m', '--make_job_file', dest='make_job_file', + help='Run with this option forst to crate a job file', + action='store_true', default=False) + + args = parser.parse_args() + + try: + # lets just get the config here - it's not that big right now + + c = cfg_load(Path(args.config)) + # c = cfg_load(Path("E:/220607_two_way/radiomics_output/generate_radiomics.toml")) + + target_dir = Path(c.get('target_dir')) + + labs_of_int = c.get('labs_of_int') + + norm_methods = [c.get('norm_methods')] + + print("from c.get", norm_methods) + + norm_label = c.get('norm_label') + + spherify = c.get('spherify') + + fold = c.get('fold') + + stage_dir = c.get('stage_dir') + + scan_dir = c.get('scan_dir') + + tumour_dir = c.get('tumour_dir') + + ref_vol_path = Path(c.get('ref_vol_path')) if c.get('ref_vol_path') is not None else None + + norm_dict = { + "histogram": normalise.IntensityHistogramMatch(), + "N4": normalise.IntensityN4Normalise(), + "subtraction": normalise.NonRegMaskNormalise(), + "none": None + } + try: + norm_meths = [norm_dict[str(x)] for x in norm_methods] + + + + except KeyError as e: + print(e) + + norm_meths = None + logging.info("Starting Radiomics") + + print(norm_meths) + radiomics_job_runner(target_dir, labs_of_int=labs_of_int, norm_method=norm_meths, spherify=spherify, + ref_vol_path=ref_vol_path, norm_label=norm_label, make_job_file=args.make_job_file, + fold=fold, scan_dir=scan_dir, stage_dir=stage_dir, tumour_dir=tumour_dir) + except pd.errors.EmptyDataError as e: + logging.exception(f'pandas read failure {e}') + + +if __name__ == '__main__': + main() diff --git a/lama/scripts/lama_reg.py b/lama/scripts/lama_reg.py index 4d3e3dfd..41c856ce 
100755 --- a/lama/scripts/lama_reg.py +++ b/lama/scripts/lama_reg.py @@ -19,7 +19,7 @@ def main(): parser.add_argument('-c', dest='config', help='Config file (TOML format)', required=True) args = parser.parse_args() - run_lama.run(Path(args.config)) + run_lama.run(Path(args.config).absolute()) if __name__ == "__main__": diff --git a/lama/scripts/lama_stats.py b/lama/scripts/lama_stats.py index 55be35b3..25403920 100755 --- a/lama/scripts/lama_stats.py +++ b/lama/scripts/lama_stats.py @@ -37,17 +37,24 @@ def main(): parser.add_argument('-o', '--output_dir', dest='out_dir', help='Directory to put results from all lines. Will be made if not exists ', required=True) parser.add_argument('-t', '--target_dir', dest='target_dir', help="Directory containing all the ", required=True) parser.add_argument('-l', '--lines', dest='lines_to_process', help="Space-separated line_ids to exclusively process", nargs='*', required=False, default=False) - + parser.add_argument('-e', '--treatment_dir', dest='treatment_dir', help="treatment registration output root directory", required=False, default=False) + parser.add_argument('-n', '--interaction_dir', dest='interaction_dir', help="interaction registration output root directory", required=False, default=False) + args = parser.parse_args() # Just for testing. Work out a way to add specific lines to the analysis # lines_to_run = ['fgf9', 'nras'] + + # removes unused dirs (i.e. interaction and treatment args) + paths= [args.config, args.wt_dir, args.mut_dir, args.out_dir, args.target_dir, args.treatment_dir, args.interaction_dir] + paths = [x for x in paths if x] # In case there are any '~' in the paths - resolved_paths = [Path(x).expanduser() for x in [args.config, args.wt_dir, args.mut_dir, args.out_dir, args.target_dir]] + resolved_paths = [Path(x).expanduser() for x in paths] + run(*resolved_paths, args.lines_to_process) if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/lama/scripts/two_way_plotter.py b/lama/scripts/two_way_plotter.py new file mode 100644 index 00000000..78d0be43 --- /dev/null +++ b/lama/scripts/two_way_plotter.py @@ -0,0 +1,66 @@ +from lama.utilities import combine_spec_csv, extract_label, extract_registrations +from lama import common +import logging +import subprocess as sub +from pathlib import Path +import os +import tempfile +PLOT_SCRIPT = str(common.lama_root_dir / 'stats' / 'rscripts' / 'two_way_plot.R') + + +def main(): + import argparse + + parser = argparse.ArgumentParser("plot way data and get segmentations of interest") + parser.add_argument('-i', dest='root_dir', + help='Folder with Registration results, (i.e. 
wild_type_and_mutant_data)', + required=True) + parser.add_argument('-l', dest='labs', + help='labels of interest (list of numbers - look at label_info for numbers)', + required=True) + + args = parser.parse_args() + root_dir = Path(args.root_dir) + + labs = [float(i) for i in args.labs.split(",")] if "," in args.labs else float(args.labs) + + logging.info("Extracting Volumes and Labels of Interest") + extract_registrations.main(root_dir) + extract_label.main(root_dir, labs) + + + logging.info("Plotting Two-way Standard Stats") + # combine specimen organ volumes / staging volumes + combine_spec_csv.main(root_dir) + + organ_vol_file = tempfile.NamedTemporaryFile().name + staging_file = tempfile.NamedTemporaryFile().name + label_info_file = tempfile.NamedTemporaryFile().name + groups_file = tempfile.NamedTemporaryFile().name + + + + # run plotting script + + print(os.path.exists(PLOT_SCRIPT)) + print(str(root_dir / "full_organs.csv")) + print(str(root_dir / "full_staging.csv")) + print(str(root_dir.parent / 'target' / 'E14_5_atlas_v24_43_label_info.csv')) + cmd = ['Rscript', + PLOT_SCRIPT, + str(root_dir / "full_organs.csv"), + str(root_dir / "full_staging.csv"), + str(root_dir.parent / 'target' / 'E14_5_atlas_v24_43_label_info.csv'), + '40.0' + ] + try: + sub.check_output(cmd) + logging.info('R plotting suceeded') + except sub.CalledProcessError as e: + msg = "R plotting failed {}".format(e) + logging.exception(msg) + raise RuntimeError(msg) + + +if __name__ == '__main__': + main() diff --git a/lama/segmentation_plugins/plugin_interface.py b/lama/segmentation_plugins/plugin_interface.py index f752b4b1..9a411bee 100644 --- a/lama/segmentation_plugins/plugin_interface.py +++ b/lama/segmentation_plugins/plugin_interface.py @@ -2,7 +2,7 @@ from lama.registration_pipeline.validate_config import LamaConfig from lama.common import cfg_load, write_array import importlib.util -from lama.elastix import INVERT_CONFIG +from lama.elastix import PROPAGATE_CONFIG def secondary_segmentation(config: LamaConfig): @@ -26,8 +26,8 @@ def secondary_segmentation(config: LamaConfig): # Find the directories containing the segmentations # Get the final inversion stage - invert_config = config['inverted_transforms'] / INVERT_CONFIG - segmentation_dir = cfg_load(invert_config)['inversion_order'][-1] # rename to segmentation stage + invert_config = config['inverted_transforms'] / PROPAGATE_CONFIG + segmentation_dir = cfg_load(invert_config)['label_propagation_order'][-1] # rename to segmentation stage inverted_label_dir = config['inverted_labels'] / segmentation_dir initial_segmentation_path = next(inverted_label_dir.glob('**/*.nrrd')) diff --git a/lama/segmentation_plugins/secondary_seg_lateral_ventricle.py b/lama/segmentation_plugins/secondary_seg_lateral_ventricle.py index c2578777..07fb64e1 100644 --- a/lama/segmentation_plugins/secondary_seg_lateral_ventricle.py +++ b/lama/segmentation_plugins/secondary_seg_lateral_ventricle.py @@ -1,12 +1,16 @@ """ This is an example plugin for secondary segmetnation. + To implement a similar plugin there must be a run() function that takes as arguments: - 1: Path to the image to segment + 1: Path to the image to segment. + This will be either the rigidly aligned image (reverse_registration label propagation) or + The original input image (invert transforms label propagation) 2: The initial segmetation of the image done by label propagation in LAMA -This function should return a labelmap with one or more modified labels woth the rest set to zero. 
+This function should return a labelmap with one or more modified labels with the rest set to zero. The current module was made as the E15.5 ventricle segmentation can be variable depending on the size of the organ. It works as follows: + + * Make an ROI using labels surrounding the brain ventricle + * Candidate labels are generated using Otsu segmentation + * Holes are filled using morphological closing diff --git a/lama/segmentation_plugins/secondary_seg_lateral_ventricle_e145.py b/lama/segmentation_plugins/secondary_seg_lateral_ventricle_e145.py new file mode 100644 index 00000000..44f57df6 --- /dev/null +++ b/lama/segmentation_plugins/secondary_seg_lateral_ventricle_e145.py @@ -0,0 +1,167 @@ +""" +Secondary segmentation plugin for E14.5 lateral ventricles. + + +The current module was made as the E14.5 ventricle segmentation can be variable depending on the size of the organ / +developmental substage. + +It works as follows: + +* Make an ROI using labels surrounding the brain ventricle +* Candidate labels are generated using Otsu segmentation +* Holes are filled using morphological closing +* The candidate label that overlaps most with the original ventricle segmentation is kept and returned +""" + +from skimage import measure, morphology +import SimpleITK as sitk +import numpy as np +from pathlib import Path +from typing import List +from lama.utilities.atlas_tools import remove_unconected + + +def write(obj, path): + try: + sitk.WriteImage(obj, str(path)) + except NotImplementedError: + sitk.WriteImage(sitk.GetImageFromArray(obj), str(path), True) + + +def overalap(a, a_label, b, b_label): + """ + Size of the overlap between label a_label in array a and label b_label in array b. + + Returns + ------- + Size of overlap + """ + a = np.copy(a) + b = np.copy(b) + a[a != a_label] = 0 + a[a == a_label] = 1 + b[b != b_label] = 0 + b[b == b_label] = 1 + return a[a == b].size + + +def run(image_to_segment: Path, + initial_segmentation: Path, + outpath: Path = None) -> np.ndarray: + """ + A LAMA segmentation plugin module has to implement a run() function.
+ It currently works well for the lateral ventricles in E15.5 micro-CT images, but only for the main label + region in each lateral ventricle + + Parameters + ---------- + image_to_segment + initial_segmentation + + Returns + ------- + + """ + lat_ven_label = 17 + surrounding_labels = [3] # Forebrain and third ventricle + seg = segment(image_to_segment, initial_segmentation, surrounding_labels, lat_ven_label, outpath) + + return seg + + +def segment(image_to_segment_path: Path, + initial_segmentation_path: Path, + surrounding_labels: List[int], + target_label: int, + outpath=None) -> np.ndarray: + """ + Parameters + ---------- + image_to_segment_path: + The image to be segmented + initial_segmentation_path + The initial LAMA segmentation + surrounding_labels + a list of label numbers that surround the target label + target_label + The label we are trying to improve + + Returns + ------- + A new segmentation with the same dimensions as 'initial_segmentation' containing only the newly segmented + 'target_label' + + """ + + image_to_segment = sitk.GetArrayFromImage(sitk.ReadImage(str(image_to_segment_path))) + initial_segmentation = sitk.GetArrayFromImage(sitk.ReadImage(str(initial_segmentation_path))) + + initial_segmentation = remove_unconected(initial_segmentation, 3) + + + # Remove all labels that are not of interest here + initial_segmentation[~np.isin(initial_segmentation, surrounding_labels + [target_label])] = 0 + + # Set all the surrounding labels to 1 + initial_segmentation[np.isin(initial_segmentation, surrounding_labels)] = 1 + + write(initial_segmentation, '/mnt/bit_nfs/neil/impc_e14.5/har/outputs/baseline/baseline_3061/secondary_seg_lv_test/interm.nrrd') + + props = measure.regionprops(initial_segmentation) + + for x in props: + if x.label == 1: # The label of interest is set to 1 + + # Get the ROI defined by the surrounding labels + b = x.bbox + image_to_segment *= -1 # invert image + image_roi = image_to_segment[b[0]:b[3], b[1]: b[4], b[2]: b[5]] + label_roi = initial_segmentation[b[0]:b[3], b[1]: b[4], b[2]: b[5]] + + # Do Otsu thresholding on the ROI + ventr_area_itk = sitk.GetImageFromArray(image_roi) + thresh = sitk.OtsuThreshold(ventr_area_itk) + thresh_arr = sitk.GetArrayFromImage(thresh) + thresh_arr = np.invert(thresh_arr) + + write(thresh_arr, '/mnt/bit_nfs/neil/impc_e14.5/har/outputs/baseline/baseline_3061/secondary_seg_lv_test/thres.nrrd') + + # Get connected component labels from the thresholding + threshold_labels = measure.label(thresh_arr) + new_segmentation = initial_segmentation.copy().astype(np.short) + new_segmentation[:] = 0 + + # Find the threshold label with the largest overlap with the target label + largest_overlap = 0 + largest_overlap_label = 0 + + # Get region props for each thresholded label + thresh_props = [x for x in measure.regionprops(threshold_labels) if x.area > 500] + + # Find which thresholded label overlaps most with the original segmentation + l = [] + for pr in thresh_props: + ol = overalap(label_roi, target_label, threshold_labels, pr.label) + l.append((pr.label, ol)) + if ol > largest_overlap: + largest_overlap = ol + largest_overlap_label = pr.label + + # Wipe all candidate segmentations except largest overlap + write(threshold_labels, '/mnt/bit_nfs/neil/impc_e14.5/har/outputs/baseline/baseline_3061/secondary_seg_lv_test/thersh_labels.nrrd') + threshold_labels[threshold_labels != largest_overlap_label] = 0 + + # fill small holes + threshold_labels = morphology.closing(threshold_labels, morphology.ball(2)) + + # Insert the
threshold label condidates back into the label map + new_segmentation[b[0]:b[3], b[1]: b[4], b[2]: b[5]] = threshold_labels + + # new_segmentation[new_segmentation != largest_overlap_label] = 0 + new_segmentation[new_segmentation == largest_overlap_label] = target_label + + if outpath: + write(new_segmentation.astype(np.uint8), outpath) + + return new_segmentation.astype(np.uint8) diff --git a/lama/snake_contour_tool/fixer_config.toml b/lama/snake_contour_tool/fixer_config.toml new file mode 100644 index 00000000..cfa49d5d --- /dev/null +++ b/lama/snake_contour_tool/fixer_config.toml @@ -0,0 +1,10 @@ +rigid_img_path = 'C:/Users/u5823099/Anaconda3/Lib/site-packages/lama/tests/test_data/reg_fixer_data/rigid/' + +inverted_labels_path = 'C:/Users/u5823099/Anaconda3/Lib/site-packages/lama/tests/test_data/reg_fixer_data/inverted_masks/' + +[active_contour_params] +lab_of_int = [11, 11, 12, 12, 10, 10] +kmean = [8, 6, 8, 6, 8, 6] +num_int = [200, 150, 200, 150, 200, 150] +prop_scaling = [-0.15, 0.15, -0.15, 0.15, -0.15, 0.15] + diff --git a/lama/snake_contour_tool/geodesic_active_segmentation.py b/lama/snake_contour_tool/geodesic_active_segmentation.py new file mode 100644 index 00000000..b7df16a2 --- /dev/null +++ b/lama/snake_contour_tool/geodesic_active_segmentation.py @@ -0,0 +1,69 @@ +#Kyle Drover - Geodesic Active Contour Segmentation based on existing label + +from pathlib import Path +import SimpleITK as sitk +import nrrd +from scipy import ndimage +import numpy as np + +def snake(label, cluster, label_of_interest, N, propScaling): + + # Set the distance + distance = sitk.SignedMaurerDistanceMapImageFilter() + distance.InsideIsPositiveOff() + distance.UseImageSpacingOn() + + # set the seed from the label and make the initial image from the seed + seedImage = sitk.GetImageFromArray(label.astype(np.int16), isVector=False) + + initialImage = sitk.BinaryThreshold(distance.Execute(seedImage), -1000, 10) + initialImage = sitk.Cast(initialImage, sitk.sitkFloat32) * -1 + 0.5 + + # Setting up the feature image + + # determine which cluster has the maximum overlay with the label. 
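# Illustrative sketch (toy arrays, made-up values; the snake() code that follows scores
# clusters slightly differently): one way to pick the k-means cluster that overlaps the
# label of interest most is to count overlapping voxels per cluster id and take the argmax.
import numpy as np

label_arr = np.array([[0, 5, 5], [0, 5, 0]])    # toy label map, label of interest = 5
cluster_arr = np.array([[1, 2, 2], [0, 2, 1]])  # toy k-means cluster ids
scores = [np.count_nonzero((label_arr == 5) & (cluster_arr == k))
          for k in range(cluster_arr.max() + 1)]
best_cluster = int(np.argmax(scores))           # cluster 2 overlaps the label most here
# end of sketch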
+ c_scores = [] + for k in range(np.amax(cluster)): + overlay = np.logical_and(label == label_of_interest, cluster == k) + c_scores.append(np.sum(label[overlay == True])) + + + c_val = np.argmax(c_scores) + + # convert desired cluster into a binary image + cluster[cluster == c_val] = 8 + + + #cluster[cluster != label_of_interest] = 0 + #cluster[cluster == label_of_interest] = 10 + + # turn cluster into sitk image and blur it + featureImage = sitk.GetImageFromArray(cluster.astype(np.int16), isVector=False) + + gradientMagnitude = sitk.GradientMagnitudeRecursiveGaussianImageFilter()# feature image needs blurring for some reason + gradientMagnitude.SetSigma(0.05)# sigma is the blurring factor + + featureImage = sitk.BoundedReciprocal(gradientMagnitude.Execute(featureImage)) + # featureImage = sitk.InvertIntensity(featureImage, 1) + featureImage = sitk.Cast(featureImage, sitk.sitkFloat32) + + test_feat_arr= sitk.GetArrayFromImage(featureImage) + nrrd.write( + 'C:/Users/u5823099/Anaconda3/Lib/site-packages/lama/tests/test_data/reg_fixer_data/inverted_masks/2086980_feat_arr.nrrd', + test_feat_arr) + + # sitk.WriteImage(featureImage, 'C:/Users/u5823099/Anaconda3/Lib/site-packages/lama/tests/test_data/reg_fixer_data/inverted_masks/2086980_feature.nrrd') + # perform the active contour + geodesicActiveContour = sitk.GeodesicActiveContourLevelSetImageFilter() + geodesicActiveContour.SetPropagationScaling(propScaling) + geodesicActiveContour.SetCurvatureScaling(0.15) + geodesicActiveContour.SetAdvectionScaling(0.15) + geodesicActiveContour.SetMaximumRMSError(0.01) + geodesicActiveContour.SetNumberOfIterations(N) + + levelset = geodesicActiveContour.Execute(initialImage, featureImage) + + #convert the output of the active contour (i.e. the levelset) to a binary image + bi_image = sitk.BinaryThreshold(levelset, -1000, 0) + + return bi_image \ No newline at end of file diff --git a/lama/snake_contour_tool/label_fixer.py b/lama/snake_contour_tool/label_fixer.py new file mode 100644 index 00000000..e3690f9d --- /dev/null +++ b/lama/snake_contour_tool/label_fixer.py @@ -0,0 +1,102 @@ +""" +fixes poor labels specified within a config file (e.g. 
fixer_config.toml): + +config file should contain the rigidly alligned scan and the inverted labels in separate directories +specified by rigid_img_path and inverted_labels_path + +parameters for config file (each column is a separate run): +lab_of_int = label to be corrected + +kmean = number of clusters generated to seperate organs of interest + +num_int = number of interations to grow/shrink label to the cluster + +prop_scaling = scaling factor (negative to remove overlabelling, positive for labelling) + +example run: + +python3 label_fixer.py -c fixer_config.toml +""" +from pathlib import Path +import nrrd +import SimpleITK as sitk +import toml + +from lama import common +from lama.snake_contour_tool import geodesic_active_segmentation +import numpy as np +import argparse + + +def get_config(config_file): + """Gets a csv file containing the order of labels to fisx and their numb_int and kmean_vals """ + config = toml.load(config_file) + return config + + +def cluster(rigid_img, i): + """Do the kmeans clustering, i is the only parameter for this kmeans function""" + k = sitk.ScalarImageKmeans(rigid_img, [i] * i) + + out_arr = sitk.GetArrayFromImage(k) + return out_arr + + +def fix_label(rigid_img_path, inverted_labels_path, label_of_interest, kmean, num_iterations, prop_scaling): + """Uses the active contour tool to automatically segment a poorly inverted label to a cluster image""" + + rigid, r_head = nrrd.read(rigid_img_path) + inv_labels, inv_head = nrrd.read(inverted_labels_path) + + # # Get the bounding box of the inverted label - # Kyle - you don't need an ROI to fix the segmentation. + # # just get the label of interest + single_label, _ = nrrd.read(inverted_labels_path) + single_label[single_label != label_of_interest] = 0 + + # Perform the snake contouring to get a corrected label + rigid_img = sitk.GetImageFromArray(rigid) + out_arr = cluster(rigid_img, kmean) + + corrected_label = geodesic_active_segmentation.snake(single_label, out_arr, label_of_interest, num_iterations, + prop_scaling) + + # Replace the old label with the corrected label + corrected_array = sitk.GetArrayFromImage(corrected_label) + + # set over-labelled vals in original label to 0 + overlabelled_vals = np.logical_and(inv_labels == label_of_interest, corrected_array != 1) + inv_labels[overlabelled_vals == True] = 0 + + # set non-labelled values to the label of interest and write the file + inv_labels[corrected_array == 1] = label_of_interest + nrrd.write(str(inverted_labels_path), inv_labels, header=inv_head) + + +def main(): + parser = argparse.ArgumentParser("fix specified labels using the active contour tool") + parser.add_argument('-c', '--config', dest='config_file', help='config file for label_fixer') + + args = parser.parse_args() + + # get configuration by reading the toml file + config = get_config(args.config_file) + + # Set the rigid img and inverted label paths from the config + rigid_img_path = config["rigid_img_path"] + inverted_labels_path = config["inverted_labels_path"] + + # there should be multiple rigid images and labels, therefore get all filepaths + rigid_paths = common.get_file_paths(Path(rigid_img_path)) + label_paths = common.get_file_paths(Path(inverted_labels_path)) + + # for each volume, perform multiple active contours in the order and with the parameters specified within the config + for i in range(len(rigid_paths)): + parameters = config['active_contour_params'] + for (lab_of_int, kmean, num_int, prop_scaling) in zip(parameters['lab_of_int'], parameters['kmean'], + 
parameters['num_int'], + parameters['prop_scaling']): + fix_label(Path(rigid_paths[i]), Path(label_paths[i]), lab_of_int, kmean, num_int, prop_scaling) + + +if __name__ == '__main__': + main() diff --git a/lama/staging/staging_metric_maker.py b/lama/staging/staging_metric_maker.py index 308fca56..13793ffe 100755 --- a/lama/staging/staging_metric_maker.py +++ b/lama/staging/staging_metric_maker.py @@ -69,19 +69,19 @@ def scaling_factor_staging(root_registration_dir: Path, outdir: Path): return scaling_factor -def whole_volume_staging(inverted_mask_dir: Path, outdir: Path): +def whole_volume_staging(propagated_mask_dir: Path, outdir: Path): """ Generate a csv of whole embryo volumes. Parameters ---------- - inverted_mask_dir: masks that have been inverted back to rigid or original inputs + propagated_mask_dir: masks that have been inverted back to rigid or original inputs outdir: where to put the resulting staging csv """ output = {} - for mask_folder in inverted_mask_dir.iterdir(): + for mask_folder in propagated_mask_dir.iterdir(): if not mask_folder.is_dir(): continue diff --git a/lama/stats/cluster_plots.py b/lama/stats/cluster_plots.py index 0a63832e..1dc13610 100755 --- a/lama/stats/cluster_plots.py +++ b/lama/stats/cluster_plots.py @@ -41,15 +41,18 @@ def _plot(data: pd.DataFrame, title: str, outpath): Plt the results of the clustering """ - ax = sns.scatterplot(x='x', y='y', data=data) + ax = sns.scatterplot(x='x', y='y', data=data, hue='Embryo') + plt.title(title) + plt.legend(bbox_to_anchor=(1.01, 1), + borderaxespad=0) i = 1 id_map = [] for spec_id, row in data.iterrows(): - ax.text(row['x'] + 0.08, row['y'], str(i), horizontalalignment='center', size='medium', color='black', weight='semibold') + #ax.text(row['x'] + 0.08, row['y'], row['Org'], horizontalalignment='center', size='small', color='black', weight='semibold') id_map.append([i, spec_id]) i += 1 @@ -62,16 +65,26 @@ def _plot(data: pd.DataFrame, title: str, outpath): -def umap_organs(data: pd.DataFrame, outpath: Path, title=''): +def umap_organs(data: pd.DataFrame, outpath: Path, title='', _metadata: pd.DataFrame=None): + + + embedding = umap.UMAP(metric='correlation').fit_transform(data) - embedding = umap.UMAP(n_neighbors=2, - min_dist=0.2, - metric='correlation').fit_transform(data) + print(embedding) + + #n_neighbors = 2, + #min_dist = 0.2, + #metric = 'correlation' df = pd.DataFrame(embedding[:, 0:2], index=data.index, columns=['x', 'y']) + # merge df and metadata + df = pd.concat([_metadata, df], axis=1) + + print(df) + _plot(df, title, outpath) diff --git a/lama/stats/common.py b/lama/stats/common.py index 9b87f739..028f8858 100644 --- a/lama/stats/common.py +++ b/lama/stats/common.py @@ -9,4 +9,5 @@ def cohens_d(x, y): dof = nx + ny - 2 # For testing - return (mean(x) - mean(y)) / sqrt(((nx - 1) * std(x, ddof=1) ** 2 + (ny - 1) * std(y, ddof=1) ** 2) / dof) \ No newline at end of file + return (mean(x) - mean(y)) / sqrt(((nx - 1) * std(x, ddof=1) ** 2 + (ny - 1) * std(y, ddof=1) ** 2) / dof) + diff --git a/lama/stats/heatmap.py b/lama/stats/heatmap.py index 83dd4495..c22eb5ab 100644 --- a/lama/stats/heatmap.py +++ b/lama/stats/heatmap.py @@ -4,42 +4,48 @@ import matplotlib.pyplot as plt import pandas as pd +from logzero import logger as logging import numpy as np import seaborn as sns from typing import Union import matplotlib +import scipy.spatial as sp, scipy.cluster.hierarchy as hc +from scipy.stats import zscore -def heatmap(data: pd.DataFrame, title, use_sns=False): - # import matplotlib.pylab as pylab - # 
params = {'legend.fontsize': 'x-large', - # 'axes.labelsize': 'x-large', - # 'axes.titlesize': 'x-large', - # 'xtick.labelsize': 'x-large', - # 'ytick.labelsize': 'x-large'} - # pylab.rcParams.update(params) +def heatmap(data: pd.DataFrame, title, use_sns=False, rad_plot: bool = False): + fig, ax = plt.subplots(figsize=[56, 60]) + # use_sns = False - - - fig, ax = plt.subplots(figsize = [14,15]) + font_size = 14 if rad_plot else 22 if use_sns: # sns.palplot(sns.color_palette("coolwarm")) - sns.heatmap(data, cmap=sns.color_palette("coolwarm", 100), ax=ax, cbar_kws={'label': 'mean volume ratio'}, - square=True) + if data.isnull().values.all(): + return + try: + sns.heatmap(data, center=1.00, cmap=sns.color_palette("coolwarm", 100), ax=ax, + cbar_kws={'label': 'mean volume ratio', + 'fraction': 0.05}, square=(not rad_plot)) + except ValueError: + ... - ax.figure.axes[-1].yaxis.label.set_size(22) + ax.figure.axes[-1].yaxis.label.set_size(font_size) cbar = ax.collections[0].colorbar - cbar.ax.tick_params(labelsize=20) + cbar.ax.tick_params(labelsize=22) + # adjust cbar fraction + else: ax.imshow(data) xlabels = data.columns ylabels = [x.replace('_', ' ') for x in data.index] + # We want to show all ticks... - # ax.set_xticks(np.arange(len(xlabels))) - # ax.set_yticks(np.arange(len(ylabels))) + ax.set_xticks(np.arange(len(xlabels))) + + ax.set_yticks(np.arange(len(ylabels))) # ... and label them with the respective list entries - ax.set_xticklabels(xlabels, rotation = 90, ha="center") + ax.set_xticklabels(xlabels, rotation=90, ha="center") ax.set_yticklabels(ylabels) # Rotate the tick labels and set their alignment. @@ -49,9 +55,59 @@ def heatmap(data: pd.DataFrame, title, use_sns=False): # plt.xticks(np.arange(len(gamma_range)) + 0.5, gamma_range, rotation=45, ) # plt.xticks(rotation=90) # ax.tick_params(axis="x", direction="right", pad=-22) - plt.xticks(fontsize=12) - plt.yticks(fontsize=12) + # Note for radiomics data make this stuff smaller + plt.xticks(fontsize=font_size) + plt.yticks(fontsize=font_size) + + return True + + +def clustermap(data: pd.DataFrame, title, use_sns=False, rad_plot: bool = False, z_norm: bool=False): + + font_size = 10 if rad_plot else 22 + + # use_sns = False + if use_sns: + # sns.palplot(sns.color_palette("coolwarm")) + if data.isnull().values.all(): + return + try: + if rad_plot: + cg = sns.clustermap(data, + metric="euclidean", + cmap=sns.diverging_palette(250, 15, l=70, s=400, sep=1, n=512, center="light", + as_cmap=True), + center=1, + cbar_kws={'label': 'mean ratio of radiological measurement'}, square=True, + figsize=[30, len(data)*0.3]) + + ylabels = [x.replace('_', ' ') for x in data.index] + + cg.ax_heatmap.set_yticks(np.arange(len(ylabels))) + cg.ax_heatmap.tick_params(axis='y', labelsize=font_size) + cg.ax_heatmap.set_yticklabels(ylabels, fontsize=font_size, rotation=0) + elif z_norm: + cg = sns.clustermap(data, + z_score=0, + metric="euclidean", + cmap=sns.diverging_palette(250, 15, l=70, s=400, sep=40, n=512, center="light", + as_cmap=True), + cbar_kws={'label': 'mean volume ratio'}, + square=True) + + else: + cg = sns.clustermap(data, + metric="euclidean", + cmap=sns.diverging_palette(250, 15, l=70, s=400, sep=40, n=512, center="light", + as_cmap=True), + cbar_kws={'label': 'mean volume ratio'}, + square=True) + + + except ValueError as e: + print(e) + ... 
- # ax.set_title(title) + return True diff --git a/lama/stats/linear_model.py b/lama/stats/linear_model.py index c02da80b..bbbe752a 100644 --- a/lama/stats/linear_model.py +++ b/lama/stats/linear_model.py @@ -6,7 +6,6 @@ rp2y interface is on the todo list """ - import subprocess as sub import os import struct @@ -15,23 +14,25 @@ from typing import Tuple import shutil +from itertools import chain from logzero import logger as logging import numpy as np import pandas as pd +import statsmodels.formula.api as smf from lama import common +from tqdm import tqdm LM_SCRIPT = str(common.lama_root_dir / 'stats' / 'rscripts' / 'lmFast.R') # If debugging, don't delete the temp files used for communication with R so they can be used for R debugging. -DEBUGGING = False - +DEBUGGING = True -def lm_r(data: np.ndarray, info: pd.DataFrame, plot_dir:Path=None, boxcox:bool=False, use_staging: bool=True) -> Tuple[np.ndarray, np.ndarray]: +def lm_r(data: np.ndarray, info: pd.DataFrame, plot_dir: Path = None, boxcox: bool = False, use_staging: bool = True, + two_way: bool = False) -> Tuple[np.ndarray, np.ndarray]: """ Fit multiple linear models and get the resulting p-values - Parameters ---------- data @@ -48,15 +49,11 @@ def lm_r(data: np.ndarray, info: pd.DataFrame, plot_dir:Path=None, boxcox:bool=F whether to apply boxcox transformation to the dependent variable use_staging if true, uae staging as a fixed effect in the linear model - - Returns + Returns: ------- pvalues for each label or voxel t-statistics for each label or voxel - """ - # if np.any(np.isnan(data)): - # raise ValueError('Data passed to linear_model.py has NAN values') input_binary_file = tempfile.NamedTemporaryFile().name line_level_pval_out_file = tempfile.NamedTemporaryFile().name @@ -64,7 +61,11 @@ def lm_r(data: np.ndarray, info: pd.DataFrame, plot_dir:Path=None, boxcox:bool=F groups_file = tempfile.NamedTemporaryFile().name # create groups file - if use_staging: + if use_staging and two_way: + groups = info[['genotype', 'treatment', 'staging']] + formula = 'genotype,treatment,staging' + + elif use_staging: groups = info[['genotype', 'staging']] formula = 'genotype,staging' @@ -90,6 +91,7 @@ def lm_r(data: np.ndarray, info: pd.DataFrame, plot_dir:Path=None, boxcox:bool=F try: sub.check_output(cmd) + logging.info('R linear model suceeded') except sub.CalledProcessError as e: msg = "R linear model failed: {}".format(e) logging.exception(msg) @@ -99,37 +101,59 @@ def lm_r(data: np.ndarray, info: pd.DataFrame, plot_dir:Path=None, boxcox:bool=F # The start of the binary file will contain values from the line level call # the specimen-level calls are appended onto this and need to be split accordingly. try: - p_all = np.fromfile(line_level_pval_out_file, dtype=np.float64).astype(np.float32) - t_all = np.fromfile(line_level_tstat_out_file, dtype=np.float64).astype(np.float32) + + if two_way: + # if you're doing a two-way, you need to get three arrays (i.e. 
genotype, treatment and interaction) + # converted to np.ndarray with three dimensions + + p_all = np.array( + [np.fromfile((line_level_pval_out_file + '_genotype'), dtype=np.float64).astype(np.float32), + np.fromfile((line_level_pval_out_file + '_treatment'), dtype=np.float64).astype(np.float32), + np.fromfile((line_level_pval_out_file + '_interaction'), dtype=np.float64).astype(np.float32)]) + + t_all = np.array( + [np.fromfile((line_level_tstat_out_file + '_genotype'), dtype=np.float64).astype(np.float32), + np.fromfile((line_level_tstat_out_file + '_treatment'), dtype=np.float64).astype(np.float32), + np.fromfile((line_level_tstat_out_file + '_interaction'), dtype=np.float64).astype(np.float32)]) + else: + p_all = np.fromfile(line_level_pval_out_file, dtype=np.float64).astype(np.float32) + t_all = np.fromfile(line_level_tstat_out_file, dtype=np.float64).astype(np.float32) + except FileNotFoundError as e: print(f'Linear model file from R not found {e}') raise FileNotFoundError('Cannot find LM output'.format(e)) if not DEBUGGING: os.remove(input_binary_file) - os.remove(line_level_pval_out_file) - os.remove(line_level_tstat_out_file) os.remove(groups_file) + + if two_way: + os.remove(line_level_pval_out_file + '_genotype') + os.remove(line_level_pval_out_file + '_treatment') + os.remove(line_level_pval_out_file + '_interaction') + os.remove(line_level_tstat_out_file + '_genotype') + os.remove(line_level_tstat_out_file + '_treatment') + os.remove(line_level_tstat_out_file + '_interaction') + else: + os.remove(line_level_pval_out_file) + os.remove(line_level_tstat_out_file) return p_all, t_all def _numpy_to_dat(mat: np.ndarray, outfile: str): """ Convert a numpy array to a binary file for reading in by R - Parameters ---------- mat the data to be send to r outfile the tem file name to store the binary file - - """ # mat = mat.as_matrix() # create a binary file with open(outfile, 'wb') as binfile: - # and write out two integers with the row and column dimension + # and write out two integers with the row and column dimension header = struct.pack('2I', mat.shape[0], mat.shape[1]) binfile.write(header) @@ -139,18 +163,75 @@ def _numpy_to_dat(mat: np.ndarray, outfile: str): binfile.write(data) -def lm_sm(data: np.ndarray, info: pd.DataFrame, plot_dir:Path=None, boxcox:bool=False, use_staging: bool=True): +def lm_sm(data: np.ndarray, info: pd.DataFrame, plot_dir: Path = None, + boxcox: bool = False, use_staging: bool = True, two_way: bool = False): """ - Parameters ---------- + two_way data info plot_dir boxcox use_staging - + Notes + ----- + If a label column is set to all 0, it means a line has all the mutants qc's and it's not for analysis. 
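# Illustrative sketch with made-up toy data: the two-way formula used here,
# 'y ~ genotype * treatment + staging', gives separate p-values for the genotype main
# effect, the treatment main effect and the genotype:treatment interaction, which the
# two_way branch extracts by dropping 'Intercept' and 'staging'.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
toy = pd.DataFrame({'y': rng.random(12),
                    'staging': rng.random(12),
                    'genotype': ['wt', 'mut'] * 6,
                    'treatment': ['veh'] * 6 + ['drug'] * 6})
fit = smf.ols('y ~ genotype * treatment + staging', data=toy).fit()
effect_pvals = fit.pvalues[~fit.pvalues.index.isin(['Intercept', 'staging'])]
# effect_pvals holds three entries: the genotype term, the treatment term and their interaction
# end of sketch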
Returns ------- - """ + + pvals = [] + tvals = [] + + # We need to add some non-digit before column names or patsy has a fit + d = pd.DataFrame(data, index=info.index, columns=[f'x{x}' for x in range(data.shape[1])]) + df = pd.concat([d, info], axis=1) # Data will be given numberic labels + + for col in range(data.shape[1]): + + if not df[f'x{col}'].any(): + p = np.nan + t = np.nan + else: + # so is meant to improve performance + formula = f'x{col} ~ genotype * treatment + staging' if two_way else f'x{col} ~ genotype + staging' + fit = smf.ols(formula=formula, data=df, missing='drop').fit() + + p = fit.pvalues[~fit.pvalues.index.isin(['Intercept', 'staging'])] if two_way else fit.pvalues[ + 'genotype[T.wt]'] + t = fit.tvalues[~fit.tvalues.index.isin(['Intercept', 'staging'])] if two_way else fit.tvalues[ + 'genotype[T.wt]'] + pvals.append(p) + tvals.append(t) + + # print(dir(fit)) + p_all = np.array(pvals) + + + + # weird output from smf.ols.fit.pvalues - NEVER touch this code it's slow but it works + if p_all.ndim == 1: + # Coerces all the data to be the right shape + # so Neil's atlas has 187 organs but the rad data has > 187 + #TODO: test with non-rad data and see if you can use np.shape(data)[1] + p_all = np.reshape(p_all, (np.shape(data)[1], 1)) + + + if isinstance(p_all[0][0], pd.Series): + # coerces the series in p_all to np.float + p_all = [np.float64(my_p) for my_pvals in p_all for my_p in my_pvals] + #p_all = np.array(p_all, dtype=np.float64).flatten() + + # now reshape properly + if len(np.shape(p_all)) == 1: + # Coerces all the data to be the right shape (i.e 187x1 or 187x3) + if isinstance(p_all[0], np.ndarray): + p_all = [np.array(3 * [np.nan]) if np.isnan(org).any() else org for org in p_all] + p_all = np.vstack(p_all) + else: + p_all = np.reshape(p_all, (np.shape(data)[1], 1)) + + t_all = np.negative(np.array(tvals)) # The tvaue for genotype[T.mut] is what we want + + return p_all, t_all \ No newline at end of file diff --git a/lama/stats/penetrence_expressivity_plots.py b/lama/stats/penetrence_expressivity_plots.py index 2a638fee..8e609301 100644 --- a/lama/stats/penetrence_expressivity_plots.py +++ b/lama/stats/penetrence_expressivity_plots.py @@ -5,68 +5,159 @@ from pathlib import Path from typing import Iterable import pandas as pd -from lama.stats.heatmap import heatmap +from lama.stats.heatmap import heatmap, clustermap import matplotlib.pyplot as plt from logzero import logger as logging +import numpy as np +from tqdm import tqdm - -def heatmaps_for_permutation_stats(root_dir: Path): +def heatmaps_for_permutation_stats(root_dir: Path, two_way: bool = False, label_info_file: Path = None, rad_plot: bool = False): """ This function works on the output of the premutation stats. 
For the non-permutation, may need to make a different function to deal with different directory layout """ + # Yeah there should a be better way of me doing this but there is not + if label_info_file: + label_info = pd.read_csv(label_info_file, index_col=0) + skip_analysis = True if 'no_analysis' in label_info.columns else False + if skip_analysis: + good_labels = label_info[label_info['no_analysis'] != True].label_name + else: + good_labels = None + + if two_way: # read data.csv to get the conditions + data_path = root_dir / "radiomics_data.csv" if rad_plot else root_dir / "input_data.csv" + data = pd.read_csv(data_path, index_col=0) + #if rad_plot: + # data.set_index(['vol'], inplace=True) + group_info = data['line'] + + # TODO: think whether to truly put mut_treat in main comparisons + mut_names = data[group_info == 'mutants'].index + treat_names = data[group_info == 'treatment'].index + inter_names = data[group_info == 'mut_treat'].index for line_dir in root_dir.iterdir(): + # bodge way to fix it but she'll do + if two_way: + line_dir = root_dir / 'two_way' + spec_dir = line_dir / 'specimen_level' + + file_lists = { + 'inter': [], + 'treat': [], + 'geno': [] + } + + for s_dir in spec_dir.iterdir(): + scsv = next(s_dir.iterdir()) + if s_dir.name in inter_names: + file_lists['inter'].append(scsv) + elif s_dir.name in mut_names: + file_lists['geno'].append(scsv) + elif s_dir.name in treat_names: + file_lists['treat'].append(scsv) - spec_dir = line_dir / 'specimen_level' - spec_csvs = [] + else: + spec_dir = line_dir / 'specimen_level' + spec_csvs = [] - for s_dir in spec_dir.iterdir(): - scsv = next(s_dir.iterdir()) - spec_csvs.append(scsv) + for s_dir in spec_dir.iterdir(): + scsv = next(s_dir.iterdir()) + spec_csvs.append(scsv) try: line_hits_csv = next(line_dir.glob(f'{line_dir.name}_organ_volumes*csv')) + except StopIteration: logging.error(f'cannot find stats results file in {str(line_dir)}') return - line_specimen_hit_heatmap(line_hits_csv, spec_csvs, line_dir, line_dir.name) + if two_way: + + line_specimen_hit_heatmap(line_hits_csv, file_lists['geno'], line_dir, "geno", two_way=two_way, + good_labels=good_labels, rad_plot=rad_plot) + line_specimen_hit_heatmap(line_hits_csv, file_lists['treat'], line_dir, "treat", two_way=two_way, + good_labels=good_labels, rad_plot=rad_plot) + line_specimen_hit_heatmap(line_hits_csv, file_lists['inter'], line_dir, "inter", two_way=two_way, + good_labels=good_labels, rad_plot=rad_plot) + # there's only one iteration + break + else: + line_specimen_hit_heatmap(line_hits_csv, spec_csvs, line_dir, line_dir.name, two_way=two_way, + good_labels=good_labels) + + def line_specimen_hit_heatmap(line_hits_csv: Path, specimen_hits: Iterable[Path], outdir: Path, line: str, - sorter_csv=None): - + sorter_csv=None, + two_way: bool = False, good_labels=None, rad_plot: bool = False): dfs = {} # All line and speciemn hit dfs - line_hits = pd.read_csv(line_hits_csv, index_col=0) - dfs[line] = line_hits + #line_hits = pd.read_csv(line_hits_csv, index_col=0) + + #dfs[line] = line_hits for spec_file in specimen_hits: d = pd.read_csv(spec_file, index_col=0) - - # d = d[d['significant_cal_p'] == True] - - # small_id = get_specimen_id(hits_file.name) for now use full name - # dfs[small_id] = d + print("d", d) dfs[spec_file.name] = d # get the superset of all hit labels hit_lables = set() - for k, x in dfs.items(): - hit_lables.update(x[x['significant_cal_p'] == True].label_name) + for k, x in tqdm(dfs.items()): + + # get significance c + col = [_col for _col in x.columns 
if _col.__contains__("significant_cal")] + # interaction has more than one p_val + # [0] converts list to str + col = "significant_cal_p_inter" if len(col) > 1 else col[0] + + if 'label_name' in x: + # filtering for orgs of int + if len(good_labels) > 1: + + good_hits = x[(x[col] == True) & (~x['no_analysis'].fillna(False))] + + good_hits = good_hits[good_hits['label_name'].isin(good_labels)].label_name + + hit_lables.update(good_hits) + + else: + hit_lables.update(x[x[col] == True].label_name) + else: + + hit_lables.update(x[x[col] == True].index.values) + + # For each hit table, keep only those in the hit superset and create heat_df t = [] - for line_or_spec, y in dfs.items(): - y = y[y['label_name'].isin(hit_lables)] - y['label_num']= y.index - y.set_index('label_name', inplace=True, drop=True) + for line_or_spec, y in tqdm(dfs.items()): + # If we have label_name, set as index. Otherwise leave label num as index + if rad_plot: + y = y[y['label_name'].isin(hit_lables)] + # attach the label name to the index column + #y['radiomic_name'] = [str(index.str.split("__")[0] + " " + row['label_name']).replace('_', ' ') for index, row in y.iterrows()] + # thanks ChatGPT + y['radiomic_name'] = y.apply(lambda row: f"{row.name.split('__')[0]} {row['label_name']}".replace('_', ' '), + axis=1) + #y.set_index('label', inplace=True, drop=True) + y.set_index('radiomic_name', inplace=True, drop=True) + + elif 'label_name' in y: + y = y[y['label_name'].isin(hit_lables)] + y.set_index('label_name', inplace=True, drop=True) + print("y", y) + else: + y.index = y.index.astype(str) - y.loc[y.significant_cal_p == False, 'mean_vol_ratio'] = None + y['label_num'] = y.index + y.loc[y[col] == False, 'mean_vol_ratio'] = None if 'mean_vol_ratio' in y: col_for_heatmap = 'mean_vol_ratio' @@ -96,43 +187,75 @@ def line_specimen_hit_heatmap(line_hits_csv: Path, # Try to order lines by litter ids = list(heat_df.columns) line_id = ids.pop(0) - - ids.sort(key=lambda x: x[-2]) + ids.sort(key=lambda x: x[-3]) sorted_ids = [line_id] + ids heat_df = heat_df[sorted_ids] - # Shorten some of the longer organ names - # label_meta = pd.read_csv('/mnt/bit_nfs/neil/Harwell_E14_5_latest/padded_target_ras/E14_5_atlas_v24_40_label_info_nouse.csv', index_col=0) - # label_meta.set_index('label_name', inplace=True, drop=True) - # heat_df = heat_df.merge(label_meta[['short_name']], how='left', left_index=True, right_index=True) + if rad_plot: + heat_df.columns = [col.rsplit('_', 3)[0] for col in heat_df.columns] - def sn_mapper(x): - sn = heat_df.loc[x, 'short_name'] - if not isinstance(sn, float): # float nan - idx = sn - else: - idx = x - return idx.lower() + try: + if two_way: + if not rad_plot: + heat_df.columns = [x.split("org")[0] for x in heat_df.columns] + + if not heatmap(heat_df, title=title, use_sns=True, rad_plot=rad_plot): + logging.info(f'Skipping heatmap for {line} as there are no results') + + plt.tight_layout() + + plt.savefig(outdir / f"{line}_organ_hit_heatmap.png") + plt.close() + + #sns.clustermap needs non-nan values to calculate distances - # heat_df.index = heat_df.index.map(sn_mapper) - # heat_df.drop(columns='short_name', inplace=True) + heat_df.dropna(how='all', inplace=True) + heat_df.fillna(1, inplace=True) + heat_df.clip(upper=2, inplace=True) - heatmap(heat_df, title=title, use_sns=True) + if not clustermap(heat_df, title=title, use_sns=True, rad_plot=rad_plot): + logging.info(f'Skipping heatmap for {line} as there are no results') + + plt.tight_layout() + + plt.savefig(outdir / 
f"{line}_organ_hit_clustermap.png") + plt.close() + + + logging.info("Creating Additional z-normalised plots") + if not clustermap(heat_df, title=title, use_sns=True, rad_plot=rad_plot, z_norm=True): + logging.info(f'Skipping heatmap for {line} as there are no results') + + plt.tight_layout() + + plt.savefig(outdir / f"{line}_organ_hit_clustermap_z_normed.png") + plt.close() + + + else: + if not heatmap(heat_df, title=title, use_sns=True): + logging.info(f'Skipping heatmap for {line} as there are no results') - plt.tight_layout() + #plt.tight_layout() - plt.savefig(outdir / f"{line}_organ_hit_heatmap.png") - plt.close() + plt.savefig(outdir / f"{line}_organ_hit_heatmap.png") + plt.close() + except ValueError as e: + print(e) + logging.warn('No heatmap produced') if __name__ == '__main__': - spec_dir = Path('/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/JAX_E15_5_test_120720/stats/archive/organ_vol_perm_091020/lines/Cox7c/specimen_level') + spec_dir = Path( + '/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/JAX_E15_5_test_120720/stats/archive/organ_vol_perm_091020/lines/Cox7c/specimen_level') spec_csvs = [] for s_dir in spec_dir.iterdir(): scsv = next(s_dir.iterdir()) spec_csvs.append(scsv) - line_specimen_hit_heatmap(Path('/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/JAX_E15_5_test_120720/stats/archive/organ_vol_perm_091020/lines/Cox7c/Cox7c_organ_volumes_2020-10-09.csv'), - spec_csvs, - Path('/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/JAX_E15_5_test_120720/stats/archive/organ_vol_perm_091020/lines/Cox7c'), - 'Cox7c') \ No newline at end of file + line_specimen_hit_heatmap(Path( + '/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/JAX_E15_5_test_120720/stats/archive/organ_vol_perm_091020/lines/Cox7c/Cox7c_organ_volumes_2020-10-09.csv'), + spec_csvs, + Path( + '/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/JAX_E15_5_test_120720/stats/archive/organ_vol_perm_091020/lines/Cox7c'), + 'Cox7c') diff --git a/lama/stats/permutation_stats/bin_and_norm.py b/lama/stats/permutation_stats/bin_and_norm.py new file mode 100644 index 00000000..b4c073e6 --- /dev/null +++ b/lama/stats/permutation_stats/bin_and_norm.py @@ -0,0 +1,75 @@ +from lama.img_processing import normalise +from logzero import logger as logging +from lama import common +import os +import nrrd +from pathlib import Path +from scipy import ndimage +import numpy as np +import SimpleITK as sitk + +import pandas as pd + +from lama.stats.permutation_stats import bin_heatmap +from lama.utilities import prep_for_man_valid as pfmv + + +def main(): + print("something") + wt_dir = Path( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210519_int_anal/wt") + mut_dir = Path( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210519_int_anal/non_wt") + + mask, mask_h = nrrd.read( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210423_g_by_e_stand_out/210415_g_by_e_anal/target/stats_mask.nrrd") + + pop_avg, pop_h = nrrd.read( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210423_g_by_e_stand_out/210415_g_by_e_anal/target/210224_pop_avg_deformable_8.nrrd") + + s = ndimage.find_objects(mask)[0] + + # get the images + wt_imgs, wt_names = pfmv.get_images(wt_dir, s) + + mut_imgs, mut_names = pfmv.get_images(mut_dir, s) + + int_norm = normalise.IntensityMaskNormalise() + + # normalise the images + int_norm.add_reference(wt_imgs) + + int_norm.normalise(mut_imgs) + + int_norm.normalise(wt_imgs) + + wt_arrays = [] + + for img in wt_imgs: + binned = bin_heatmap.make_blocks_vectorized(img, 40) + # Summarise each 
bin by the non-zero mean. i.e. the faces/stickers of the + # Rubik's cube + face_val = [np.mean(cube[cube != 0]) for cube in binned] + + wt_arrays.append(face_val) + + # write to csv + wt_df = pd.DataFrame(wt_arrays, index=wt_names) + wt_df.to_csv("test_wt.csv") + + mut_arrays = [] + + for img in mut_imgs: + binned = bin_heatmap.make_blocks_vectorized(img, 40) + # Summarise each bin by the non-zero mean. i.e. the faces/stickers of the + # Rubik's cube + face_val = [np.mean(cube[cube != 0]) for cube in binned] + + mut_arrays.append(face_val) + + # write to csv + wt_df = pd.DataFrame(mut_arrays, index=mut_names) + wt_df.to_csv("test_mut.csv") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/stats/permutation_stats/bin_heatmap.py b/lama/stats/permutation_stats/bin_heatmap.py new file mode 100644 index 00000000..3214b0ac --- /dev/null +++ b/lama/stats/permutation_stats/bin_heatmap.py @@ -0,0 +1,116 @@ +""" +bin heatmaps (i.e. jacobians and intensities) into in a 3 * 4 * 8 array for permutation testing +""" + +from pathlib import Path +import nrrd +import SimpleITK as sitk +import toml +from scipy import ndimage +from lama import common +import pandas as pd +import numpy as np +import os +import argparse + + +def get_heatmaps(dir, s): + heatmap_list = [] + spec_name_list = [] + map_paths = [spec_path for spec_path in common.get_file_paths(dir) if ('log_jac' in str(spec_path))] + # enumerating for speed only + print(map_paths) + for i, heatmap_path in enumerate(map_paths): + heatmap, map_h = nrrd.read(heatmap_path) + # only get heatmap vals inside of the mask + + spec_name_list.append(os.path.splitext(heatmap_path.name)[0].replace("log_jac_","")) + heatmap = heatmap[s[0].start:s[0].stop, + s[1].start:s[1].stop, + s[2].start:s[2].stop] + heatmap_list.append(heatmap) + return heatmap_list, spec_name_list + + +def make_blocks_vectorized(x, d): + p, m, n = x.shape + + # extend each axis of the array until it is divisible by x + while p % d != 0: + p += 1 + while m % d != 0: + m += 1 + while n % d != 0: + n += 1 + + # reshape the array + ref_shape = np.zeros((p, m, n,)) + + ref_shape[:x.shape[0], :x.shape[1], :x.shape[2]] = x + + x = ref_shape + + return x.reshape(-1, m // d, d, n // d, d).transpose(1, 3, 0, 2, 4).reshape(-1, d, d, d) + +def main(): + wt_dir = Path( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210423_g_by_e_stand_out/210415_g_by_e_anal/g_by_e_data/baseline/output/baseline") + mut_dir = Path( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210423_g_by_e_stand_out/210415_g_by_e_anal/g_by_e_data/mutants/output/mutants") + mask, mask_h = nrrd.read( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210423_g_by_e_stand_out/210415_g_by_e_anal/target/stats_mask.nrrd") + ref, ref_h = nrrd.read( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210423_g_by_e_stand_out/210415_g_by_e_anal/target/210224_pop_avg_deformable_8.nrrd") + + # get bounding box of mask for cropping to mask + # used multiple times + s = ndimage.find_objects(mask)[0] + + # get reference cubes + ref = ref[s[0].start:s[0].stop, + s[1].start:s[1].stop, + s[2].start:s[2].stop] # crop to mask + + # ref_binned = make_blocks_vectorized(ref, 40) + + # write the reference cubes: + # for i, cube in enumerate(ref_binned): + # nrrd.write("ref_cube_" + str(i) + ".nrrd", cube) + + # get heatmaps + wt_htmps, wt_names = get_heatmaps(wt_dir, s) + + # turn heatmap into rubik's cube + #wt_arrays = [] + + #for htmp in wt_htmps: + # binned = 
make_blocks_vectorized(htmp, 40) + # # Summarise each bin by the non-zero mean. i.e. the faces/stickers of the + # # Rubik's cube + # face_val = [np.mean(cube[cube != 0]) for cube in binned] + + # wt_arrays.append(face_val) + + #write to csv + wt_df = pd.DataFrame(wt_htmps, index=wt_names) + wt_df.to_csv("test_wt.csv") + + mut_htmps, mut_names = get_heatmaps(mut_dir, s) + + # turn heatmap into rubik's cube + #mut_arrays = [] + + #for htmp in mut_htmps: + # binned = make_blocks_vectorized(htmp, 40) + # Summarise each bin by the non-zero mean. i.e. the faces/stickers of the + # Rubik's cube + # face_val = [np.mean(cube[cube != 0]) for cube in binned] + + # mut_arrays.append(face_val) + + #write to csv + wt_df = pd.DataFrame(mut_htmps, index=mut_names) + wt_df.to_csv("test_mut.csv") + +if __name__ == '__main__': + main() diff --git a/lama/stats/permutation_stats/distributions.py b/lama/stats/permutation_stats/distributions.py index d0313257..0d9baa18 100644 --- a/lama/stats/permutation_stats/distributions.py +++ b/lama/stats/permutation_stats/distributions.py @@ -31,6 +31,8 @@ from typing import Union, Tuple, List import random from pathlib import Path +import math +from collections import Counter import pandas as pd import numpy as np @@ -39,14 +41,256 @@ from joblib import Parallel, delayed import datetime from logzero import logger +from tqdm import tqdm +import random +import itertools +from functools import reduce + from lama.stats.linear_model import lm_r, lm_sm home = expanduser('~') +def random_combination(iterable, r): + "Random selection from itertools.combinations(iterable, r)" + pool = tuple(iterable) + n = len(pool) + indices = sorted(random.sample(range(n), r)) + return tuple(pool[i] for i in indices) + +def recursive_comb_maker(lst, n, steps, i, recurs_results): + """makes a combination from a list given the n_groups""" + # generate first set of combinations + #combs_gen = list(itertools.combinations(lst, n // (steps + 1))) + # Randomly choose a combination + #comb_result = random.choices(combs_gen, k=1) + comb_result = random_combination(lst, n // (steps + 1)) + #comb_result = random.choices(combs_gen, k=1) + # append result into a list + recurs_results.append(comb_result) + #print(list(lst)) + # break if you've done the right amount of steps or you have a lot of combinations, else call it recurvisely + if i == steps: + return recurs_results + else: + return recursive_comb_maker(list(filter(lambda val: val not in comb_result[0], lst)), n, steps, i + 1, recurs_results) + + +def generate_random_combinations(data: pd.DataFrame, num_perms): + logger.info('generating permutations') + data = data.drop(columns='staging', errors='ignore') + line_specimen_counts = get_line_specimen_counts(data) + + result = {} + # now for each label calcualte number of combinations we need for each + for label in line_specimen_counts: + label_indices_result = [] + counts = line_specimen_counts[label].value_counts() + + number_of_lines = counts[counts.index != 0].sum() # Drop the lines with zero labels (have been qc'd out) + ratios = counts[counts.index != 0] / number_of_lines + num_combs = num_perms * ratios + + # get wt data for label + label_data = data[[label, 'line']] + label_data = label_data[label_data.line == 'baseline'] + label_data = label_data[~label_data[label].isna()] + + # Soret out the numnbers + + # Sort out the numbers + records = [] + + for n, n_combs_to_try in num_combs.items(): + n_combs_to_try = math.ceil(n_combs_to_try) + max_combs = int(comb(len(label_data), n)) + # 
logger.info(f'max_combinations for n={n} and wt_n={len(label_data)} = {max_combs}') + records.append([n, n_combs_to_try, max_combs]) + df = pd.DataFrame.from_records(records, columns=['n', 'num_combs', 'max_combs'], index='n').sort_index( + ascending=True) + + # test whether it's possible to have this number of permutations with data structure + print(f'Max combinations for label {label} is {df.max_combs.sum()}') + if num_perms > df.max_combs.sum(): + raise ValueError(f'Max number of combinations is {df.max_combs.sum()}, you requested {num_perms}') + + # Now spread the overflow from any ns to other groups + + # Kyle - What is this? + while True: + df['overflow'] = df.num_combs - df.max_combs # Get the 'overflow' num permutations over maximumum unique + groups_full = df[df.overflow >= 0].index + + df['overflow'][df['overflow'] < 0] = 0 + extra = df[df.overflow > 0].overflow.sum() + + df.num_combs -= df.overflow + + if extra < 1: # All combimation amounts have been distributed + break + + num_non_full_groups = len(df[df.overflow >= 0]) + + top_up_per_group = math.ceil(extra / num_non_full_groups) + for n, row in df.iterrows(): + if n in groups_full: + continue + # Add the topup amount + row.num_combs += top_up_per_group + + # now generate the indices + indx = label_data.index + for n, row in df.iterrows(): + + combs_gen = itertools.combinations(indx, n) + for i, combresult in enumerate(combs_gen): + if i == row.num_combs: + break + label_indices_result.append(combresult) + + result[label] = label_indices_result + return result + + +def generate_random_two_way_combinations(data: pd.DataFrame, num_perms): + logger.info('generating permutations') + data = data.drop(columns='staging', errors='ignore') + line_specimen_counts = get_line_specimen_counts(data, two_way=True) + + n_groups = get_two_way_n_groups(data) + + result = {} + # now for each label calculate number of combinations we need for each + for label in tqdm(line_specimen_counts, total=line_specimen_counts.shape[0]): + label_indices_result = [] + + counts = line_specimen_counts[label].value_counts() + + number_of_lines = counts[counts.index != 0].sum() # Drop the lines with zero labels (have been qc'd out) + ratios = counts[counts.index != 0] / number_of_lines + num_combs = num_perms * ratios + + # get wt data for label + label_data = data[[label, 'line']] + label_data = label_data[label_data.line == 'baseline'] + label_data = label_data[~label_data[label].isna()] + + # Soret out the numnbers + + # Sort out the numbers + records = [] + + for n, n_combs_to_try in num_combs.items(): + n_combs_to_try = math.ceil(n_combs_to_try) + max_combs = two_way_max_combinations(n, n_groups) + # logger.info(f'max_combinations for n={n} and wt_n={len(label_data)} = {max_combs}') + records.append([n, n_combs_to_try, max_combs]) + df = pd.DataFrame.from_records(records, columns=['n', 'num_combs', 'max_combs'], index='n').sort_index( + ascending=True) + + # test whether it's possible to have this number of permutations with data structure + # so for the two-way this wont change + #print(f'Max combinations for label {label} is {max_combs}') + if float(num_perms) > float(df.max_combs.sum()): + raise ValueError(f'Max number of combinations is {max_combs}, you requested {num_perms}') + + + indx = label_data.index + + label_indices_result = [] + + # generate combinations for no. 
of desired perms + for perm in range(num_perms): + full_combination = recursive_comb_maker(indx, len(indx), n_groups - 1, i=1, recurs_results=[]) + # print(full_combination) + + label_indices_result.append(full_combination) + # reset result list + recurs_results = [] + + result[label] = label_indices_result + return result + + +def max_combinations(num_wts: int, line_specimen_counts: dict) -> int: + """ + + num_wts + Total number of wild types + lines_n + {line_n: number of lines} + Returns + ------- + Maximum number or permutations + """ + # calcualte the maximum number of permutations allowed given the WT n and the mutant n line structure. + results = {} + + counts = line_specimen_counts.iloc[:, 0].value_counts() + for n, num_lines in counts.items(): + # Total number of combinations given WT n and this mut n + total_combs_for_n = int(comb(num_wts, n)) + # Now weight based on how many lines have this n + total_combs_for_n /= num_lines + results[n] = total_combs_for_n + + return int(min(results.values())) + + +def two_way_max_combinations(num_wts: int, n_groups: int) -> int: + """So the number of max combinations two the two-way stuff is completely different + basically, its PI (product notation) i = 0 -> 3 [n_wts-(n_wts/4)* i C n_wts/4] + once again - I don't think I care about lines right now""" + # I'll leave the line architecture alive for now + + # Total number of combinations given WT n and this mut n + # I dont need n - you can get the total per group from the wts + n_per_group = num_wts // n_groups + comb_per_group = [(comb(num_wts - (n_per_group * i), n_per_group)) for i in range(n_groups)] + total_combs_for_n = reduce(lambda x, y: x * y, comb_per_group) + # Now weight based on how many lines have this n + return int(total_combs_for_n) + + +def get_two_way_n_groups(input_data: pd.DataFrame) -> int: + """just get the number of different groups for two_way""" + return len(input_data.groupby('line').count()) + + +def get_line_specimen_counts(input_data: pd.DataFrame, two_way: bool = False) -> pd.DataFrame: + """ + For each mutant line get the number of specimens per label. Does not inlude specimen labels that are NAN thereby + accounting for QC'd out labels + + Parameters + ---------- + input_data + index: specimen_id + cols: label numbers (sppended with x for use with statsmodels) eg. x1, x2 + line id e.g baseline + + Returns + ------- + index: line_id + cols: label numbers (sppended with x for use with statsmodels) eg. x1, x2 + + + """ + if 'line' in input_data: + col = 'line' + elif 'genotype' in input_data: + col = 'genotype' + if two_way: + # So for the two-way I need the wild-type counts for the null distribution + line_specimen_counts = input_data[input_data[col] == 'baseline'].groupby(col).count() + else: + line_specimen_counts = input_data[input_data[col] != 'baseline'].groupby(col).count() + return line_specimen_counts + + def null(input_data: pd.DataFrame, - num_perm: int,) -> Tuple[pd.DataFrame, pd.DataFrame, List]: + num_perm: int, two_way: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame, List]: """ Generate null distributions for line and specimen-level data @@ -62,6 +306,9 @@ def null(input_data: pd.DataFrame, num_perm number of permutations + two_way + makes it a two-way null + Returns ------- line-level null distribution @@ -79,6 +326,7 @@ def null(input_data: pd.DataFrame, label_names = input_data.drop(['staging', 'line'], axis='columns').columns + # Store p-value and t-value results. 
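# Illustrative sketch with toy data and made-up column names: each iteration of the
# specimen-level null relabels one baseline as a synthetic mutant and refits the model,
# collecting the p-value for the genotype term, roughly like this:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

toy = pd.DataFrame({'x1': np.random.rand(8),        # one organ/label column
                    'staging': np.random.rand(8),
                    'genotype': ['wt'] * 8})
null_pvals = []
for idx in toy.index:
    toy['genotype'] = 'wt'                           # reset all specimens to wild type
    toy.loc[idx, 'genotype'] = 'synth_hom'           # relabel the i-th baseline
    fit = smf.ols('x1 ~ genotype + staging', data=toy).fit()
    null_pvals.append(fit.pvalues['genotype[T.wt]']) # p-value for the genotype term
# end of sketch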
One tuple (len==num labels) per iteration spec_p = [] @@ -86,8 +334,10 @@ def null(input_data: pd.DataFrame, baselines = input_data[input_data['line'] == 'baseline'] # Get the line specimen n numbers. Keep the first column - line_specimen_counts = input_data[input_data['line'] != 'baseline'].groupby('line').count() - line_specimen_counts = list(line_specimen_counts.iloc[:, 0]) + # line_specimen_counts = get_line_specimen_counts(input_data) + # Pregenerate all the combinations + wt_indx_combinations = generate_random_two_way_combinations(input_data, num_perm) if two_way \ + else generate_random_combinations(input_data, num_perm) # Split data into a numpy array of raw data and dataframe for staging and genotype fpr the LM code data = baselines.drop(columns=['staging', 'line']).values @@ -95,41 +345,100 @@ def null(input_data: pd.DataFrame, # Get the specimen-level null distribution. i.e. the distributuion of p-values obtained from relabelling each # baseline once. Loop over each specimen and set to 'synth_hom' - for index, _ in info.iterrows(): - info.loc[:, 'genotype'] = 'wt' # Set all genotypes to WT - info.loc[[index], 'genotype'] = 'synth_hom' # Set the ith baseline to synth hom - row = data[info.index.get_loc(index), :] - - # Get columns (labels) where the mutant specimen (as it's line level) - # has a null value (i.e. This specimen is QC-flagged at these labels) - # Set all values to zero - labels_to_skip = np.isnan(row) - if any(labels_to_skip): - # Set the whole label column to zero. R:lm() will return NaN for this column - d = np.copy(data) - d[:, labels_to_skip] = 0.0 - else: - d = data - - # Get a p-value for each organ - p, t = lm_r(d, info) # TODO: move this to statsmodels - - # Check that there are equal amounts of p-value than there are data points - if len(p) != data.shape[1]: - raise ValueError(f'The length of p-values results: {data.shape[1]} does not match the length of the input data: {len(p)}') - - spec_p.append(p) + if two_way: # two-way flag + # so for two_way we have three spec tests - geno, treat and int + geno_info = info.copy() + treat_info = info.copy() + inter_info = info.copy() + for index, meta in tqdm(info.iterrows(), total=info.shape[0]): + + geno_info.loc[:, 'genotype'] = 'wt' # Set all genotypes to WT + geno_info.loc[:, 'treatment'] = 'veh' # Set all treatments to vehicle + + treat_info.loc[:, 'genotype'] = 'wt' # Set all genotypes to WT + treat_info.loc[:, 'treatment'] = 'veh' # Set all treatments to vehicle + + inter_info.loc[:, 'genotype'] = 'wt' # Set all genotypes to WT + inter_info.loc[:, 'treatment'] = 'veh' # Set all treatments to vehicle + + + # now relabel based on the combinations + + #for geno effect label index as just synth mut: + geno_info.loc[[index], 'genotype'] = 'synth_mut' + + #similarly for treatment: + treat_info.loc[[index], 'treatment'] = 'synth_treat' + + # So the first logical thing is synth index as the interaction + inter_info.loc[[index], 'genotype'] = 'synth_mut' + inter_info.loc[[index], 'treatment'] = 'synth_treat' + # then randomly assign 1/3 as mut and 1/3 as treat + # generate list but fuck the index off: + all_rows = info.drop(index).index.values + + #should be a list of two + combs = recursive_comb_maker(all_rows,n=len(all_rows),steps=2,i=1,recurs_results=[]) + + #assign synth int mutants + inter_info.loc[combs[0][0], 'genotype'] = 'synth_mut' + + #assign synth int treatments + inter_info.loc[combs[1][0], 'treatment'] = 'synth_treat' + + for _info in [geno_info,treat_info,inter_info]: + row = 
data[_info.index.get_loc(index), :] + labels_to_skip = np.isnan(row) + if any(labels_to_skip): + # Set the whole label column to zero. R:lm() will return NaN for this column + d = np.copy(data) + d[:, labels_to_skip] = 0.0 + else: + d = data + p,t = lm_sm(d, _info, two_way=True) # TODO: move this to statsmodels + if len(p) != data.shape[1]: + raise ValueError( + f'The length of p-values results: {data.shape[1]} does not match the length of the input data: {len(p)}') + + spec_p.append(p) + + else: + for index, _ in info.iterrows(): + info.loc[:, 'genotype'] = 'wt' # Set all genotypes to WT + info.loc[[index], 'genotype'] = 'synth_hom' # Set the ith baseline to synth hom + row = data[info.index.get_loc(index), :] + + # Get columns (labels) where the mutant specimen (as it's line level) + # has a null value (i.e. This specimen is QC-flagged at these labels) + # Set all values to zero + labels_to_skip = np.isnan(row) + if any(labels_to_skip): + # Set the whole label column to zero. R:lm() will return NaN for this column + d = np.copy(data) + d[:, labels_to_skip] = 0.0 + else: + d = data + + # Get a p-value for each organ + p, t = lm_sm(d, info) # TODO: move this to statsmodels + + # Check that there are equal amounts of p-values than there are data points + if len(p) != data.shape[1]: + raise ValueError( + f'The length of p-values results: {data.shape[1]} does not match the length of the input data: {len(p)}') + + spec_p.append(p) spec_df = pd.DataFrame.from_records(spec_p, columns=label_names) - line_df = null_line(line_specimen_counts, baselines, num_perm) - + line_df = null_line(wt_indx_combinations, baselines, num_perm, two_way=two_way) + #print(line_df['x3']) return strip_x([line_df, spec_df]) -def null_line(line_specimen_counts: List, +def null_line(wt_indx_combinations: dict, data: pd.DataFrame, - num_perms=1000) -> pd.DataFrame: + num_perms=1000, two_way:bool=False) -> pd.DataFrame: """ Generate pvalue null distributions for all labels in 'data' NaN values are excluded potentailly resultnig in different sets of specimens for each label. This makes it tricky to @@ -149,19 +458,38 @@ def null_line(line_specimen_counts: List, Returns ------- DataFrame of null distributions. 
Each label in a column + + Notes + ----- + If QC has been applied to the data, we may have some NANs """ def prepare(label): - return data[[label, 'staging', 'genotype']] + if two_way: + return data[[label, 'staging', 'genotype','treatment']] + else: + return data[[label, 'staging', 'genotype']] - data = data.rename(columns={'line': 'genotype'}) + if two_way: + #make genotype and treatment cols + data['genotype'] = 'wt' + data['treatment'] = 'veh' + data = data.drop(['line'], axis='columns') + else: + data = data.rename(columns={'line': 'genotype'}) starttime = datetime.datetime.now() - cols = list(data.drop(['staging', 'genotype'], axis='columns').columns) + cols = list(data.drop(['staging', 'genotype', 'treatment'], axis='columns').columns) if two_way else \ + list(data.drop(['staging', 'genotype'], axis='columns').columns) - pdists = Parallel(n_jobs=-1)( - delayed(_null_line_thread)(prepare(i), num_perms, line_specimen_counts) for i in cols) + # Run each label on a thread + if two_way: + pdists = Parallel(n_jobs=-1)(delayed(_two_way_null_line_thread) + (prepare(i), num_perms, wt_indx_combinations, i) for i in tqdm(cols)) + else: + pdists = Parallel(n_jobs=-1)(delayed(_null_line_thread) + (prepare(i), num_perms, wt_indx_combinations, i) for i in tqdm(cols)) line_pdsist_df = pd.DataFrame(pdists).T line_pdsist_df.columns = cols @@ -172,7 +500,7 @@ def prepare(label): return line_pdsist_df -def _null_line_thread(*args) ->List[float]: +def _null_line_thread(*args) -> List[float]: """ Create a null distribution for a single label. This can put put onto a thread or process @@ -180,40 +508,92 @@ def _null_line_thread(*args) ->List[float]: ------- pvalue distribution """ - data, num_perms, line_spec_counts = args + data, num_perms, wt_indx_combinations, label = args - # remove all the NaN rows, which are QC-flagged labels - data_nonan = data[~data[data.columns[0]].isna()] label = data.columns[0] - data_nonan = data_nonan.astype({label: np.float, - 'staging': np.float}) + data = data.astype({label: np.float, + 'staging': np.float}) synthetics_sets_done = [] line_p = [] perms_done = 0 - while perms_done < num_perms: - for n in line_spec_counts: # mutant lines - - if perms_done == num_perms: - break - if not _label_synthetic_mutants(data_nonan, n, synthetics_sets_done): - continue + # Get combinations of WT indices for current label + indxs = wt_indx_combinations[label] + for comb in indxs: + data.loc[:, 'genotype'] = 'wt' + data.loc[data.index.isin(comb), 'genotype'] = 'synth_hom' + # _label_synthetic_mutants(data, n, synthetics_sets_done) - perms_done += 1 + perms_done += 1 - model = smf.ols(formula=f'{label} ~ C(genotype) + staging', data=data_nonan) - fit = model.fit() - p = fit.pvalues['C(genotype)[T.wt]'] + model = smf.ols(formula=f'{label} ~ C(genotype) + staging', data=data, missing='drop') + fit = model.fit() + p = fit.pvalues['C(genotype)[T.wt]'] - line_p.append(p) - print(f'Done {label}') + line_p.append(p) return line_p +def _two_way_null_line_thread(*args) -> List[float]: + """ + same as _null_line_thread but for two way + TODO: merge two_way and null_threads + Parameters + ---------- + args + + Returns + ------- + + """ + data, num_perms, wt_indx_combinations, label = args + # print('Generating null for', label) + + label = data.columns[0] + + data = data.astype({label: np.float, + 'staging': np.float}) + + synthetics_sets_done = [] + + line_p = [] + + perms_done = 0 + + # Get combinations of WT indices for current label + indxs = wt_indx_combinations[label] + + formula = f'{label} ~ 
genotype * treatment + staging' + for comb in indxs: + # set up genotype and treatment + data.loc[:, 'genotype'] = 'wt' + data.loc[:,'treatment'] = 'veh' + + # mains + data.loc[data.index.isin(comb[0]), 'genotype'] = 'synth_mut' + data.loc[data.index.isin(comb[1]), 'treatment'] = 'synth_treat' + + # interactions + data.loc[data.index.isin(comb[2]), 'genotype'] = 'synth_mut' + data.loc[data.index.isin(comb[2]), 'treatment'] = 'synth_treat' + + # _label_synthetic_mutants(data, n, synthetics_sets_done) + + perms_done += 1 + + fit = smf.ols(formula=formula, data=data, missing='drop').fit() + # get all pvals except intercept and staging + + # fit.pvalues is a series - theefore you have to use .index + p = fit.pvalues[~fit.pvalues.index.isin(['Intercept','staging'])] + #pvalues go in the order of genotype, treatment, interaction. + line_p.append(p.values) + return line_p + def _label_synthetic_mutants(info: pd.DataFrame, n: int, sets_done: List) -> bool: """ Given a dataframe of wild type data, relabel n baselines as synthetic mutant in place. @@ -222,7 +602,10 @@ def _label_synthetic_mutants(info: pd.DataFrame, n: int, sets_done: List) -> boo Parameters ---------- info - dataframe with 'genotype' column + columns + label_num with 'x' prefix e.g. 'x1' + staging + genotype n how many specimens to relabel sets_done @@ -237,22 +620,22 @@ def _label_synthetic_mutants(info: pd.DataFrame, n: int, sets_done: List) -> boo # Set all to wt genotype info.loc[:, 'genotype'] = 'wt' - # label n number of baselines as mutants - - max_comb = int(comb(len(info), n)) - - for i in range(max_comb): - synthetics_mut_indices = random.sample(range(0, len(info)), n) - i += 1 - if not set(synthetics_mut_indices) in sets_done: - break - - if i > max_comb - 1: - msg = f"""Cannot find unique combinations of wild type baselines to relabel as synthetic mutants - With a baseline n of {len(info)}\n. Choosing {n} synthetics. - Try increasing the number of baselines or reducing the number of permutations""" - - return False + # label n number of baselines as mutants from the maximum number of combination + + # max_comb = int(comb(len(info), n)) + # + # for i in range(max_comb): + # synthetics_mut_indices = random.sample(range(0, len(info)), n) + # i += 1 + # if not set(synthetics_mut_indices) in sets_done: + # break + # + # if i > max_comb - 1: + # msg = f"""Cannot find unique combinations of wild type baselines to relabel as synthetic mutants + # With a baseline n of {len(info)}\n. Choosing {n} synthetics. 
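For orientation, the per-label fit inside these null-distribution threads boils down to the following statsmodels call (a minimal sketch; 'x3' is a made-up organ-volume column name):

import pandas as pd
import statsmodels.formula.api as smf

def label_pvalues(df: pd.DataFrame, label: str = 'x3', two_way: bool = False):
    # One fit per organ label; staging is always kept as a covariate.
    formula = (f'{label} ~ genotype * treatment + staging' if two_way
               else f'{label} ~ C(genotype) + staging')
    fit = smf.ols(formula, data=df, missing='drop').fit()
    if two_way:
        # Remaining terms come out in the order genotype, treatment, interaction
        return fit.pvalues[~fit.pvalues.index.isin(['Intercept', 'staging'])].values
    return fit.pvalues['C(genotype)[T.wt]']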
+ # Try increasing the number of baselines or reducing the number of permutations""" + # logger.warn(msg) + # raise ValueError(msg) sets_done.append(set(synthetics_mut_indices)) @@ -269,7 +652,7 @@ def strip_x(dfs): def alternative(input_data: pd.DataFrame, plot_dir: Union[None, Path] = None, - boxcox: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: + boxcox: bool = False, two_way: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Generate alterntive (mutant) distributions for line and pecimen-level data @@ -289,10 +672,13 @@ def alternative(input_data: pd.DataFrame, """ # Group by line and sequntaily run - info_columns = ['staging', 'line', 'genotype'] # Columns of non-organ volumes in input_data + info_columns =['staging', 'line', 'genotype', 'treatment'] if two_way \ + else ['staging', 'line', 'genotype'] # Columns of non-organ volumes in input_data line_groupby = input_data.groupby('line') + + label_names = list(input_data.drop(['staging', 'line'], axis='columns').columns) baseline = input_data[input_data['line'] == 'baseline'] @@ -304,66 +690,161 @@ def alternative(input_data: pd.DataFrame, alt_spec_t = [] # Get line-level alternative distributions - for line_id, line_df in line_groupby: - - if line_id == 'baseline': - continue - - line_df.loc[:, 'genotype'] = 'hom' - line_df.drop(['line'], axis=1) - - df_wt_mut = pd.concat([baseline, line_df]) - - # Lm code needs daatpoints in numpy array and genotype+staging in dataframe - data_df = df_wt_mut.drop(columns=info_columns) + two_way_df = pd.DataFrame() + if two_way: + # create the proper two-by-two groups + for line_id, line_df in line_groupby: + if line_id == 'baseline': + line_df.loc[:, 'genotype'] = 'wt' + line_df.loc[:, 'treatment'] = 'veh' + elif line_id == 'mutants': + line_df.loc[:, 'genotype'] = 'mut' + line_df.loc[:, 'treatment'] = 'veh' + if line_id == 'treatment': + line_df.loc[:, 'genotype'] = 'wt' + line_df.loc[:, 'treatment'] = 'treat' + if line_id == 'mut_treat': + line_df.loc[:, 'genotype'] = 'mut' + line_df.loc[:, 'treatment'] = 'treat' + # merge the labels + two_way_df=pd.concat([two_way_df, line_df]) + + two_way_df.drop(['line'], axis=1) + data_df = two_way_df.drop(columns=info_columns) # Get columns (labels) where all speciemns have null values (i.e. all the line is QC-flagged at these labels) labels_to_skip = [col for col, isany in line_df.any().iteritems() if not isany] + if labels_to_skip: # Set the whole label column to zero. R:lm() will return NaN for this column data_df[labels_to_skip] = 0.0 - # Get a numpy array of the organ volumes data = data_df.values - info = df_wt_mut[['staging', 'line', 'genotype']] + info = two_way_df[['staging', 'treatment', 'genotype']] p: np.array t: np.array - p, t = lm_r(data, info) # returns p_values for all organs, 1 iteration - res_p = [line_id] + list(p) # line_name, label_1, label_2 ...... + p, t = lm_sm(data, info, two_way=True) + + res_p = ['two_way'] + list(p) # line_name, label_1, label_2 ...... 
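                # For clarity, the relabelling of the four 'line' groups above amounts to the 2x2
                # design below (sketch only; group names are those used elsewhere in this module):
                # two_way_design = {'baseline': ('wt', 'veh'), 'mutants': ('mut', 'veh'),
                #                   'treatment': ('wt', 'treat'), 'mut_treat': ('mut', 'treat')}
                # for grp, (geno, treat) in two_way_design.items():
                #     rows = input_data['line'] == grp
                #     input_data.loc[rows, ['genotype', 'treatment']] = (geno, treat)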
alt_line_pvalues.append(res_p) - res_t = [line_id] + list(t) + res_t = ['two_way'] + list(t) alt_line_t.append(res_t) - ### Get specimen-level alternative distributions ### - mutants = input_data[input_data['line'] != 'baseline'] - # baselines = input_data[input_data['line'] == 'baseline'] - for specimen_id, row in mutants.iterrows(): - row['genotype'] = 'hom' - line_id = row['line'] - df_wt_mut = baseline.append(row) - data_df = df_wt_mut.drop(columns=['line', 'genotype', 'staging']) - - # Get columns (labels) where the mutant specimen (as it's line level) - # has a null value (i.e. This specimen is QC-flagged at these labels) - labels_to_skip = [col for col, isany in pd.DataFrame(row).T.any().iteritems() if not isany] - if labels_to_skip: - # Set the whole label column to zero. R:lm() will return NaN for this column - data_df[labels_to_skip] = 0.0 + ### Great now the specimen values ### + # split the df again? ##TODO make this less stupid? + two_grped = two_way_df.groupby(['genotype','treatment']) - data = data_df.values + baseline = two_grped.get_group(('wt','veh')) + mutants = two_grped.get_group(('mut','veh')) + treatment = two_grped.get_group(('wt','treat')) + interaction = two_grped.get_group(('mut','treat')) + + def get_effects(group, inter=False): + for specimen_id, row in tqdm(group.iterrows(), total=group.shape[0]): + line_id = 'two_way' + df_wt_mut = baseline.append(row) + if inter: + df_wt_mut = df_wt_mut.append(mutants) + df_wt_mut = df_wt_mut.append(treatment) - info = df_wt_mut[['genotype', 'staging']] + data_df = df_wt_mut.drop(columns=['line', 'genotype', 'treatment', 'staging']) - p, t = lm_r(data, info) # returns p_values for all organs, 1 iteration - res_p = [line_id, specimen_id] + list(p) - alt_spec_pvalues.append(res_p) + # Get columns (labels) where the mutant specimen (as it's line level) + # has a null value (i.e. This specimen is QC-flagged at these labels) + labels_to_skip = [col for col, isany in pd.DataFrame(row).T.any().iteritems() if not isany] + if labels_to_skip: + # Set the whole label column to zero. R:lm() will return NaN for this column + data_df[labels_to_skip] = 0.0 - res_t = [specimen_id] + list(t) - alt_spec_t.append(res_t) + data = data_df.values + + info = df_wt_mut[['genotype', 'treatment', 'staging']] + #print("data", data) + #print("info", info) + #print("type of info", type(info)) + #print('Type of data', type(data[-1]), data[-1][0]) + p, t = lm_sm(data, info, two_way=True) # returns p_values for all organs, 1 iteration + + res_p = [line_id, specimen_id] + list(p) + alt_spec_pvalues.append(res_p) + + res_t = [specimen_id] + list(t) + alt_spec_t.append(res_t) + + # genotype effect + get_effects(mutants) + + # treatment effect + get_effects(treatment) + + get_effects(interaction, inter=True) + + else: + for line_id, line_df in line_groupby: + + if line_id == 'baseline': + continue + + line_df.loc[:, 'genotype'] = 'hom' + line_df.drop(['line'], axis=1) + + df_wt_mut = pd.concat([baseline, line_df]) + + # Lm code needs daatpoints in numpy array and genotype+staging in dataframe + data_df = df_wt_mut.drop(columns=info_columns) + + # Get columns (labels) where all speciemns have null values (i.e. all the line is QC-flagged at these labels) + labels_to_skip = [col for col, isany in line_df.any().iteritems() if not isany] + if labels_to_skip: + # Set the whole label column to zero. 
R:lm() will return NaN for this column + data_df[labels_to_skip] = 0.0 + + # Get a numpy array of the organ volumes + data = data_df.values + + info = df_wt_mut[['staging', 'line', 'genotype']] + + p: np.array + t: np.array + + p, t = lm_sm(data, info) # returns p_values for all organs, 1 iteration + + res_p = [line_id] + list(p) # line_name, label_1, label_2 ...... + alt_line_pvalues.append(res_p) + + res_t = [line_id] + list(t) + alt_line_t.append(res_t) + + ### Get specimen-level alternative distributions ### + mutants = input_data[input_data['line'] != 'baseline'] + # baselines = input_data[input_data['line'] == 'baseline'] + for specimen_id, row in mutants.iterrows(): + row['genotype'] = 'hom' + line_id = row['line'] + df_wt_mut = baseline.append(row) + data_df = df_wt_mut.drop(columns=['line', 'genotype', 'staging']) + + # Get columns (labels) where the mutant specimen (as it's line level) + # has a null value (i.e. This specimen is QC-flagged at these labels) + labels_to_skip = [col for col, isany in pd.DataFrame(row).T.any().iteritems() if not isany] + if labels_to_skip: + # Set the whole label column to zero. R:lm() will return NaN for this column + data_df[labels_to_skip] = 0.0 + + data = data_df.values + + info = df_wt_mut[['genotype', 'staging']] + + p, t = lm_sm(data, info) # returns p_values for all organs, 1 iteration + res_p = [line_id, specimen_id] + list(p) + alt_spec_pvalues.append(res_p) + + res_t = [specimen_id] + list(t) + alt_spec_t.append(res_t) # result dataframes have either line or specimen in index then labels alt_line_df = pd.DataFrame.from_records(alt_line_pvalues, columns=['line'] + label_names, index='line') @@ -372,6 +853,6 @@ def alternative(input_data: pd.DataFrame, alt_line_t_df = pd.DataFrame.from_records(alt_line_t, columns=['line'] + label_names, index='line') alt_spec_t_df = pd.DataFrame.from_records(alt_spec_t, columns=['specimen'] + label_names, - index='specimen') + index='specimen') return strip_x([alt_line_df, alt_spec_df, alt_line_t_df, alt_spec_t_df]) diff --git a/lama/stats/permutation_stats/p_thresholds.py b/lama/stats/permutation_stats/p_thresholds.py index 7f8f1c48..a6fa8aee 100644 --- a/lama/stats/permutation_stats/p_thresholds.py +++ b/lama/stats/permutation_stats/p_thresholds.py @@ -8,12 +8,14 @@ import pandas as pd import numpy as np - TESTING = False # If set to true the p-threshold will be set high an dthe fdr < 0.05 + + # to get some positive hits for testing -def get_thresholds(null_dist: pd.DataFrame, alt_dist: pd.DataFrame, target_threshold: float=0.05) -> pd.DataFrame: +def get_thresholds(null_dist: pd.DataFrame, alt_dist: pd.DataFrame, target_threshold: float = 0.05, + two_way: bool = False) -> pd.DataFrame: """ Calculate the per-organ p-value thresholds Given a wild type null distribution of p-values and a alternative (mutant) distribution @@ -48,79 +50,166 @@ def get_thresholds(null_dist: pd.DataFrame, alt_dist: pd.DataFrame, target_thres alt_dist = alt_dist.copy() for label in null_dist: - print(label) - wt_pvals = null_dist[label].values - mut_pvals = alt_dist[label].values + if two_way: + # TODO seem if you can improve performance + # convert this back to an array - # Debugging - wt_pvals.sort() - mut_pvals.sort() + wt_pvals = np.vstack(null_dist[label].values).transpose() + mut_pvals = np.vstack(alt_dist[label].values).transpose() - # Merge the p-values together get a list of available thresholds to use - all_p = list(wt_pvals) + list(mut_pvals) + # Join array and vstack them - transpose so the effects are rows + all_p = 
np.concatenate((wt_pvals, mut_pvals), axis=1) - all_p.sort() + # sort for each effect - pthresh_fdrs = [] + for row in all_p: + row.sort() - # For every available p-value from the null + alternative distributions, That is lower than 0.05 - # get the associated FDR for that threshold + # crete empty lists - all_p = [x for x in all_p if x <= 0.05] - for p_to_test in all_p: + p_fdr_df = [] - fdr_at_thresh = fdr_calc(wt_pvals, mut_pvals, p_to_test) + # iterate for each effect + for i, row in enumerate(all_p): + pthresh_fdrs = [] + row = [x for x in row if x <= 0.05] + # what to do if the rows are empty + for p_to_test in row: + # basically index only compares the correct effect - if fdr_at_thresh is not None: - pthresh_fdrs.append((p_to_test, fdr_at_thresh)) + fdr_at_thresh = fdr_calc(wt_pvals[i], mut_pvals[i], p_to_test) - # Create a DataFrame of p-value thresholds and associated FDRs - p_fdr_df = pd.DataFrame.from_records(pthresh_fdrs, columns=['p', 'fdr']) + if fdr_at_thresh is not None: + pthresh_fdrs.append((p_to_test, fdr_at_thresh)) - if len(p_fdr_df) > 0: + p_fdr = pd.DataFrame.from_records(pthresh_fdrs, columns=['p', 'fdr']) - p_under_target_fdr = p_fdr_df[p_fdr_df.fdr <= target_threshold] + p_fdr_df.append(p_fdr) - if len(p_under_target_fdr) < 1: - # No acceptable p-value threshold for this label. Choose minimum fdr. - lowest_fdr_row = p_fdr_df.loc[p_fdr_df['fdr'].idxmax()] - p_thresh = lowest_fdr_row['p'] - best_fdr = lowest_fdr_row['fdr'] - else: - row = p_fdr_df.loc[p_under_target_fdr.p.idxmax()] - p_thresh = row['p'] - best_fdr = row['fdr'] + # enumerate for performance + for i, p_fdr in enumerate(p_fdr_df): + + if len(p_fdr) > 0: + p_under_target_fdr = p_fdr[p_fdr.fdr <= target_threshold] + + if len(p_under_target_fdr) < 1: + lowest_fdr_row = p_fdr.loc[p_fdr['fdr'].idxmin()] + p_thresh = lowest_fdr_row['p'] + best_fdr = lowest_fdr_row['fdr'] + else: + row = p_fdr.loc[p_under_target_fdr.p.idxmax()] + p_thresh = row['p'] + best_fdr = row['fdr'] + + num_hits = len(mut_pvals[i][mut_pvals[i] <= p_thresh]) + num_null = len(wt_pvals[i]) + num_alt = len(mut_pvals[i]) + + num_null_lt_thresh = len(wt_pvals[i][wt_pvals[i] <= p_thresh]) + + else: + best_fdr = 1 + p_thresh = np.NAN + num_hits = 0 + num_null, num_null_lt_thresh, num_alt = ['NA'] * 3 - # Total number of paramerters across all lines that are below our p-value threshold - num_hits = len(mut_pvals[mut_pvals <= p_thresh]) + effect_list = ['genotype', 'treatment', 'interaction'] - num_null = len(wt_pvals) - num_alt = len(mut_pvals) + # test if not having an int here works + results.append([label, effect_list[i], p_thresh, best_fdr, + num_null, num_null_lt_thresh, num_alt, num_hits]) - num_null_lt_thresh = len(wt_pvals[wt_pvals <= p_thresh]) - # num_alt_lt_thresh = len(mut_pvals[mut_pvals <= p_thresh]) else: - best_fdr = 1 - p_thresh = np.NAN - num_hits = 0 - num_null, num_null_lt_thresh, num_alt = ['NA'] * 3 + wt_pvals = null_dist[label].values + mut_pvals = alt_dist[label].values + + wt_pvals.sort() + mut_pvals.sort() + all_p = list(wt_pvals) + list(mut_pvals) + all_p.sort() + pthresh_fdrs = [] + # For every available p-value from the null + alternative distributions, That is lower than 0.05 + # get the associated FDR for that threshold + all_p = [x for x in all_p if x <= 0.05] + for p_to_test in all_p: + + fdr_at_thresh = fdr_calc(wt_pvals, mut_pvals, p_to_test) + + if fdr_at_thresh is not None: + pthresh_fdrs.append((p_to_test, fdr_at_thresh)) + + # create a dataframe of p-value thresholds and associated fdrs + p_fdr_df = 
pd.dataframe.from_records(pthresh_fdrs, columns=['p', 'fdr']) + + if len(p_fdr_df) > 0: + p_under_target_fdr = p_fdr_df[p_fdr_df.fdr <= target_threshold] + + if len(p_under_target_fdr) < 1: + lowest_fdr_row = p_fdr_df.loc[p_fdr_df['fdr'].idxmin()] + p_thresh = lowest_fdr_row['p'] + best_fdr = lowest_fdr_row['fdr'] + else: + row = p_fdr_df.loc[p_under_target_fdr.p.idxmax()] + p_thresh = row['p'] + best_fdr = row['fdr'] + + # Total number of parameters across all lines that are below our p-value threshold + num_hits = len(mut_pvals[mut_pvals <= p_thresh]) + + num_null = len(wt_pvals) + num_alt = len(mut_pvals) + num_null_lt_thresh = len(wt_pvals[wt_pvals <= p_thresh]) + # num_alt_lt_thresh = len(mut_pvals[mut_pvals <= p_thresh]) + + else: + best_fdr = 1 + p_thresh = np.NAN + num_hits = 0 + num_null, num_null_lt_thresh, num_alt = ['NA'] * 3 + + results.append([int(label), p_thresh, best_fdr, + num_null, num_null_lt_thresh, num_alt, num_hits]) + + # iteration over each group needs to be done separately + + # if n_accept_pvals < 1: + # # No acceptable p-value threshold for this label. Choose minimum fdr. + # lowest_fdr_row = [p_fdr.loc[p_fdr['fdr'].idxmin()] for p_fdr in p_fdr_df] + # p_thresh = [row['p'] for row in lowest_fdr_row] + # best_fdr = [row['fdr'] for row in lowest_fdr_row] + # + # elif two_way: + # # this is amazing if it works + # rows = [] + # for i, p_fdr in enumerate(p_fdr_df): + # rows.append(p_fdr.loc[p_under_target_fdr[i].p.idxmax()]) + # p_thresh = [row['p'] for row in rows] + # best_fdr = [row['fdr'] for row in rows] # TODO: what about if the labels are not numbers - results.append([int(label), p_thresh, best_fdr, - num_null, num_null_lt_thresh, num_alt, num_hits]) - header = ['label', 'p_thresh', 'fdr', - 'num_null', 'num_null_lt_thresh', 'num_alt', 'num_alt_lt_thresh'] + if two_way: + header = ['label', 'effect', 'p_thresh', 'fdr', + 'num_null', 'num_null_lt_thresh', 'num_alt', 'num_alt_lt_thresh'] + + result_df = pd.DataFrame.from_records(results, columns=header, index='label') + result_df.sort_values(by=['label','effect'], inplace=True) + + else: + header = ['label', 'p_thresh', 'fdr', + 'num_null', 'num_null_lt_thresh', 'num_alt', 'num_alt_lt_thresh'] + + result_df = pd.DataFrame.from_records(results, columns=header, index='label') + result_df.sort_values(by='label', inplace=True) + - result_df = pd.DataFrame.from_records(results, columns=header, index='label') - result_df.sort_values(by='label', inplace=True) return result_df -def fdr_calc(null_pvals, alt_pvals, thresh) -> float: +def fdr_calc(null_pvals, alt_pvals, thresh, two_way=False) -> float: """ Calculate the False Discovery Rate for a given p-value threshold and a null and alternative distribution Parameters @@ -133,25 +222,26 @@ def fdr_calc(null_pvals, alt_pvals, thresh) -> float: Returns ------- fdr [0.0,1.0] + or None if both ratio_wt_under_thresh / ratio_mut_under_threshold are zero (Currently looking into ths) """ null_pvals = np.sort(null_pvals) + alt_pvals = np.sort(alt_pvals) + ratio_wt_under_thresh = len(null_pvals[null_pvals < thresh]) / len(null_pvals) ratio_mut_under_threshold = len(alt_pvals[alt_pvals < thresh]) / len(alt_pvals) + + try: + # if there are no wild-types, the FDR is 0? fdr = ratio_wt_under_thresh / ratio_mut_under_threshold except ZeroDivisionError: # No mutants at this threshold. 
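    # Worked example of this ratio-of-proportions estimate: if 30 of 1000 null p-values and
    # 12 of 48 alternative p-values fall below the candidate threshold, the FDR is
    # (30 / 1000) / (12 / 48) = 0.03 / 0.25 = 0.12. get_thresholds() then keeps the largest
    # candidate p-value whose FDR is <= the target, falling back to the minimum-FDR row when
    # no candidate reaches it.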
return None # If the null is skewed to the right, we might get FDR values greater than 1, which does not make sense + fdr = np.clip(fdr, 0, 1) return fdr - - - - - - diff --git a/lama/stats/permutation_stats/run_permutation_stats.py b/lama/stats/permutation_stats/run_permutation_stats.py index d3acf592..96bab124 100644 --- a/lama/stats/permutation_stats/run_permutation_stats.py +++ b/lama/stats/permutation_stats/run_permutation_stats.py @@ -41,34 +41,40 @@ from pathlib import Path from datetime import date - +import re import pandas as pd import numpy as np from scipy.stats import zmap from logzero import logger as logging import yaml - +from itertools import compress from lama import common from lama.stats.permutation_stats import distributions from lama.stats.permutation_stats import p_thresholds from lama.paths import specimen_iterator, get_specimen_dirs, LamaSpecimenData from lama.qc.organ_vol_plots import make_plots, pvalue_dist_plots -from lama.common import write_array, read_array, init_logging, LamaDataException +from lama.common import write_array, read_array, init_logging, git_log, LamaDataException from lama.stats.common import cohens_d from lama.stats.penetrence_expressivity_plots import heatmaps_for_permutation_stats GENOTYPE_P_COL_NAME = 'genotype_effect_p_value' +TREAT_P_COL_NAME = 'treatment_effect_p_value' +INTER_P_COL_NAME = 'interaction_effect_p_value' + PERM_SIGNIFICANT_COL_NAME = 'significant_cal_p' + +PERM_SIGNIFICANT_COL_LIST = ['significant_cal_p_geno', 'significant_cal_p_treat', 'significant_cal_p_inter'] + PERM_T_COL_NAME = 't' -def write_specimen_info(wt_wev, mut_wev, outfile, sd=2.0): +def write_specimen_info(wt_wev, mut_wev, outfile): """ Write a csv with some summary info on specimens currently only returns Z-score of mutants """ + def sortwev(x): - print(x) return x wev_z = zmap(mut_wev.staging, wt_wev.staging) mut_wev['WEV_zscore'] = wev_z @@ -76,6 +82,63 @@ def sortwev(x): mut_wev.to_csv(outfile) +def get_radiomics_data(rad_dir: Path, wt_dir: Path, mut_dir: Path, treat_dir: Path, inter_dir: Path) -> pd.DataFrame: + """ + Given a root registration directory, collate all the organ volume CSVs into one file. + Write out the combined organ volume CSV into the root registration directory. + + Parameters + ---------- + root_dir + The path to the root registration directory + + Returns + ------- + The combined data frame of all the organ volumes + specimen id in index organs in rows + """ + + + + # get the features_per_embryo and convert it into per organs + feature_dir = rad_dir / "features" + #common.gather_rad_data(feature_dir) + + org_dir = rad_dir / "organs" + + # get the organ data and load it as one massive file + file_names = [spec for spec in common.get_file_paths(folder=org_dir, extension_tuple=".csv") if "0." in str(spec)] + file_names.sort() + df_list = [] + + + staging = pd.concat([get_staging_data(_dir) for _dir in [wt_dir, mut_dir, treat_dir, inter_dir]]).rename( + columns={'value': 'staging'}) + + for org_name in file_names: + # read dataset + d = pd.read_csv(org_name, index_col=0).dropna(axis=1) + # tag the columns with the organ_number + + # For some reason, "." 
stuffs up the pipeline and adds a space, just remove it + org = str(d.org[0]).replace(".0","") + d.drop(columns=['HPE', 'genotype', 'background', 'org'], inplace=True) + + # patsy has a fit with "-" thinks I'm subtracting + # I use '__' as a method to identifiy radiomics data + d.set_axis([(col + '__' + org).replace("-","_") for col in d.columns], axis=1, inplace=True) + d = d.reindex(staging.index) + d.divide(staging['staging']) + df_list.append(d) + + # horizontal merge - hope it works + data = pd.concat(df_list, axis=1) + #data = data.loc[:, data.columns.str.contains('shape')] + data = data.loc[:, ~data.columns.str.contains('2D')] + data = pd.concat([data, staging], axis=1) + return data + + def get_organ_volume_data(root_dir: Path) -> pd.DataFrame: """ Given a root registration directory, collate all the organ volume CSVs into one file. @@ -97,7 +160,7 @@ def get_organ_volume_data(root_dir: Path) -> pd.DataFrame: s: LamaSpecimenData for s in get_specimen_dirs(root_dir): - # for line_dir, specimen_dir in specimen_iterator(output_dir): + # for line_dir, specimen_dir in specimen_iterator(output_dir): # organ_vol_file = specimen_dir / 'output' / common.ORGAN_VOLUME_CSV_FILE organ_vol_file = s.outroot / common.ORGAN_VOLUME_CSV_FILE @@ -141,7 +204,7 @@ def get_staging_data(root_dir: Path) -> pd.DataFrame: s: LamaSpecimenData for s in get_specimen_dirs(root_dir): - # for line_dir, specimen_dir in specimen_iterator(output_dir): + # for line_dir, specimen_dir in specimen_iterator(output_dir): # staging_info = specimen_dir / 'output' / common.STAGING_INFO_FILENAME staging_info = s.outroot / common.STAGING_INFO_FILENAME @@ -166,13 +229,15 @@ def get_staging_data(root_dir: Path) -> pd.DataFrame: def annotate(thresholds: pd.DataFrame, lm_results: pd.DataFrame, lines_root_dir: Path, - line_level: bool = True, + is_line_level: bool = True, label_info: Path = None, label_map: Path = None, write_thresholded_inv_labels=False, - fdr_threshold: float=0.05, - t_values: pd.DataFrame=None, - organ_volumes: pd.DataFrame=None): + fdr_threshold: float = 0.05, + t_values: pd.DataFrame = None, + organ_volumes: pd.DataFrame = None, + two_way: bool = False, + main_of_two_way: bool = False) -> pd.DataFrame: """ Using the p_value thresholds and the linear model p-value results, create the following CSV files @@ -190,7 +255,7 @@ def annotate(thresholds: pd.DataFrame, cols: labels (+ line_id for specimen_level) lines_root_dir The root directory to save the annotated CSV files. Each line to go in a subfolder - line_level + is_line_level if not True, place results in specimen-level sub directory label_info CSV to map label number to name @@ -199,38 +264,85 @@ def annotate(thresholds: pd.DataFrame, organ_volumes All the organ volumes for baselines and mutants (as it was used in lm(), so probably normalised to whole embryo + Returns + ------- + Aggregated hit dataframe Notes ----- TODO: Add file number prefixes so we don't overwrite mulyiple analyses done on the same day TODO: the organ_volumes folder name is hard-coded. What about if we add a new analysis type to the permutation stats pipeline? """ - collated = [] + hit_dataframes = [] if label_map: label_map = read_array(label_map) + if two_way: + thresholds = thresholds.pivot(columns='effect') # Iterate over each line or specimen (for line or specimen-level analysis) for id_, row in lm_results.iterrows(): # Create a dataframe containing a p-value column. 
each row an organ df = row.to_frame() - - if not line_level: + if not is_line_level: # specimen-level has an extra line column we need to remove df = df.T.drop(columns=['line']).T # Rename the line_specimen column to be more informative - df.rename(columns={id_: GENOTYPE_P_COL_NAME}, inplace=True) - if line_level: + if (two_way and not main_of_two_way): + df.drop(labels=['line'], axis=0, errors='ignore', inplace=True) + + try: + fixed_vals = np.stack(df[id_]) + df['genotype_effect_p_value'] = pd.to_numeric(fixed_vals[:, 0]) + df['interaction_effect_p_value'] = pd.to_numeric(fixed_vals[:, 2]) + df['treatment_effect_p_value'] = pd.to_numeric(fixed_vals[:, 1]) + df.drop(columns=['two_way'], errors='ignore', inplace=True) + df.drop(labels=['line'], axis=0, errors='ignore', inplace=True) + + except IndexError: + # data wrangling - remove brackets and convert values to float + fixed_vals = pd.DataFrame([re.sub('\[|\]', '', val).split(' ')[0:3] + for val in df[id_]], index=df.index) + df['genotype_effect_p_value'] = pd.to_numeric(fixed_vals[0], errors='coerce') + df['interaction_effect_p_value'] = pd.to_numeric(fixed_vals[2], errors='coerce') + df['treatment_effect_p_value'] = pd.to_numeric(fixed_vals[1], errors='coerce') + + # fix up the specimen main two-ways + elif main_of_two_way: + df.drop(labels=['line'], axis=0, errors='ignore', inplace=True) + + try: + spec_name = df.columns + df = pd.DataFrame(np.stack(df.iloc[:, 0]), index=df.index) + # print("fixed_val ", fixed_vals, type(fixed_vals)) + # df = pd.DataFrame(fixed_vals, index=df.index) + # print("numeric val", df) + df.rename(columns={0: GENOTYPE_P_COL_NAME}, inplace=True) + + except IndexError: + # this is only really for testing where the the arrays are not properly written by to_csv + + fixed_vals = pd.DataFrame([re.sub('\[|\]', '', val) for val in df.iloc[:, 0]], index=df.index) + df = pd.DataFrame(pd.to_numeric(fixed_vals[0]), index=df.index) + df.rename(columns={0: GENOTYPE_P_COL_NAME}, inplace=True) + + else: + df.rename(columns={id_: GENOTYPE_P_COL_NAME}, inplace=True) + + if is_line_level: line = id_ else: line = row['line'] + spec_id = id_ # Merge the permutation results (p-thresh, fdr, number of hit lines for this label) with the mutant results - df.index = df.index.astype(np.int64) # Index needs to be cast from object to enable merge + + df.index = df.index.astype(str) # Index needs to be cast from object to enable merge df = df.merge(thresholds, left_index=True, right_index=True, validate='1:1') + df.index.name = 'label' # Merge the t-statistics @@ -243,7 +355,7 @@ def annotate(thresholds: pd.DataFrame, t_df.columns = ['t'] t_df.drop(columns=['line'], errors='ignore', inplace=True) # this is for speciem-level results - t_df.index = t_df.index.astype(np.int64) + t_df.index = t_df.index.astype(str) # index must be string for radiomics data, organ data doesn't seem to care? 
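        # Note on the two-way unpacking above: each lm_results cell holds the three effect
        # p-values for one label, so np.stack turns that column of length-3 arrays into an
        # (n_labels, 3) matrix whose columns are genotype, treatment and interaction; the
        # regex branch covers the case where those arrays have been round-tripped through CSV
        # as strings. Roughly (hypothetical values):
        #   np.stack(pd.Series([np.array([.01, .2, .6]), np.array([.5, .03, .9])])).shape == (2, 3)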
df = df.merge(t_df, left_index=True, right_index=True, validate='1:1') if len(df) < 1: @@ -251,42 +363,111 @@ def annotate(thresholds: pd.DataFrame, # Add mean organ vol difference and cohens d df['mean_vol_ratio'] = None - if line_level: + if is_line_level: df['cohens_d'] = None for label, row in df.iterrows(): + # Organ vols are prefixed with x so it can work with statsmodels - label_col = f'x{label}' + label_col = f'{label}'if str(label).__contains__("__") else f'x{label}' label_organ_vol = organ_volumes[[label_col, 'line']] - wt_ovs = label_organ_vol[label_organ_vol.line == 'baseline'][f'x{label}'] - mut_ovs = label_organ_vol[label_organ_vol.line == line][f'x{label}'] - df.loc[label, 'mean_vol_ratio'] = mut_ovs.mean() / wt_ovs.mean() - if line_level: - cd = cohens_d(mut_ovs,wt_ovs) - df.loc[label, 'cohens_d'] = cd + wt_ovs = label_organ_vol.loc[label_organ_vol.line == 'baseline',label_col] + + if two_way or main_of_two_way: + # I think this is the only way to get the combs.... + + # Giving ChatGPT a chance to shine - it loves using .loc, checking data is not null convert it to numpy + mut_ovs = label_organ_vol.loc[ + (label_organ_vol.line == 'mutants') & label_organ_vol[label_col].notnull(), + label_col + ].to_numpy() + + treat_ovs = label_organ_vol.loc[ + (label_organ_vol.line == 'treatment') & label_organ_vol[label_col].notnull(), + label_col + ].to_numpy() + + int_ovs = label_organ_vol.loc[ + (label_organ_vol.line == 'mut_treat') & label_organ_vol[label_col].notnull(), + label_col + ].to_numpy() + + non_int_ovs = label_organ_vol.loc[ + label_organ_vol.line.isin(['baseline', 'mutants', 'treatment']) & label_organ_vol[ + label_col].notnull(), + label_col + ].to_numpy() + + #line.values should be string + if 'mut_treat' in label_organ_vol.line.values: + num_ovs = int_ovs + dem_ovs = non_int_ovs + elif 'treatment' in label_organ_vol.line.values: + num_ovs = treat_ovs + dem_ovs = wt_ovs + else: + num_ovs = mut_ovs + dem_ovs = wt_ovs + + # Specimen level - overwrite the num_ovs to be the single emb of interest + if not is_line_level and two_way: + num_ovs = label_organ_vol.loc[label_organ_vol.index == row.index[0], label_col] + + elif not is_line_level and main_of_two_way: + num_ovs = label_organ_vol.loc[label_organ_vol.index == spec_name[0], label_col] + + + df.loc[label, 'mean_vol_ratio'] = num_ovs.mean() / dem_ovs.mean() + if is_line_level: + df.loc[label, 'cohens_d'] = cohens_d(num_ovs, dem_ovs) + + else: + mut_ovs = label_organ_vol[label_organ_vol.line == line][label_col] + + df.loc[label, 'mean_vol_ratio'] = mut_ovs.mean() / wt_ovs.mean() + if is_line_level: + cd = cohens_d(mut_ovs, wt_ovs) + df.loc[label, 'cohens_d'] = cd output_name = f'{id_}_organ_volumes_{str(date.today())}.csv' line_output_dir = lines_root_dir / line line_output_dir.mkdir(exist_ok=True) - if not line_level: + if not is_line_level: # If dealing with specimen-level stats, make subfolder to put results in line_output_dir = line_output_dir / 'specimen_level' / id_ line_output_dir.mkdir(parents=True, exist_ok=True) output_path = line_output_dir / output_name - add_significance(df, fdr_threshold) + + add_two_way_significance(df, fdr_threshold) if two_way else add_significance(df, fdr_threshold) if label_info: - df = add_label_names(df , label_info) + df = add_label_names(df, label_info) df.to_csv(output_path) - hit_df = df[df['significant_cal_p'] == True] - collated.append(hit_df) + if two_way: + # print(any(df[PERM_SIGNIFICANT_COL_LIST] == True, axis = 'columns')) + + eff_there = [(GENOTYPE_P_COL_NAME in 
df.columns), + (TREAT_P_COL_NAME in df.columns), + (INTER_P_COL_NAME in df.columns)] + + PERM_COL_LIST = list(compress(PERM_SIGNIFICANT_COL_LIST, eff_there)) + + hit_df = df[(df[PERM_COL_LIST] == True).any(axis='columns')] + else: + hit_df = df[df['significant_cal_p'] == True] + hit_df['line'] = line + + if not is_line_level: + hit_df['specimen'] = spec_id + + hit_dataframes.append(hit_df) hit_labels_out = line_output_dir / f'{line}__hit_labels.nrrd' @@ -294,7 +475,9 @@ def annotate(thresholds: pd.DataFrame, if write_thresholded_inv_labels: _write_thresholded_label_map(label_map, hits, hit_labels_out) - return collated + + collated_df = pd.concat(hit_dataframes) + return collated_df def _write_thresholded_label_map(label_map: np.ndarray, hits, out: Path): @@ -314,11 +497,23 @@ def _write_thresholded_label_map(label_map: np.ndarray, hits, out: Path): def add_label_names(df: pd.DataFrame, label_info: Path) -> pd.DataFrame: - + """ + Added label names to hits dataframe with merge on label metadata + """ label_df = pd.read_csv(label_info, index_col=0) - - df = df.merge(right=label_df[['label_name']], left_index=True, right_index=True) - + #if its radiomics data, the columns will have __ + if df.index[0].__contains__("__"):# this is for radiomics data + # 3D stuffs up labelling + label_nums = [int(re.findall('\d+', _row.replace('3D', ""))[0]) for _row in df.index] + df['label_name'] = [label_df.loc[num]['label_name'] for num in label_nums] + # so this just adds the label_name and no_analysis columns by matching the label number with the feature + if 'no_analysis' in label_df: + df['no_analysis'] = [label_df.loc[num]['no_analysis'] for num in label_nums] + else: + label_df.index = label_df.index.astype(str) + df = df.merge(right=label_df[['label_name']], left_index=True, right_index=True) + if 'no_analysis' in label_df: + df = df.merge(right=label_df[['no_analysis']], left_index=True, right_index=True) return df @@ -334,15 +529,46 @@ def add_significance(df: pd.DataFrame, threshold: float): df.sort_values(by=[PERM_SIGNIFICANT_COL_NAME, GENOTYPE_P_COL_NAME], ascending=[False, True], inplace=True) +def add_two_way_significance(df: pd.DataFrame, threshold: float): + """ + Add a significance column to the output csv in place. + Set significance to True if the genotype p-value is lower than the p threshold for that organ + and the fdr is lower than fdr threshold. 
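+    For example, significant_cal_p_geno is set True for a label when its
+    genotype_effect_p_value is at or below the genotype p_thresh for that label and the
+    genotype fdr is at or below the threshold argument; the treatment and interaction
+    columns follow the same rule for their own effect (wording is illustrative, the exact
+    column names come from PERM_SIGNIFICANT_COL_LIST).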
+ And sort values by significance + """ + + eff_there = [(GENOTYPE_P_COL_NAME in df.columns), + (TREAT_P_COL_NAME in df.columns), + (INTER_P_COL_NAME in df.columns)] + + P_COL_LIST = [('genotype', GENOTYPE_P_COL_NAME), + ('treatment', TREAT_P_COL_NAME), + ('interaction', INTER_P_COL_NAME)] + + # cond_list = ['genotype', 'treatment', 'interaction'] + sort_list = list(compress([False, False, False], eff_there)) + + PERM_COL_LIST = list(compress(PERM_SIGNIFICANT_COL_LIST, eff_there)) + + for i, cond in enumerate(list(compress(P_COL_LIST, eff_there))): + df[PERM_COL_LIST[i]] = (df[cond[1]] <= df[('p_thresh', cond[0])]) \ + & (df[('fdr', cond[0])] <= threshold) + + df.sort_values(by=PERM_COL_LIST, ascending=sort_list, inplace=True) + + def prepare_data(wt_organ_vol: pd.DataFrame, wt_staging: pd.DataFrame, mut_organ_vol: pd.DataFrame, mut_staging: pd.DataFrame, label_meta: Path = None, normalise_to_whole_embryo=False, - qc_file: Path = None) -> pd.DataFrame: + qc_file: Path = None, + two_way_data: list = [], + two_way: bool = False, + rad_data: bool = False) -> pd.DataFrame: """ - Merge the mutant ans wildtype dtaframes + Merge the mutant and wildtype dtaframes Optionally normalise to staging metric (Usually whole embryo volume) Optionally remove any qc-flagged organs (These will be set to 'nan') @@ -359,21 +585,42 @@ def prepare_data(wt_organ_vol: pd.DataFrame, mut_staging.rename(columns={'value': 'staging'}, inplace=True) wt_staging.index = wt_staging.index.astype(str) + if two_way: + # unpack data + treat_staging, treat_organ_vol, inter_staging, inter_organ_vol = two_way_data + # Now do essentially the same stuff as wt and muts + treat_staging.rename(columns={'value': 'staging'}, inplace=True) + inter_staging.rename(columns={'value': 'staging'}, inplace=True) + + else: + # just the one-way + # Ensure all indices are same type + for d in [wt_organ_vol, mut_organ_vol, wt_staging, mut_staging]: + d.index = d.index.astype(str) + if normalise_to_whole_embryo: + logging.info('Normalising organ volume to whole embryo volume') wt_organ_vol = wt_organ_vol.divide(wt_staging['staging'], axis=0) mut_organ_vol = mut_organ_vol.divide(mut_staging['staging'], axis=0) - logging.info('Normalising organ volume to whole embryo volume') + if two_way: + # normalise the other groups + treat_organ_vol = treat_organ_vol.divide(treat_staging['staging'], axis=0) + inter_organ_vol = inter_organ_vol.divide(inter_staging['staging'], axis=0) # merge the organ vol - organ_vols = pd.concat([wt_organ_vol, mut_organ_vol]) + # list comprehension cause why not + organ_vols = pd.concat([wt_organ_vol, mut_organ_vol, treat_organ_vol, inter_organ_vol]) if two_way \ + else pd.concat([wt_organ_vol, mut_organ_vol]) # Drop any organ columns that has only zero values. These are the gaps in the label map caused by merging labels + # in the atlas organ_vols = organ_vols.loc[:, (organ_vols != 0).any(axis=0)] - # For the statsmodels linear mode to work, column names cannot start with a digid. Prefix with 'x' + # For the statsmodels linear mode to work, column names cannot start with a digit. Prefix with 'x' organ_vols.columns = [f'x{x}' if x.isdigit() else x for x in organ_vols.columns] - staging = pd.concat([wt_staging, mut_staging]) + staging = pd.concat([wt_staging, mut_staging, treat_staging, inter_staging]) if two_way \ + else pd.concat([wt_staging, mut_staging]) # Merge staging to the organvolume dataframe. 
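    # For reference, the frame being assembled here ends up with the layout assumed downstream
    # in run(): one row per specimen id, organ-volume columns prefixed with 'x' (divided by
    # whole-embryo volume when normalise_to_whole_embryo is set), plus 'staging' and 'line'
    # columns at the end.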
First drop line so we don't get duplicate entries # staging.drop(columns=['line'], inplace=True) @@ -385,24 +632,28 @@ def prepare_data(wt_organ_vol: pd.DataFrame, label_meta = pd.read_csv(label_meta, index_col=0) - if 'no_analysis' in label_meta: # If we have a no_analysis column, drop labels that are flagged - flagged_lables = label_meta[label_meta.no_analysis == True].index - data.drop(columns=[f'x{x}' for x in flagged_lables if f'x{x}' in data] , inplace=True) + if 'no_analysis' in label_meta: + # If we have a no_analysis column, drop labels that are flagged + + flagged_labels = label_meta[label_meta.no_analysis == True].index + + data.drop(columns=[f'x{x}' for x in flagged_labels if f'x{x}' in data], inplace=True) # QC-flagged organs from specimens specified in QC file are set to None if qc_file: - logging.info(f'Excluding specimen organs from {qc_file}') - qc = pd.read_csv(qc_file, index_col=0) + logging.info(f'Excluding organ volumes specified in: {qc_file}') + qc = pd.read_csv(qc_file) - for idx, row in qc.iterrows(): + for _, row in qc.iterrows(): + qc_id = str(row.id) - if idx not in data.index: - raise LamaDataException(f'QC flagged specimen {idx} does not exist in dataset') + if qc_id not in data.index: + raise LamaDataException(f'QC flagged specimen {row.id} does not exist in dataset') if f'x{row.label}' not in data: - raise LamaDataException(f'QC flagegd label, {row.label}, does not exist in dataset') + raise LamaDataException(f'QC flagged label, {row.label}, does not exist in dataset') - data.loc[idx, f'x{row.label}'] = None + data.loc[qc_id, f'x{row.label}'] = None return data @@ -416,7 +667,12 @@ def run(wt_dir: Path, line_fdr: float = 0.05, specimen_fdr: float = 0.2, normalise_to_whole_embryo: bool = True, - qc_file: Path = None): + qc_file: Path = None, + voxel_size: float = 1.0, + two_way: bool = False, + treat_dir: Path = None, + inter_dir: Path = None, + rad_dir: Path = None): """ Run the permutation-based stats pipeline @@ -450,34 +706,82 @@ def run(wt_dir: Path, - line: the line id - label: the label to exclude (int) - label_name (optional) + voxel_size + For calcualting organ volumes + two_way + Activates the two-way simulation """ # Collate all the staging and organ volume data into csvs np.random.seed(999) init_logging(out_dir / 'stats.log') - logging.info(common.git_log()) + logging.info(git_log()) logging.info(f'Running {__name__} with following commands\n{common.command_line_agrs()}') - logging.info('Searching for staging data') - wt_staging = get_staging_data(wt_dir) - mut_staging = get_staging_data(mut_dir) - - logging.info('searching for organ volume data') - wt_organ_vol = get_organ_volume_data(wt_dir) - mut_organ_vol = get_organ_volume_data(mut_dir) - data = prepare_data(wt_organ_vol, + # data + # index: spec_id + # cols: label_nums, with staging and line columns at the end + if rad_dir: + logging.info('Searching for staging data') + wt_staging = get_staging_data(wt_dir) + mut_staging = get_staging_data(mut_dir) + logging.info('Collecting Radiomics data') + data = get_radiomics_data(rad_dir, wt_dir, mut_dir, treat_dir, inter_dir) + # turn on textures at your own risk + data.to_csv(out_dir / 'radiomics_data.csv') + + else: + logging.info('Searching for staging data') + wt_staging = get_staging_data(wt_dir) + mut_staging = get_staging_data(mut_dir) + + logging.info('searching for organ volume data') + wt_organ_vol = get_organ_volume_data(wt_dir) + mut_organ_vol = get_organ_volume_data(mut_dir) + if two_way: + logging.info('Searching for two-way staging and 
organ volume data') + treat_staging = get_staging_data(treat_dir) + inter_staging = get_staging_data(inter_dir) + treat_organ_vol = get_organ_volume_data(treat_dir) + inter_organ_vol = get_organ_volume_data(inter_dir) + two_way_data = [treat_staging, treat_organ_vol, + inter_staging, inter_organ_vol] + else: + two_way_data = [] + data = prepare_data(wt_organ_vol, wt_staging, mut_organ_vol, mut_staging, label_meta=label_info, normalise_to_whole_embryo=normalise_to_whole_embryo, - qc_file=qc_file) + qc_file=qc_file, + two_way=two_way, + two_way_data=two_way_data) + + data.to_csv(out_dir / 'input_data.csv') + + + + # get rad data + + + + # Make plots + # data_for_plots = data.copy() + # data_for_plots.columns = [x.strip('x') for x in data_for_plots.columns] # Strip any xs + # # If data has been normalised to WEV revert back for plots + # if normalise_to_whole_embryo: + # for col in data_for_plots.columns: + # if col.isdigit(): + # data_for_plots[col] = data_for_plots[col] * data_for_plots['staging'] + lines_root_dir = out_dir / 'lines' + # make_plots(data_for_plots, label_info, lines_root_dir, voxel_size=voxel_size) # Keep a record of the input data used in the analsysis - data.to_csv(out_dir / 'input_data.csv') + # Keep raw data for plotting - raw_wt_vols = wt_organ_vol.copy() + # raw_wt_vols = wt_organ_vol.copy() # These includes QCd speciemns need to remove out_dir.mkdir(exist_ok=True, parents=True) # Root directory for output @@ -486,20 +790,23 @@ def run(wt_dir: Path, dists_out.mkdir(exist_ok=True) # Get the null distributions - line_null, specimen_null = distributions.null(data, num_perms) + logging.info('Generating null distribution') + line_null, specimen_null = distributions.null(data, num_perms, two_way=two_way) # with open(dists_out / 'null_ids.yaml', 'w') as fh: # yaml.dump(null_ids, fh) - null_line_pvals_file = dists_out / 'null_line_dist_pvalues.csv' null_specimen_pvals_file = dists_out / 'null_specimen_dist_pvalues.csv' + null_line_pvals_file = dists_out / 'null_line_dist_pvalues.csv' + # Write the null distributions to file line_null.to_csv(null_line_pvals_file) specimen_null.to_csv(null_specimen_pvals_file) # Get the alternative p-value distribution (and t-values now (2 and 3) - line_alt, spec_alt, line_alt_t, spec_alt_t = distributions.alternative(data) + logging.info('Generating alternative distribution') + line_alt, spec_alt, line_alt_t, spec_alt_t = distributions.alternative(data, two_way=two_way) line_alt_pvals_file = dists_out / 'alt_line_dist_pvalues.csv' spec_alt_pvals_file = dists_out / 'alt_specimen_dist_pvalues.csv' @@ -508,57 +815,143 @@ def run(wt_dir: Path, line_alt.to_csv(line_alt_pvals_file) spec_alt.to_csv(spec_alt_pvals_file) - line_organ_thresholds = p_thresholds.get_thresholds(line_null, line_alt) - specimen_organ_thresholds = p_thresholds.get_thresholds(specimen_null, spec_alt) + line_organ_thresholds = p_thresholds.get_thresholds(line_null, line_alt, two_way=two_way) line_thresholds_path = dists_out / 'line_organ_p_thresholds.csv' - spec_thresholds_path = dists_out / 'specimen_organ_p_thresholds.csv' - line_organ_thresholds.to_csv(line_thresholds_path) - specimen_organ_thresholds.to_csv(spec_thresholds_path) + + # let's tidy up our data from the specimen calls in the two_way + if two_way: + # TODO: Don't hard-code this + specimen_inter_nulls = specimen_null[specimen_null.iloc[:, 0].str.len() == 3] + + specimen_main_nulls = specimen_null[specimen_null.iloc[:, 0].str.len() == 1] + specimen_geno_nulls, specimen_treat_nulls = np.vsplit(specimen_main_nulls, 
2) + + + specimen_inter_alt = spec_alt[spec_alt.iloc[:, 1].str.len() == 3] + + + specimen_main_alt = spec_alt[spec_alt.iloc[:, 1].str.len() == 1] + + + + # so firstly let's get the names and conditions from the data + group_info = data['line'] + + + # TODO: think whether to truly put mut_treat in main comparisons + mut_names = group_info[(group_info == 'mutants') | (group_info == 'mut_treat')].index + treat_names = group_info[(group_info == 'treatment') | (group_info == 'mut_treat')].index + + specimen_geno_alt = specimen_main_alt[specimen_main_alt.index.isin(mut_names)] + specimen_treat_alt = specimen_main_alt[specimen_main_alt.index.isin(treat_names)] + + geno_alt_path = dists_out / 'specimen_geno_pvals.csv' + treat_alt_path = dists_out / 'specimen_treat_pvals.csv' + inter_alt_path = dists_out / 'specimen_inter_pvals.csv' + + specimen_geno_alt.to_csv(geno_alt_path) + specimen_treat_alt.to_csv(treat_alt_path) + specimen_inter_alt.to_csv(inter_alt_path) + + geno_thresholds = p_thresholds.get_thresholds(specimen_geno_nulls, specimen_geno_alt, two_way=two_way) + treat_thresholds = p_thresholds.get_thresholds(specimen_treat_nulls, specimen_treat_alt, two_way=two_way) + inter_thresholds = p_thresholds.get_thresholds(specimen_inter_nulls, specimen_inter_alt, two_way=two_way) + + geno_thresholds_path = dists_out / 'specimen_geno_p_thresholds.csv' + treat_thresholds_path = dists_out / 'specimen_treat_p_thresholds.csv' + inter_thresholds_path = dists_out / 'specimen_inter_p_thresholds.csv' + + geno_thresholds.to_csv(geno_thresholds_path) + treat_thresholds.to_csv(treat_thresholds_path) + inter_thresholds.to_csv(inter_thresholds_path) + + else: + specimen_organ_thresholds = p_thresholds.get_thresholds(specimen_null, spec_alt, two_way=two_way) + spec_thresholds_path = dists_out / 'specimen_organ_p_thresholds.csv' + specimen_organ_thresholds.to_csv(spec_thresholds_path) logging.info('Annotating lines') - lines_root_dir = out_dir / 'lines' - lines_root_dir.mkdir(exist_ok=True) + # The lines root doesn't really exist in a two-way + if two_way: + lines_root_dir = out_dir + else: + lines_root_dir = out_dir / 'lines' + lines_root_dir.mkdir(exist_ok=True) # Annotate lines logging.info(f"Annotating lines, using a FDR threshold of {line_fdr}") - annotate(line_organ_thresholds, line_alt, lines_root_dir, label_info=label_info, - label_map=label_map_path, write_thresholded_inv_labels=True,fdr_threshold=line_fdr, t_values=line_alt_t, - organ_volumes=data) + line_hits = annotate(line_organ_thresholds, line_alt, lines_root_dir, label_info=label_info, + label_map=label_map_path, write_thresholded_inv_labels=True, fdr_threshold=line_fdr, + t_values=line_alt_t, + organ_volumes=data, two_way=two_way) + + line_hits.to_csv(out_dir / 'line_hits.csv') # Annotate specimens logging.info(f"Annotating specimens, using a FDR threshold of {specimen_fdr}") - annotate(specimen_organ_thresholds, spec_alt, lines_root_dir, line_level=False, - label_info=label_info, label_map=label_map_path, fdr_threshold=specimen_fdr, t_values=spec_alt_t, - organ_volumes=data) + if two_way: + geno_spec_hits = annotate(geno_thresholds, specimen_geno_alt, lines_root_dir, is_line_level=False, + label_info=label_info, label_map=label_map_path, fdr_threshold=specimen_fdr, + t_values=spec_alt_t, + organ_volumes=data, main_of_two_way=True) + + treat_spec_hits = annotate(treat_thresholds, specimen_treat_alt, lines_root_dir, is_line_level=False, + label_info=label_info, label_map=label_map_path, fdr_threshold=specimen_fdr, + t_values=spec_alt_t, + 
organ_volumes=data, main_of_two_way=True) + + inter_spec_hits = annotate(inter_thresholds, specimen_inter_alt, lines_root_dir, is_line_level=False, + label_info=label_info, label_map=label_map_path, fdr_threshold=specimen_fdr, + t_values=spec_alt_t, + organ_volumes=data, two_way=True) + geno_spec_hits.to_csv(out_dir / 'specimen_level_geno_hits.csv') + treat_spec_hits.to_csv(out_dir / 'specimen_level_treat_hits.csv') + inter_spec_hits.to_csv(out_dir / 'specimen_level_inter_hits.csv') + + else: + spec_hits = annotate(specimen_organ_thresholds, spec_alt, lines_root_dir, is_line_level=False, + label_info=label_info, label_map=label_map_path, fdr_threshold=specimen_fdr, + t_values=spec_alt_t, + organ_volumes=data) + + spec_hits.to_csv(out_dir / 'specimen_level_hits.csv') # Make plots - mut_dir_ = mut_dir / 'output' - make_plots(mut_dir_, raw_wt_vols, wt_staging, label_info, lines_root_dir) + data_for_plots = data.copy() + data_for_plots.columns = [x.strip('x') for x in data_for_plots.columns] # Strip any xs + # If data has been normalised to WEV revert back for plots + if normalise_to_whole_embryo: + for col in data_for_plots.columns: + if col.isdigit(): + data_for_plots[col] = data_for_plots[col] * data_for_plots['staging'] + + make_plots(data_for_plots, label_info, lines_root_dir, voxel_size=voxel_size, two_way=two_way) # Get specimen info. Currently just the WEV z-score to highlight specimens that are too small/large spec_info_file = out_dir / 'specimen_info.csv' - write_specimen_info(wt_staging, mut_staging, spec_info_file) + #write_specimen_info(wt_staging, mut_staging, spec_info_file) dist_plot_root = out_dir / 'distribution_plots' line_plot_dir = dist_plot_root / 'line_level' line_plot_dir.mkdir(parents=True, exist_ok=True) - pvalue_dist_plots(line_null, line_alt, line_organ_thresholds, line_plot_dir) + pvalue_dist_plots(line_null, line_alt, line_organ_thresholds, line_plot_dir, two_way=two_way) specimen_plot_dir = dist_plot_root / 'specimen_level' specimen_plot_dir.mkdir(parents=True, exist_ok=True) - pvalue_dist_plots(specimen_null, spec_alt.drop(columns=['line']), specimen_organ_thresholds, specimen_plot_dir) - - heatmaps_for_permutation_stats(lines_root_dir) - - - - - - - - - - + if two_way: + # fix up vals. + pvalue_dist_plots(specimen_geno_nulls, specimen_geno_alt.drop(columns=['line']), geno_thresholds, + specimen_plot_dir, main_of_two_way=True) + pvalue_dist_plots(specimen_treat_nulls, specimen_treat_alt.drop(columns=['line']), treat_thresholds, + specimen_plot_dir, main_of_two_way=True) + pvalue_dist_plots(specimen_inter_nulls, specimen_inter_alt.drop(columns=['line']), inter_thresholds, + specimen_plot_dir, two_way=True) + else: + pvalue_dist_plots(specimen_null, spec_alt.drop(columns=['line']), specimen_organ_thresholds, specimen_plot_dir) + + + rad_plot = True if rad_dir else False + heatmaps_for_permutation_stats(lines_root_dir, two_way=two_way, label_info_file=label_info, rad_plot=rad_plot) diff --git a/lama/stats/rscripts/lmFast.R b/lama/stats/rscripts/lmFast.R index f27d6502..e5aa6965 100755 --- a/lama/stats/rscripts/lmFast.R +++ b/lama/stats/rscripts/lmFast.R @@ -4,9 +4,12 @@ library(MASS) +# install the abind package if not previously found -args <- commandArgs(trailingOnly = TRUE); +if (!require(abind)) install.packages('abind', repos='http://cran.us.r-project.org') +library(abind) +args <- commandArgs(trailingOnly = TRUE); pixels_file <- args[1]; # A binary containing the voxel to be tested. 
Masked voxels will have been removed groups_file <- args[2]; # CSV containing the genotype and crown-rum (or other staging metric) @@ -20,7 +23,6 @@ plot_dir <- args[7]; g <- read.table(groups_file, header=TRUE, sep=',') groups <- data.frame(g) - counter = 0 plot_lm <- function(data, groups, outdir){ @@ -40,111 +42,142 @@ plot_lm <- function(data, groups, outdir){ pandt_vals <- function(fit) { # get estimates est <- fit$coefficients[fit$qr$pivot, ] - + # get R: see stats:::summary.lm to see how this is calculated p1 <- 1L:(fit$rank) R <- diag(chol2inv(fit$qr$qr[p1, p1, drop = FALSE])) - + # get residual sum of squares for each resvar <- colSums(fit$residuals^2) / fit$df.residual - # R is same for each coefficient, resvar is same within each model + # R is same for each coefficient, resvar is same within each model se <- sqrt(outer(R, resvar)) - + tvals <- est / se #print(typeof(tvals)) pvals <- pt(abs(est / se), df = fit$df.residual, lower.tail = FALSE) * 2 - + return(list(pvals=pvals, tvals=tvals)) } -# boxy <- function(single_organ_data, row_indices){ -# # Do a boxcox tranformon the data -# # If row_indices subset based on these rows (when doing specimen n =1) -# -# if (identical(row_indices, FALSE)){ -# Box <- boxcox(single_organ_data ~ groups$crl, plotit = FALSE, lambda = seq(-2, 2, len = 1000)) -# }else{ -# single_organ_data <- single_organ_data[row_indices] -# Box <- boxcox(single_organ_data ~ groups$crl[row_indices], plotit = FALSE, lambda = seq(-2, 2, len = 1000)) -# } -# -# Cox = data.frame(Box$x, Box$y) -# CoxSorted = Cox[with(Cox, order(-Cox$Box.y)),] -# lambda = CoxSorted[1, "Box.x"] -# tformed <- bcPower(single_organ_data, lambda) -# return(tformed) -# } - con <- file(pixels_file, "rb") + dim <- readBin(con, "integer", 2) -mat <- abs(matrix( readBin(con, "numeric", prod(dim)), dim[1], dim[2])) +mat <- abs(matrix(readBin(con, "numeric", prod(dim)), dim[1], dim[2])) + close(con) formula_elements <- strsplit(formula, split=',') -# print('lm formula elements'); -# print(formula_elements); - - -if (do_box_cox == TRUE){ - print('##doing boxcox##') +# just commenting out to improve speed +# if (do_box_cox == TRUE){ + # print('##doing boxcox##') # tformed = apply(mat, 2, boxy, row_indices=FALSE) # fit <- lm(tformed ~., data=groups[, unlist(formula_elements)]) -}else{ - fit <- lm(mat ~., data=groups[, unlist(formula_elements)]) -} +if ("treatment" %in% colnames(groups)){ + fit <- lm(mat ~genotype:treatment+., data=groups[, unlist(formula_elements)]) + + results <- pandt_vals(fit) + + tval <- results$tvals[c(2,3,5),] + pval <- results$pvals[c(2,3,5),] + + dim <- c(length(pval[,1]), length(pval[1,]), 0) -# line_level_plot_dir <- file.path(plot_dir, 'line_level_plots') -# dir.create(line_level_plot_dir, showWarnings = FALSE) -# apply(mat, 2, plot_lm, groups, outdir=line_level_plot_dir) + tscores <- pvals <- array(numeric(), dim) + + pvals <- abind(pvals, data.matrix(pval)) + # for g by e studies, the aov returns f_values + tscores <- abind(tscores, data.matrix(tval)) + + + +} else { + fit <- lm(mat ~., data=groups[, unlist(formula_elements)]) + results <- pandt_vals(fit) + pvals = results$pvals[1,] + tscores = results$tvals[2,]} -results <- pandt_vals(fit) -pvals = results$pvals[2,] -tscores = results$tvals[2,] # Now fit each specimen individually to the linear model -mutant_row_nums = which(groups$genotype == 'mutant'); -wt_row_nums = which(groups$genotype == 'wildtype') +if("treatment" %in% colnames(groups)){ + # I just want the interaction rows to reduce multiple testing -for (r in 
mutant_row_nums){ - #For each mutant add the mutant row number to the wt row indices - row_indices = c(wt_row_nums, r) + mutant_row_nums = which((groups$genotype == 'mutant') & (groups$treatment == 'treatment')) + # wt_row_nums = which((groups$genotype == 'wildtype') & (groups$treatment == 'vehicle')) + non_interaction_row_nums = which(!((groups$genotype == 'mutant') & (groups$treatment == 'treatment'))) - if (do_box_cox == TRUE){ + for (r in mutant_row_nums){ + #For each mutant add the mutant row number to the wt row indices + row_indices = c(non_interaction_row_nums, r) - tformed = apply(mat, 2, boxy, row_indices=row_indices) - fit_specimen <- lm(tformed ~., data=groups[row_indices, unlist(formula_elements)]) + fit_specimen <- lm(mat[row_indices, ] ~genotype:treatment+ ., data=groups[row_indices, unlist(formula_elements)]) - }else{ - fit_specimen <- lm(mat[row_indices, ] ~., data=groups[row_indices, unlist(formula_elements)]) + spec_results <- pandt_vals(fit_specimen) - } + pval <- results$pvals[c(2,3,5),] - specimen_plot_path = file.path(plot_dir, groups[r, 0]) - #plot_lm(specimen_plot_path, fit_specimen) + tval <- results$tvals[c(2,3,5),] - spec_results <- pandt_vals(fit_specimen) - pvals = append(pvals, spec_results$pvals[2,]) - tscores = append(tscores, spec_results$tvals[2,]) + pvals = abind(pvals, data.matrix(pval)) -} + tscores = abind(tscores, data.matrix(tval)) + + specimen_plot_path = file.path(plot_dir, groups[r, 0]) + } + #TODO: probably don't hard code file extensions + file_exts = c('genotype','treatment','interaction') + for (f in 1:3) { + + t_data <- tscores[f, , ] + p_data <- pvals[f, , ] + + pvals_g_out = paste(pvals_out, file_exts[f], sep="_") + + + + + poutCon <- file(pvals_g_out, "wb") + writeBin(as.vector(p_data), poutCon) + close(poutCon) -# print('writigpvals file') -poutCon <- file(pvals_out, "wb") -# writeBin(results$pvals[2,], poutCon) -writeBin(pvals, poutCon) -close(poutCon) + tvals_g_out = paste(tvals_out, file_exts[f], sep="_") + toutCon <- file(tvals_g_out, "wb") -toutCon <- file(tvals_out, "wb") + writeBin(0 - as.vector(t_data), toutCon) + close(toutCon) + } -# R returns the genotype effect for wildtype so we must flip the sign to get it for mutant -writeBin(0 - tscores, toutCon) +}else{ + mutant_row_nums = which(groups$genotype == 'mutant'); + wt_row_nums = which(groups$genotype == 'wildtype') + + for (r in mutant_row_nums){ + + row_indices = c(wt_row_nums, r) + + fit_specimen <- lm(mat[row_indices, ] ~., data=groups[row_indices, unlist(formula_elements)]) + spec_results <- pandt_vals(fit_specimen) + pvals = append(pvals, spec_results$pvals[2,]) + tscores = append(tscores, spec_results$tvals[2,]) -close(toutCon) + specimen_plot_path = file.path(plot_dir, groups[r, 0]) + } + poutCon <- file(pvals_out, "wb") + # writeBin(results$pvals[2,], poutCon) + writeBin(pvals, poutCon) + close(poutCon) + toutCon <- file(tvals_out, "wb") + # R returns the genotype effect for wildtype so we must flip the sign to get it for mutant + writeBin(0 - tscores, toutCon) + + close(toutCon) + + +} diff --git a/lama/stats/rscripts/two_way_plot.R b/lama/stats/rscripts/two_way_plot.R new file mode 100644 index 00000000..f70dd94e --- /dev/null +++ b/lama/stats/rscripts/two_way_plot.R @@ -0,0 +1,488 @@ +# Title : TODO +# Objective : TODO +# Created by: u5823099 +# Created on: 21/03/2022 + + +if (!require(grid)) install.packages(c('dplyr','factoextra', 'janitor', 'readr', + 'tidyverse', 'ggplot2','cluster','ggforce', + 'cowplot', 'grid','gridExtra', 'stringr'), + 
repos='http://cran.us.r-project.org') + +args <- commandArgs(trailingOnly = TRUE) + +organ_file <- args[1]; +staging_file <- args[2]; +label_file <-args[3]; +voxel_size <-args[4]; + + +library(dplyr) +library(readr) +library(tidyverse) +library(janitor) +library(ggplot2) +library(factoextra) +library(cluster) +library(ggforce) +library(cowplot) +library(grid) +library(gridExtra) +library(stringr) + +### set up some cool functions +variable_names <- list( + "WT" = expression(bolditalic("Zic2")^bolditalic("+/+")), + "HET" = expression(bolditalic("Zic2")^bolditalic("Ku/+")) +) + +variable_labeller <- function(variable,value){return(variable_names[value])} + +back_names <- list( + "C3H" = "C3H/HeH", + "C57BL6" = "C57BL6/N" +) + +back_labeller <- function(variable,value){return(back_names[value])} + + + + +#This is the only way to call to make it work! +inter_names <- list( + "WT.C3H" = expression(bolditalic("Zic2")^bolditalic("+/+")~bold("C3H/HeH")), + "HET.C3H" = expression(bolditalic("Zic2")^bolditalic("Ku/+")~bold("C3H/HeH")), + "WT.C57BL6" = expression(bolditalic("Zic2")^bolditalic("+/+")~bold("C57BL6/N")), + "HET.C57BL6" = expression(bolditalic("Zic2")^bolditalic("Ku/+")~bold("C57BL6/N")) +) + +inter_labeller <- function(variable,value){return(inter_names[value])} + + +#Functions for stat tests: + +pandt_vals <- function(fit) { + # so for some reason these are not two dimensional and only 1d + + # get estimates + est <- fit$coefficients[fit$qr$pivot] + + # get R: see stats:::summary.lm to see how this is calculated + p1 <- 1L:(fit$rank) + R <- diag(chol2inv(fit$qr$qr[p1, p1, drop = FALSE])) + + # get residual sum of squares for each + resvar <- sum(fit$residuals^2) / fit$df.residual + # R is same for each coefficient, resvar is same within each model + se <- sqrt(outer(R, resvar)) + + tvals <- est / se + pvals <- pt(abs(est / se), df = fit$df.residual, lower.tail = FALSE) * 2 + + return(list(pvals=pvals, tvals=tvals)) +} + + +adj_p <- function(pvals){ + p.adjust(pvals, method='BH') +} + +#converts to mm^3 +scale_to_mm3 <- function(x) { + #voxel size is 40um + um3_conv_factor <- voxel_size^3 # To convert voxels to um3 + um3_to_mm3_conv_factor <- 1e9 + return((x * um3_conv_factor)/um3_to_mm3_conv_factor) +} + +#normaliser +normalise <- function(x, na.rm = FALSE) (x/full_info$WEV) + +# staging +staging_info <- read_csv(staging_file) + + +organ_info <- read_csv(organ_file) %>% remove_constant() + +#add in factors remove empty labels +organ_info <- arrange(transform(organ_info, Genotype=factor(Genotype,levels=c("WT","HET")))) + + +organ_info <- arrange(transform(organ_info, Background=factor(Background,levels=c("C3H","C57BL6")))) + + +label_info <- read_csv(label_file) + + +# rename stuff +names(staging_info)[names(staging_info) == "vol"] <- "Embryo" + +#get the label_info into the right format + +label_info <- t(label_info) + +#remove annoying first column +label_info <- label_info[,-1] + +str(label_info[2,]) + +# merge dfs +full_info <- merge(staging_info, organ_info, + by = c("Embryo","Genotype", "Background"), + all.x = TRUE, all.y = TRUE) + + +full_info <- full_info %>% + mutate_at(vars(contains('X')), scale_to_mm3) %>% + mutate_at(vars(contains('WEV')), scale_to_mm3) %>% + mutate_at(vars(contains('X')), normalise) + +#re-order +full_info <- arrange(transform(full_info, Background=ordered(Background,levels=c("C3H","C57BL6")))) + +full_info <- arrange(transform(full_info, Genotype=ordered(Genotype,levels=c("WT","HET")))) + +#get proper organ_names + +name_list <-c("Embryo", "Genotype", 
"Background", "WEV", label_info[2, ]) + +colnames(full_info) <- name_list + +#Do full and pairwise comparisons: +g_by_e_lm <- function(y, dataset, pair_test=F){ + if (pair_test){ + library(mixlm) + #Had to transform the data to not get residuals adding to 0? + model <- lm(I(y * 1e6)~Genotype:Background+Genotype+Background+WEV, data=dataset) + pair_model <- simple.glht(model,'Genotype:Background') + #print("it's me") + pvals <- pair_model$res$'P(>t)' + names(pvals) <- rownames(pair_model$res) + #gets p and t vals from model + #results <- list(pvals=pair_model$res$'P(>t)', tvals=pair_model$res$'t value') + detach("package:mixlm", unload=T) + library(stats) + return(pvals) + } + else{ + #dataset$Genotype <- factor(dataset$Genotype, ordered=F) + #dataset$Background <- factor(dataset$Background, ordered=F) + model <- lm(y~Genotype:Background+Genotype+Background+WEV, data=dataset) + results <- pandt_vals(model) + pvals <- results$pvals + names(pvals) <- names(model$coefficients) + return(pvals) + } +} + +g_by_e_pvals <- sapply(full_info[5:191], function(data) + g_by_e_lm(data, full_info, pair_test = F) +) + +g_by_e_qvals <- t(apply(g_by_e_pvals[c(2,3,5), ], 1, function(x) p.adjust(x, method='BH'))) + + +pairwise_pvals <- sapply(full_info[5:191], function(data) + g_by_e_lm(data, full_info, pair_test = T) +) + +pairwise_qvals <- t(apply(pairwise_pvals[c(1,2,5,6), ], 1, function(x) p.adjust(x, method='BH'))) + + +# function to annotate_pvals +anno_qvals <- function(qvals) { + paste_vect <- list(length(qvals)) + #print(names(qvals)) + for (col in seq_along(qvals)) { + #remove the stupid signed integer crap for the g_by_e_stuff + Names <- names(qvals[col]) %>% + str_remove_all(".L") %>% + str_replace("WT:C57BL6", as.character("wildtype C57BL6/N")) %>% + str_replace("WT:C3H", as.character("wildtype C3H/HeH")) %>% + str_replace("HET:C57BL6", as.character("mutant C57BL6/N")) %>% + str_replace("HET:C3H", as.character("mutant C3H/HeH")) %>% + str_replace(":", " X ") %>% + str_replace("-", " v.s. ") + paste_vect[col] <- paste0(Names, " = ", round(qvals[col], 6)) + } + return(paste_vect) +} + +g_by_e_anno <- lapply(seq_along(g_by_e_qvals[1,]), function(i) + anno_qvals(g_by_e_qvals[,i]) +) + +pair_anno <- lapply(seq_along(pairwise_qvals[1,]), function(i) + anno_qvals(pairwise_qvals[,i]) +) + +###summary_grid### + +#this may be stupid + +rect_chooser <- function(Genotype, Background){ + ### this stupid function just makes a rectangle dataframe (i.e. 
left, right, top, bottom) beased on the condition per row + xleft = vector() + xright = vector() + ybottom = vector() + ytop = vector() + for (i in (1:length(Genotype))){ + if (interaction(Genotype[i],Background[i])=="WT.C3H"){ + xleft <- append(xleft, 0) + xright <- append(xright, 0.9) + ybottom <- append(ybottom, 1.1) + ytop <- append(ytop, 2) + } + else if (interaction(Genotype[i],Background[i])=="HET.C3H"){ + xleft <-append(xleft, 1.1) + xright <- append(xright, 2) + ybottom <-append(ybottom, 1.1) + ytop <- append(ytop, 2) + } + else if (interaction(Genotype[i],Background[i])=="WT.C57BL6"){ + xleft <- append(xleft, 0) + xright <- append(xright, 0.9) + ybottom <- append(ybottom, 0) + ytop <- append(ytop, 0.9) + } + else{ + xleft <- append(xleft, 1.1) + xright <- append(xright, 2) + ybottom <- append(ybottom, 0) + ytop <-append(ytop, 0.9) + } + + } + return(data.frame(l = xleft, r = xright, b = ybottom, t = ytop)) +} + + + +# So get the rectangle values +dumb_sum <- full_info[1:3] %>% + mutate(Rect=rect_chooser(Genotype, Background)) + + +# now design p-val +square_plots <- lapply(seq_along(full_info[5:191]), function(i) + ggplot()+ + geom_rect(dumb_sum, mapping=aes(xmin=Rect$l, xmax=Rect$r, ymin=Rect$b, ymax=Rect$t, + fill=interaction(Genotype,Background)))+ + geom_text(dumb_sum, mapping=aes(x=(Rect$l+Rect$r)/2, y=(Rect$b+Rect$t)/2, # add labels in center of rectangle- needs brackets to work + label = sapply(interaction(Genotype,Background), + function(cond) gsub("expression","", inter_labeller(1, cond)) )), # calls inter_labeller on just the one row and + # removes the expression + parse=T)+ + {if(pairwise_qvals[1, i] < 0.05 ) geom_segment(aes(x = -0.1, xend = 2.1, y = 2.1, yend = 2.1), # wt veh vs het veh + arrow = arrow(type = "closed",ends = "both"))}+ + {if(pairwise_qvals[2, i] < 0.05 ) geom_segment(aes(x = -0.1, xend = -0.1, y = -0.1, yend = 2.1), # wt veh vs wt eth + arrow = arrow(type = "closed",ends = "both"))}+ + {if(pairwise_qvals[3, i] < 0.05 ) geom_segment(aes(x = 2.1, xend = 2.1, y = -0.1, yend = 2.1), # het veh vs het eth + arrow = arrow(type = "closed",ends = "both"))}+ + {if(pairwise_qvals[4, i] < 0.05) geom_segment(aes(x = -0.1, xend = 2.1, y = -0.1, yend = -0.1), # wt eth vs het eth + arrow = arrow(type = "closed",ends = "both"))}+ + {if(g_by_e_qvals[1, i] < 0.05) geom_segment(aes(x = 1, xend = 1, y = 0, yend = 2, color = "red"))}+ + {if(g_by_e_qvals[1, i] < 0.05) geom_curve(aes(x = 0.45, xend = 1.55, y = 1, yend = 1), + curvature = 0.25, arrow = arrow(type = "closed", ends = "both"))}+ + + {if(g_by_e_qvals[2, i] < 0.05) geom_segment(aes(x = 0, xend = 2, y = 1, yend = 1, color = "red"))}+ + {if(g_by_e_qvals[2, i] < 0.05) geom_curve(aes(x = 1, xend = 1, y = 0.45, yend = 1.55), + curvature = 0.25, arrow = arrow(type = "closed",ends = "both"))}+ + + {if(g_by_e_qvals[3, i] < 0.05) geom_ellipse(aes(x0=1.55,y0=0.45,a=0.5,b=0.5, angle=0, colour="red"))}+ + #ggtitle(stringr::str_to_title(gsub("_", " ", names(full_info[5:191])[[i]])))+ + theme(legend.position = "none", + axis.title.x=element_blank(), + axis.text.x=element_blank(), + axis.ticks.x=element_blank(), + axis.title.y=element_blank(), + axis.text.y=element_blank(), + axis.ticks.y=element_blank())+#,plot.title = element_text(hjust = 0.5, size = 40),)+ + scale_fill_manual(name="", labels=unlist(inter_names), values=c("#F8766D", "#00BFC4","#FFA500","#C77CFF"))+ + labs(tag = "A") + +)# scales to colours we want + +### Draw g_by_e_boxplots for results +box_plots <- lapply(seq_along(full_info[5:191]), function(data) + 
ggplot(full_info,aes(x=WEV,y=full_info[5:191][[data]],colour=interaction(Genotype, Background),label=Embryo))+ + geom_boxplot()+ + geom_point()+ + #geom_smooth(method = "lm")+ + facet_grid(~interaction(Genotype,Background), labeller = inter_labeller, scales = "free_x")+ + geom_text(aes(label=ifelse(data > quantile(data, 0.975),as.character(Embryo),'' )), hjust=0, vjust=0)+ + geom_text(aes(label=ifelse(data < quantile(data, 0.025),as.character(Embryo),'' )), hjust=0, vjust=0)+ + scale_color_manual(name="", labels=unlist(inter_names), values=c("#F8766D", "#00BFC4","#FFA500","#C77CFF"))+ + ylab(expression("Organ Volume Normalised To WEV"))+ + xlab(expression("Whole Embryo Volume (WEV)"))+ + ggtitle(paste(g_by_e_anno[data][[1]][[1]], " ", + g_by_e_anno[data][[1]][[2]]) , subtitle = g_by_e_anno[data][[1]][[3]])+ + theme(plot.title = element_text(hjust = 0.5, size = 10), plot.subtitle=element_text(size=10, hjust=0.5), + strip.text.x = element_text(hjust = 0.5, size = 6), legend.position = "none" )+ + labs(tag = "B")) + + +## genotype pairwise comparisons +geno_line_plots <- lapply(seq_along(full_info[5:191]), function(data) + ggplot(full_info,aes(x=WEV,y=full_info[5:191][[data]],colour=interaction(Genotype, Background),label=Embryo))+ + #geom_boxplot()+ + geom_point()+ + geom_smooth(method = "lm")+ + facet_grid(~Background, labeller = back_labeller, scales = "free_x")+ + geom_text(aes(label=ifelse(data > quantile(data, 0.975),as.character(Embryo),'' )), hjust=0, vjust=0)+ + geom_text(aes(label=ifelse(data < quantile(data, 0.025),as.character(Embryo),'' )), hjust=0, vjust=0)+ + scale_color_manual(name="", labels=unlist(inter_names), values=c("#F8766D", "#00BFC4","#FFA500","#C77CFF"))+ + ylab(expression("Organ Volume Normalised To WEV"))+ + xlab(expression("Whole Embryo Volume (WEV)"))+ + labs(tag = "C")+ + ggtitle(pair_anno[data][[1]][[1]], subtitle = pair_anno[data][[1]][[4]])+ + theme(plot.title = element_text(hjust = 0.5, size = 10), plot.subtitle=element_text(size=10, hjust=0.5),legend.position = "none")) + +# annotate facets with p-vals +# geno_tagged <-lapply(seq_along(geno_line_plots), function(p) +# tag_facet(geno_line_plots[p][[1]], +# tag_pool = c( +# as.character(pair_anno[p][[1]][[1]]), +# as.character(pair_anno[p][[1]][[4]])), +# open="", close="", +# fontface = 4, +# size=4) +# ) + +### Background pairwise p-vals +treat_line_plots <- lapply(seq_along(full_info[5:191]), function(data) + ggplot(full_info,aes(x=WEV,y=full_info[5:191][[data]],colour=interaction(Genotype, Background),label=Embryo))+ + #geom_boxplot()+ + geom_point()+ + geom_smooth(method = "lm")+ + facet_grid(~Genotype, labeller = variable_labeller, scales = "free_x")+ + geom_text(aes(label=ifelse(data > quantile(data, 0.975),as.character(Embryo),'' )), hjust=0, vjust=0)+ + geom_text(aes(label=ifelse(data < quantile(data, 0.025),as.character(Embryo),'' )), hjust=0, vjust=0)+ + scale_color_manual(name="", labels=unlist(inter_names), values=c("#F8766D", "#00BFC4","#FFA500","#C77CFF"))+ + ylab(expression("Organ Volume Normalised To WEV"))+ + xlab(expression("Whole Embryo Volume (WEV)"))+ + labs(tag = "D")+ + ggtitle(pair_anno[data][[1]][[2]], subtitle = pair_anno[data][[1]][[3]])+ + theme(plot.title = element_text(hjust = 0.5, size = 10), plot.subtitle=element_text(size=10, hjust=0.5),legend.position = "none")) + + +#### PCA analysis #### + +# basic PCA +PCAs <- lapply(full_info[5:191], function(data) + prcomp(data.frame(org=data, WEV=full_info[4]), scale=T) +) + + +groups <- interaction(full_info$Genotype, 
full_info$Background) + + + +PCA_plots <- lapply(PCAs,function(data) + fviz_pca_ind(data, + geom = "point", + legend.title = "Groups", + pallete =c("#F8766D", "#00BFC4","#FFA500","#C77CFF"), + habillage = interaction(full_info$Genotype, full_info$Background), + addEllipses = T, + repel = TRUE, + title="PCA analysis")+ + scale_fill_manual(name="", labels=unlist(inter_names), values=c("#F8766D", "#00BFC4","#FFA500","#C77CFF"))+ + scale_color_manual(name="", labels=unlist(inter_names), values=c("#F8766D", "#00BFC4","#FFA500","#C77CFF"))+ + theme(legend.position = "none")+ + labs(tag = "E") +) + +#### Play around with clustering #### + +df <- (data.frame(org=full_info[7], WEV=full_info[4])) + +# Set up test data-frame to develop clustering. Scale as organ volumes are fractions while whole embryo volumes are massive + + +my_list <- list("1"="1", "2"="2", "3" = "3", "4"="4") + +dfs <- lapply(full_info[5:191], function(data) + scale(data.frame(org=data, WEV=full_info[4]))) + +km_plots <- lapply(dfs, function(data){ + fviz_cluster(kmeans(data, 4, nstart = 25), data = data, geom = "point", repel = T, main="Kmeans Cluster")}) + + +for(i in seq_along(km_plots)){ + km_plots[[i]]$Group <- interaction(full_info$Genotype, full_info$Background) + str(km_plots[[i]]$Group) +} + + +km_plots_lab <- lapply(km_plots, function(plot) + plot + geom_point(aes(colour = plot$Group))+ + scale_fill_manual(values = c("white", "white","white","white"))+ + scale_color_manual(labels=c(my_list, + inter_names), + values = c("black", "black","black","black", "#C77CFF", "#00BFC4","#FFA500", "#F8766D"))+ + theme(legend.position = "none")+ + labs(tag = "F") ) + +#Dumb fix but it will do +for(i in seq_along(km_plots)){ + km_plots[[i]]$data$Group <- interaction(full_info$Genotype, full_info$Background) + str(km_plots[[i]]$Group) +} + +cluster.summaries <- lapply(km_plots, function(pl) + pl$data %>% + group_by(Group, cluster) %>% + summarise(n=n()) %>% + group_by(Group) %>% + mutate(nGroup = sum(n), Percentage = n / sum(n) * 100) +) + + + +#Make pdf: + +pdf("g_by_back_plots_v1.pdf", onefile = T, paper="a4r", width=13, height=10) +for(i in seq_along(square_plots)){ + + #plot rect summary in square_1 + p1 <- as_grob(square_plots[[i]]) + # plot boxplots in square_2 if g_by_e signif + if (any(g_by_e_qvals[,i] < 0.05)){ + p2 <- as_grob(box_plots[[i]]) + } + else {p2 <-as_grob(geom_blank())} + # plot geno line plots in square 3 if g-pairwise are significant + if (any(pairwise_qvals[c(1,4), i] < 0.05)){ + p3 <- as_grob(geno_line_plots[[i]]) + } + else{p3<-as_grob(geom_blank())} + # same thing for treat line plots in square4 + if (any(pairwise_qvals[c(2,3), i] < 0.05)){ + p4 <- as_grob(treat_line_plots[[i]]) + } + else{ p4 <-as_grob(geom_blank())} + + if (any(cluster.summaries[[i]]$Percentage > 60.5) ) { + p5 <- as_grob(PCA_plots[[i]]) + + p6 <- as_grob(km_plots_lab[[i]]) + } + else{ + p5 <-as_grob(geom_blank()) + p6 <-as_grob(geom_blank()) + } + + grid.arrange(grobs=list(p1, p2, p3, p4, p5, p6), ncol=3, nrow=2, + top = textGrob(stringr::str_to_title(gsub("_", " ", names(full_info[5:191])[[i]])), gp=gpar(fontsize=28,font=8))) +} + +dev.off() + + + + + + + + diff --git a/lama/stats/standard_stats/data_loaders.py b/lama/stats/standard_stats/data_loaders.py index 9ef60f78..38483d5c 100644 --- a/lama/stats/standard_stats/data_loaders.py +++ b/lama/stats/standard_stats/data_loaders.py @@ -29,6 +29,9 @@ from logzero import logger as logging import pandas as pd import toml +import SimpleITK as sitk + +from lama.img_processing.normalise import 
IntensityMaskNormalise, IntensityHistogramMatch from lama import common from lama.img_processing.misc import blur @@ -54,7 +57,7 @@ def __init__(self, info: pd.DataFrame, line: str, shape: Tuple, - paths: Tuple[List], + paths: List[List], #changed from tuple, cause two_way needs four paths mask: np.ndarray = None, outdirs = None, cluster_data = None, @@ -99,10 +102,16 @@ def __init__(self, if len(data) != len(info): raise ValueError - - def mutant_ids(self): - return self.info[self.info.genotype == 'mutant'].index - + + def mutant_ids(self, filt_flag = False): + if 'treatment' in self.info.columns: #identifies whether it is a TWO-WAY + if filt_flag: # filter flag is telling the two_way whether to inter only samples (i.e. loading vs cleaning) + return self.info[((self.info.genotype == 'mutant')&(self.info.treatment =='treatment'))].index + else: + return self.info.index # collect everything as it what you load for the two_way is context dependent + # i.e. up until linear model = all, past linear model = just interaction + else: + return self.info[self.info.genotype == 'mutant'].index def specimen_ids(self) -> List: return self.info.index.values @@ -237,13 +246,18 @@ def __init__(self, lines_to_process: Union[List, None] = None, baseline_file: Union[str, None] = None, mutant_file: Union[str, None] = None, - memmap: bool = False): + memmap: bool = False, + treatment_dir: Path = None, + interaction_dir: Path = None, + ref_vol_path: Path = None): """ Parameters ---------- wt_dir mut_dir + treatment_dir (optional) + interaction_dir (optional) mask config label_info_file @@ -271,12 +285,16 @@ def __init__(self, self.wt_dir = wt_dir self.mut_dir = mut_dir + self.treatment_dir = treatment_dir + self.interaction_dir = interaction_dir self.config = config self.label_info_file = label_info_file self.lines_to_process = lines_to_process self.mask = mask # 3D mask self.shape = None + self.ref_vol = ref_vol_path if ref_vol_path else None + self.normaliser = None self.blur_fwhm = config.get('blur', DEFAULT_FWHM) @@ -305,7 +323,7 @@ def factory(type_: str): elif type_ == 'organ_volumes': return OrganVolumeDataGetter else: - raise ValueError(f'{type_} is not a valid stats analysis type\nMust be either "intensity", "jacobians", or "organ_volumes"') + raise ValueError(f'{type_} is not a valid stats analysis type\nMust be either "intensity", "jacobians","radiomics" or "organ_volumes"') def load_ids(self, path): if path: @@ -332,6 +350,20 @@ def _read(self, paths: List[Path]) -> np.ndarray: raise NotImplementedError + # flatten the array + def _flatten(self, vols): + for i, vol in enumerate(vols): + blurred_array = blur(sitk.GetArrayFromImage(vol), self.blur_fwhm, self.voxel_size) + masked = blurred_array[self.mask != False] + if self.memmap: + t = tempfile.TemporaryFile() + m = np.memmap(t, dtype=masked.dtype, mode='w+', shape=masked.shape) + m[:] = masked + masked = m + + vols[i] = masked + + def cluster_data(self): raise NotImplementedError @@ -369,6 +401,88 @@ def filter_specimens(self, ids_to_use: List, specimen_paths: List, staing: pd.Da return filtered_paths, filtered_staging + + def two_way_iterator(self) -> LineData: + """ + performs two_way iteration -assumes that there is no line iteration + """ + condition_list = [[self.wt_dir, ['wildtype','vehicle']], + [self.mut_dir, ['mutant','vehicle']], + [self.treatment_dir, ['wildtype','treatment']], + [self.interaction_dir, ['mutant','treatment']]] + + data = [] + + full_staging = pd.DataFrame() + + paths_list = [] + + # unpack list + for _dir, condition in 
condition_list: + + metadata = self._get_metadata(_dir) + + paths = list(metadata['data_path']) + + paths_list.extend(paths) + + #this is just something stupid to get line data #TODO: make this less stupid + line = condition[0] + '_' + condition[1] + + # get staging + staging = get_staging_data(_dir) + + staging['genotype'] = condition[0] + + staging['treatment'] = condition[1] + + # should be no baseline ids, so no need to filter specimens + + vols = self._read(paths) + + if self.normaliser: + # this makes sense right? + if isinstance(self.normaliser, IntensityMaskNormalise): + if _dir == self.wt_dir: + self.normaliser.add_reference(vols) + # ->temp bodge to get mask in there + self.normaliser.mask = self.mask + # <-bodge + self.normaliser.normalise(vols, ) + elif isinstance(self.normaliser, IntensityHistogramMatch): + # we have to re-read the data to be to be 3D array + vols = [common.LoadImage(path).img for path in paths] + if _dir == self.wt_dir: + wt_paths = [path for path in paths if ('baseline' in str(path))] + wt_vols = [common.LoadImage(path).img for path in wt_paths] + + ref_vol = common.LoadImage(self.ref_vol).img if self.ref_vol else wt_vols[0] + + self.normaliser.normalise(vols, ref_vol) + + #flatten the array + self._flatten(vols) + + masked_data = [x.ravel() for x in vols] + + full_staging = pd.concat((full_staging, staging)) + # Id there is a value column, change to staging. TODO: make lama spitout staging header instead of value + if 'value' in full_staging: + staging.rename(columns={'value': 'staging'}, inplace=True) + + # data = np.vstack((masked_wt_data, masked_mut_data)) # This almost doubled memory usage. + # Stick all arrays in a list instead + + data.extend(masked_data) + + + # cluster_data = self.cluster_data(data) # The data to use for doing t-sne and clustering + + if _dir == self.interaction_dir: + input_ = LineData(data, full_staging, line, self.shape, paths_list, self.mask) + yield input_ + + def line_iterator(self) -> LineData: """ The interface to this class. 
Calling this function yields and InputData object @@ -377,7 +491,7 @@ def line_iterator(self) -> LineData: The wild type data is the same for each mutant line so we don't have to do multiple reads of the potentially large dataset - Returns + Returns: ------- LineData """ @@ -394,18 +508,37 @@ def line_iterator(self) -> LineData: wt_vols = self._read(wt_paths) if self.normaliser: - self.normaliser.add_reference(wt_vols) + if isinstance(self.normaliser,IntensityMaskNormalise): + self.normaliser.add_reference(wt_vols) + + # ->temp bodge to get mask in there + self.normaliser.mask = self.mask + # <-bodge + self.normaliser.normalise(wt_vols,) + elif isinstance(self.normaliser, IntensityHistogramMatch): + # we have to re-read the data to be to be 3D array + wt_vols = [common.LoadImage(path).img for path in wt_paths] + + # check in the config if there is a reference volume + # get the reference volume + + ref_vol_path = wt_paths.parent + ref_vol = common.LoadImage(ref_vol_path) + + + ref_vol = wt_vols[0] + + self.normaliser.normalise(wt_vols, ref_vol) + self._flatten(wt_vols) + - # ->temp bodge to get mask in there - self.normaliser.mask = self.mask - # <-bodge - self.normaliser.normalise(wt_vols) # Make a 2D array of the WT data masked_wt_data = [x.ravel() for x in wt_vols] mut_metadata = self._get_metadata(self.mut_dir, self.lines_to_process) + # Iterate over the lines logging.info('loading mutant data') @@ -428,7 +561,13 @@ def line_iterator(self) -> LineData: mut_vols = self._read(mut_paths) if self.normaliser: - self.normaliser.normalise(mut_vols) + if isinstance(self.normaliser, IntensityMaskNormalise): + self.normaliser.normalise(mut_vols, ) + elif isinstance(self.normaliser, IntensityHistogramMatch): + mut_vols = [common.LoadImage(path).img for path in mut_paths] + self.normaliser.normalise(mut_vols, ref_vol) + self._flatten(mut_vols) + masked_mut_data = [x.ravel() for x in mut_vols] staging = pd.concat((wt_staging, mut_staging)) @@ -447,6 +586,7 @@ def line_iterator(self) -> LineData: input_ = LineData(data, staging, line, self.shape, (wt_paths, mut_paths), self.mask) yield input_ + class VoxelDataLoader(DataLoader): """ @@ -454,6 +594,9 @@ class VoxelDataLoader(DataLoader): """ def __init__(self, *args, **kwargs): super(VoxelDataLoader, self).__init__(*args, **kwargs) + + # Specifies the subfolder from which to load the data. 
eg log_jacobains/ + self.data_sub_folder = None def cluster_data(self, data): pass @@ -553,7 +696,7 @@ def _get_metadata(self, root_dir: Path, lines_to_process: Union[List, None] = No # data_dir contains the specimen data we are after data_dir = spec_out_dir / self.data_folder_name / self.data_sub_folder - + if not data_dir.is_dir(): raise FileNotFoundError(f'Cannot find data directory: {data_dir}') @@ -576,11 +719,11 @@ class JacobianDataLoader(VoxelDataLoader): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.datatype = 'jacobians' - self.data_folder_name = 'jacobians' + self.data_folder_name = 'log_jacobians' self.data_sub_folder = self.config['jac_folder'] def _get_data_file_path(self, data_dir: Path, spec_dir: Path) -> Path: - res = list(data_dir.glob(f'{spec_dir.name}*')) + res = list(data_dir.glob(f'*{spec_dir.name}*')) if res: return res[0] @@ -678,6 +821,68 @@ def line_iterator(self) -> LineData: input_ = LineData(data, staging, line, self.shape, ([self.wt_dir], [self.mut_dir])) yield input_ + def two_way_iterator(self) -> LineData: + + condition_list = [[self.wt_dir, ['wildtype','vehicle']], + [self.mut_dir, ['mutant','vehicle']], + [self.treatment_dir, ['wildtype','treatment']], + [self.interaction_dir, ['mutant','treatment']]] + + full_data = pd.DataFrame() + + full_staging = pd.DataFrame() + + paths_list = [] + + if self.label_info is not None and 'no_analysis' in self.label_info: + skip_labels = self.label_info[self.label_info['no_analysis'] == True].label.astype(str) + else: + skip_labels = [] + + for _dir, condition in condition_list: + + data: pd.DataFrame = self._get_organ_volumes(_dir) + + data = data.drop(columns=['line']) + + # paths = list(data['data_path']) + + # paths_list.extend(paths) + + line = condition[0] + '_' + condition[1] + + staging = get_staging_data(_dir) + + staging['genotype'] = condition[0] + + staging['treatment'] = condition[1] + + + full_staging = pd.concat((full_staging, staging)) + + # Id there is a value column, change to staging. 
TODO: make lama spitout staging header instead of value + if 'value' in full_staging: + staging.rename(columns={'value': 'staging'}, inplace=True) + + full_data = pd.concat((full_data, data)) + + try: + full_data = full_data.drop(columns=skip_labels) + except KeyError: + pass + + if self.norm_to_mask_volume_on: # Is on by default + logging.info('normalising organ volume to whole embryo volumes') + full_vols = full_vols.div(staging['staging'], axis=0) + + if _dir == self.interaction_dir: + input_ = LineData(full_data, full_staging, line, self.shape, [cond[0] for cond in condition_list]) + yield input_ + + #input_ = LineData(data, staging, line, self.shape, ([self.wt_dir], [self.mut_dir])) + #yield input_ + + def get_metadata(self): """ Override the parent class to get the organ volume paths rather than the volumes @@ -727,7 +932,7 @@ def _get_organ_volumes(self, root_dir: Path) -> pd.DataFrame: def _drop_empty_columns(self, data: pd.DataFrame): """ - Rop data columns for the organ volumes that are not present in the label info file + Drop data columns for the organ volumes that are not present in the label info file Returns ------- diff --git a/lama/stats/standard_stats/lama_stats_new.py b/lama/stats/standard_stats/lama_stats_new.py index 96bdecf2..1808b60d 100644 --- a/lama/stats/standard_stats/lama_stats_new.py +++ b/lama/stats/standard_stats/lama_stats_new.py @@ -12,7 +12,7 @@ from pathlib import Path from typing import Union, List - +import os from logzero import logger as logging import logzero @@ -20,11 +20,12 @@ from lama.common import cfg_load from lama.stats.standard_stats.stats_objects import Stats, OrganVolume -from lama.stats.standard_stats.data_loaders import DataLoader, load_mask, LineData +from lama.stats.standard_stats.data_loaders import DataLoader, load_mask, LineData, JacobianDataLoader, IntensityDataLoader from lama.stats.standard_stats.results_writer import ResultsWriter from lama import common from lama.stats import linear_model -from lama.elastix.invert_volumes import InvertHeatmap +from lama.elastix import PROPAGATE_CONFIG +from lama.elastix.propagate_volumes import PropagateHeatmap from lama.img_processing.normalise import Normaliser from lama.qc import organ_vol_plots @@ -34,6 +35,8 @@ def run(config_path: Path, mut_dir: Path, out_dir: Path, target_dir: Path, + treatment_dir: Path = None, + interaction_dir: Path = None, lines_to_process: Union[List, None] = None ): """ @@ -62,7 +65,7 @@ def run(config_path: Path, list: optional mutant line ids to process only. None: process all lines """ - + if not (wt_dir / 'output').is_dir(): raise FileNotFoundError(f'{wt_dir / "output"} folder with registration results is not present') if not (mut_dir / 'output').is_dir(): @@ -73,6 +76,7 @@ def run(config_path: Path, raise FileNotFoundError('Cannot create output folder') master_log_file = out_dir / f'{common.date_dhm()}_stats.log' + # master_log_file = out_dir / f'{common.date_dhm()}_stats.log' logzero.logfile(str(master_log_file)) logging.info(common.git_log()) logging.info('### Started stats analysis ###}') @@ -96,6 +100,10 @@ def run(config_path: Path, if mutant_file: mutant_file = config_path.parent / mutant_file + ref_vol_path = stats_config.get('reference_vol') + if ref_vol_path: + ref_vol_path = config_path.parent / ref_vol_path + # Run each data class through the pipeline. 
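+    # Overview of the two-way mode handled below (illustrative summary; key names are
+    # taken from this module and read_config.py, example values are hypothetical).
+    # Four registration output directories are expected, one per condition, matching the
+    # 2x2 design consumed by DataLoader.two_way_iterator():
+    #   wt_dir          -> ('wildtype', 'vehicle')
+    #   mut_dir         -> ('mutant',   'vehicle')
+    #   treatment_dir   -> ('wildtype', 'treatment')
+    #   interaction_dir -> ('mutant',   'treatment')
+    # For each stats type, three effects are then reported per label/voxel - genotype,
+    # treatment and genotype-by-treatment interaction - labelled 'geno', 'treat' and 'int'
+    # by the results writer. A minimal two-way stats config might contain:
+    #   stats_types: ['intensity', 'jacobians', 'organ_volume']
+    #   two_way: true             # use loader.two_way_iterator() instead of line_iterator()
+    #   use_log_jacobians: false  # optional: read 'jacobians' rather than 'log_jacobians'
+    #   reference_vol: target/reference.nrrd  # optional reference for histogram matching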
for stats_type in stats_config['stats_types']: @@ -103,45 +111,57 @@ def run(config_path: Path, logging.info(f"---Doing {stats_type} analysis---") gc.collect() - - # load the required stats object and data loader + + loader_class = DataLoader.factory(stats_type) loader = loader_class(wt_dir, mut_dir, mask, stats_config, label_info_file, lines_to_process=lines_to_process, - baseline_file=baseline_file, mutant_file=mutant_file, memmap=memmap) + baseline_file=baseline_file, mutant_file=mutant_file, memmap=memmap, + treatment_dir=treatment_dir, interaction_dir=interaction_dir, ref_vol_path=ref_vol_path) # Only affects organ vol loader. if not stats_config.get('normalise_organ_vol_to_mask'): loader.norm_to_mask_volume_on = False - # Currently only the intensity stats get normalised + if loader_class == JacobianDataLoader: + if stats_config.get('use_log_jacobians') is False: + loader.data_folder_name = 'jacobians' + + # Check if reference vol exist during intensity norm + + # Currently only the intensity stats get normalised loader.normaliser = Normaliser.factory(stats_config.get('normalise'), stats_type) # move this into subclass logging.info("Start iterate through lines") common.logMemoryUsageInfo() - line_iterator = loader.line_iterator() - line_input_data = None + #USe different iterator if using doing a two-way analysis + if stats_config['two_way']: + line_iterator = loader.two_way_iterator() + line_input_data = None + + else: + line_iterator = loader.line_iterator() + line_input_data = None while True: try: line_input_data = next(line_iterator) logging.info(f"Data for line {line_input_data.line} loaded") common.logMemoryUsageInfo() - - + line_id = line_input_data.line line_stats_out_dir = out_dir / line_id / stats_type line_stats_out_dir.mkdir(parents=True, exist_ok=True) - line_log_file = line_stats_out_dir / f'{common.date_dhm()}_stats.log' + line_log_file = line_stats_out_dir / 'stats.log' logzero.logfile(str(line_log_file)) logging.info(f"Processing line: {line_id}") stats_class = Stats.factory(stats_type) - stats_obj = stats_class(line_input_data, stats_type, stats_config.get('use_staging', True)) + stats_obj = stats_class(line_input_data, stats_type, stats_config.get('use_staging', True), stats_config.get('two_way', False)) stats_obj.stats_runner = linear_model.lm_r stats_obj.run_stats() @@ -152,7 +172,7 @@ def run(config_path: Path, logging.info('Writing results...') rw = ResultsWriter.factory(stats_type) - writer = rw(stats_obj, mask, line_stats_out_dir, stats_type, label_map, label_info_file) + writer = rw(stats_obj, mask, line_stats_out_dir, stats_type, label_map, label_info_file, stats_config.get('two_way', False)) logging.info('Finished writing results.') common.logMemoryUsageInfo() @@ -171,8 +191,23 @@ def run(config_path: Path, logging.info('Propogating the heatmaps back onto the input images ') line_heatmap = writer.line_heatmap line_reg_dir = mut_dir / 'output' / line_id - invert_heatmaps(line_heatmap, line_stats_out_dir, line_reg_dir, line_input_data) - logging.info('Finished writing heatmaps.') + if stats_config.get('two_way', False): + logging.info("Inverting interaction heatmaps") + invert_heatmaps(line_heatmap, line_stats_out_dir, line_reg_dir, line_input_data, + two_way="int") + logging.info("Inverting treatment heatmaps") + line_heatmap = Path(str(line_heatmap).replace("_int_", "_treat_")) + invert_heatmaps(line_heatmap, line_stats_out_dir, line_reg_dir, line_input_data, + two_way="treat") + logging.info("Inverting genotype heatmaps") + line_heatmap = 
Path(str(line_heatmap).replace("_treat_", "_geno_")) + invert_heatmaps(line_heatmap, line_stats_out_dir, line_reg_dir, line_input_data, + two_way="geno") + logging.info('Finished writing heatmaps.') + else: + invert_heatmaps(line_heatmap, line_stats_out_dir, line_reg_dir, line_input_data) + logging.info('Finished writing heatmaps.') + logging.info(f"Finished processing line: {line_id} - All done") common.logMemoryUsageInfo() @@ -182,7 +217,8 @@ def run(config_path: Path, logging.info(f"Finish iterate through lines") line_input_data.cleanup() common.logMemoryUsageInfo() - break; + + break @@ -190,7 +226,8 @@ def run(config_path: Path, def invert_heatmaps(heatmap: Path, stats_outdir: Path, reg_outdir: Path, - input_: LineData): + input_: LineData, + two_way=False): """ Invert the stats heatmaps from a single line back onto inputs or registered volumes @@ -209,11 +246,28 @@ def invert_heatmaps(heatmap: Path, """ # Do some logging inverted_heatmap_dir = stats_outdir / 'inverted_heatmaps' - common.mkdir_force(inverted_heatmap_dir) - - for spec_id in input_.mutant_ids(): + os.makedirs(inverted_heatmap_dir, exist_ok=True) + + # KD note - baseline is done for the two-way too. + mut_specs = input_.mutant_ids() + if two_way: + inverted_heatmap_dir = inverted_heatmap_dir / str(two_way) + common.mkdir_force(inverted_heatmap_dir) + for i, spec_id in enumerate(mut_specs): # Should not have to specify the path to the inv config again - invert_config = reg_outdir / spec_id/ 'output' / 'inverted_transforms' / 'invert.yaml' - - inv = InvertHeatmap(invert_config, heatmap, inverted_heatmap_dir) - inv.run() + if two_way: + # so the reg_outdir is dependendent on condition + conds = ['baseline', 'mutants', 'treatment', 'mut_treat'] + # get the config file for the correct condition (only path that will exist) + + fnames = [reg_outdir.parent.parent.parent / cond / 'output' / cond / str( + spec_id) / 'output' / 'inverted_transforms' / PROPAGATE_CONFIG for cond in conds] + invert_config = [f for f in fnames if os.path.exists(f)][0] + + # tidy back to path + invert_config = Path(str(invert_config)) + else: + invert_config = reg_outdir / str(spec_id) / 'output' / 'inverted_transforms' / PROPAGATE_CONFIG + + inv = PropagateHeatmap(invert_config, heatmap, inverted_heatmap_dir) + inv.run() \ No newline at end of file diff --git a/lama/stats/standard_stats/read_config.py b/lama/stats/standard_stats/read_config.py index 0b3343ba..6437a7ac 100644 --- a/lama/stats/standard_stats/read_config.py +++ b/lama/stats/standard_stats/read_config.py @@ -60,7 +60,7 @@ def options(given, allowed): schema = { 'stats_types': { 'required': True, - 'validate': (seq, ['intensity', 'jacobians', 'organ_volume']) + 'validate': (seq, ['intensity', 'jacobians', 'organ_volume', 'radiomics']) }, 'blur_fwhm': { 'required': False, @@ -112,6 +112,10 @@ def options(given, allowed): 'normalise_organ_vol_to_mask': { 'required': False, 'validate' : [bool_] + }, + 'reference_vol': { + 'required': False, + 'validate' : [lambda x: isinstance(x, str)] } diff --git a/lama/stats/standard_stats/results_writer.py b/lama/stats/standard_stats/results_writer.py index 96d7d373..7853851c 100644 --- a/lama/stats/standard_stats/results_writer.py +++ b/lama/stats/standard_stats/results_writer.py @@ -37,7 +37,8 @@ def __init__(self, out_dir: Path, stats_name: str, label_map: np.ndarray, - label_info_path: Path): + label_info_path: Path, + two_way): """ TODO: map organ names back onto results Parameters @@ -54,7 +55,8 @@ def __init__(self, for creating filtered labelmap 
overlays label_info_path Label map information - + two_way + Flag for a two-way study Returns ------- @@ -67,12 +69,15 @@ def __init__(self, self.shape = results.input_.shape self.stats_name = stats_name self.line = results.input_.line + self.two_way = two_way # Write out the line-level results line_tstats = results.line_tstats line_qvals = results.line_qvals line_pvals = results.line_pvalues # Need to get thse into organ volumes + print("line pvals: ", np.shape(line_pvals), line_pvals) + print("line qvals: ", np.shape(line_qvals), line_qvals) logging.info('Writing line-level results...') line_threshold_file = self.out_dir / f'Qvals_{stats_name}_{self.line}.csv' @@ -91,14 +96,24 @@ def __init__(self, specimen_out_dir.mkdir(exist_ok=True) # For specimen-level results + + + for spec_id, spec_res in results.specimen_results.items(): spec_threshold_file = specimen_out_dir / f'Qvals_{stats_name}_{spec_id}.csv' spec_t = spec_res['t'] spec_q = spec_res['q'] spec_p = spec_res['p'] - write_threshold_file(spec_q, spec_t, spec_threshold_file) - self._write(spec_t, spec_p, spec_q, specimen_out_dir, spec_id) - + if self.two_way: + if np.any(np.shape(spec_t)) > 0: + write_threshold_file(spec_q, spec_t, spec_threshold_file) + self._write(spec_t, spec_p, spec_q, specimen_out_dir, spec_id) + + else: + write_threshold_file(spec_q, spec_t, spec_threshold_file) + self._write(spec_t, spec_p, spec_q, specimen_out_dir, spec_id) + + # self.log(self.out_dir, 'Organ_volume stats', results.input_) logging.info('Finished writing specimen-level results') @@ -141,18 +156,43 @@ def __init__(self, *args): super().__init__(*args) def _write(self, t_stats, pvals, qvals, outdir, name): - filtered_tstats = result_cutoff_filter(t_stats, qvals) - filtered_result = self.rebuild_array(filtered_tstats, self.shape, self.mask) - unfiltered_result = self.rebuild_array(t_stats, self.shape, self.mask) + + if self.two_way: + pvals = np.array_split(pvals, 3) + f_stats = np.array_split(t_stats, 3) + qvals = np.array_split(qvals, 3) + groups = ['geno', 'treat', 'int'] + + for i, f_stat in enumerate(f_stats): + + filtered_fstats = result_cutoff_filter(f_stat, qvals[i]) + filtered_result = self.rebuild_array(filtered_fstats, self.shape, self.mask) + unfiltered_result = self.rebuild_array(f_stat, self.shape, self.mask) + + heatmap_path = outdir / f'{name}_{self.stats_name}_{groups[i]}_f_fdr5.nrrd' + heatmap_path_unfiltered = outdir / f'{name}_{self.stats_name}_{groups[i]}_f.nrrd' + + # Write qval-filtered t-stats + write_array(filtered_result, heatmap_path, ras=True) + + # Write raw t-stats + write_array(unfiltered_result, heatmap_path_unfiltered, ras=True) + + else: + filtered_tstats = result_cutoff_filter(t_stats, qvals) + filtered_result = self.rebuild_array(filtered_tstats, self.shape, self.mask) + unfiltered_result = self.rebuild_array(t_stats, self.shape, self.mask) - heatmap_path = outdir / f'{name}_{self.stats_name}_t_fdr5.nrrd' - heatmap_path_unfiltered = outdir / f'{name}_{self.stats_name}_t.nrrd' + + + heatmap_path = outdir / f'{name}_{self.stats_name}_t_fdr5.nrrd' + heatmap_path_unfiltered = outdir / f'{name}_{self.stats_name}_t.nrrd' - # Write qval-filtered t-stats - write_array(filtered_result, heatmap_path, ras=True) + # Write qval-filtered t-stats + write_array(filtered_result, heatmap_path, ras=True) - # Write raw t-stats - write_array(unfiltered_result, heatmap_path_unfiltered, ras=True) + # Write raw t-stats + write_array(unfiltered_result, heatmap_path_unfiltered, ras=True) return heatmap_path @@ -177,7 +217,6 @@ def 
rebuild_array(array: np.ndarray, shape: Tuple, mask: np.ndarray) -> np.ndarr 3d rebuilt array """ - array[array > MINMAX_TSCORE] = MINMAX_TSCORE array[array < -MINMAX_TSCORE] = - MINMAX_TSCORE @@ -196,26 +235,58 @@ def __init__(self, *args): def _write(self, t_stats, pvals, qvals, out_dir, name): # write_csv(self.line_tstats, self.line_qvals, line_out_path, list(results.input_.data.columns), label_info) - out_path = out_dir / f'{name}_{self.stats_name}.csv' - df = pd.DataFrame.from_dict(dict(t=t_stats, p=pvals, q=qvals)) - - label_info = pd.read_csv(self.label_info_path) - - # Merge the results from each label to the label info - # The labels are stored in the InputData - labels = list(self.results.input_.data.columns) - df.index = labels - df.index = df.index.astype(np.int64) - df = df.merge(right=label_info, right_on='label', left_index=True) - - df['significant_bh_q_5'] = df['q'] < 0.05 - df.sort_values(by='q', inplace=True) - df.to_csv(out_path) - - hit_labels = df[df['significant_bh_q_5'] == True]['label'] - - thresh_labels_out = out_dir / f'{name}_hit_organs.nrrd' - # self._write_thresholded_label_map(self.label_map, hit_labels, thresh_labels_out) + + if self.two_way: + + pvals = np.array_split(pvals, 3) + f_stats = np.array_split(t_stats, 3) + qvals = np.array_split(qvals, 3) + + groups = ['geno', 'treat', 'int'] + + for i, f_stat in enumerate(f_stats): + + out_path = out_dir / f'{name}_{self.stats_name}_{groups[i]}.csv' + df = pd.DataFrame.from_dict(dict(f=f_stats[i], p=pvals[i], q=qvals[i])) + + label_info = pd.read_csv(self.label_info_path) + + # Merge the results from each label to the label info + # The labels are stored in the InputData + labels = list(self.results.input_.data.columns) + df.index = labels + df.index = df.index.astype(np.int64) + df = df.merge(right=label_info, right_on='label', left_index=True) + + df['significant_bh_q_5'] = df['q'] < 0.05 + df.sort_values(by='q', inplace=True) + df.to_csv(out_path) + + hit_labels = df[df['significant_bh_q_5'] == True]['label'] + + thresh_labels_out = out_dir / f'{name}_{groups[i]}_hit_organs.nrrd' + # self._write_thresholded_label_map(self.label_map, hit_labels, thresh_labels_out) + else: + out_path = out_dir / f'{name}_{self.stats_name}.csv' + df = pd.DataFrame.from_dict(dict(t=t_stats, p=pvals, q=qvals)) + + label_info = pd.read_csv(self.label_info_path) + + # Merge the results from each label to the label info + # The labels are stored in the InputData + labels = list(self.results.input_.data.columns) + df.index = labels + df.index = df.index.astype(np.int64) + df = df.merge(right=label_info, right_on='label', left_index=True) + + df['significant_bh_q_5'] = df['q'] < 0.05 + df.sort_values(by='q', inplace=True) + df.to_csv(out_path) + + hit_labels = df[df['significant_bh_q_5'] == True]['label'] + + thresh_labels_out = out_dir / f'{name}_hit_organs.nrrd' + # self._write_thresholded_label_map(self.label_map, hit_labels, thresh_labels_out) def _write_thresholded_label_map(self, label_map: np.ndarray, hits, out: Path): """ diff --git a/lama/stats/standard_stats/stats_objects.py b/lama/stats/standard_stats/stats_objects.py index 20b88fc3..d50c7af1 100644 --- a/lama/stats/standard_stats/stats_objects.py +++ b/lama/stats/standard_stats/stats_objects.py @@ -31,7 +31,8 @@ class Stats: def __init__(self, input_: LineData, stats_type: str, - use_staging: bool = True + use_staging: bool = True, + two_way: bool = False ): """ @@ -48,6 +49,7 @@ def __init__(self, self.stats_type_ = stats_type self.stats_runner = None self.use_staging = 
use_staging + self.two_way = two_way # The final results will be stored in these attributes self.line_qvals = None @@ -66,7 +68,9 @@ def factory(type_): def run_stats(self): - if self.use_staging: + if self.two_way: + logging.info('Using genotype, treatment and staging in linear model') + elif self.use_staging: logging.info('Using genotype and staging in linear model') else: logging.info('Using only genotype in linear model') @@ -90,7 +94,8 @@ def run_stats(self): current_chunk_size = data_chunk.shape[1] # Final chunk may not be same size - p_all, t_all = self.stats_runner(data_chunk, info, use_staging=self.use_staging) + p_all, t_all = self.stats_runner(data_chunk, info, use_staging=self.use_staging, two_way=self.two_way) + # Convert all NANs in the pvalues to 1.0. Need to check that this is appropriate p_all[np.isnan(p_all)] = 1.0 @@ -98,49 +103,103 @@ def run_stats(self): # Convert NANs to 0. We get NAN when for eg. all input values are 0 t_all[np.isnan(t_all)] = 0.0 - # Each chunk of results has the line -level results at the start - p_line = p_all[:current_chunk_size] - t_line = t_all[:current_chunk_size] + # chunk size isn't a thing in two_ways from what I can tell + if self.two_way: + #just append the line vals into the array + for index, pval in enumerate(p_all): + line_level_pvals.append(pval[:current_chunk_size]) + line_level_tvals.append(t_all[index][:current_chunk_size]) + + #only get the interaction ids (filt_flag) + mut_ids = self.input_.mutant_ids(filt_flag = True) + + for spec_num, id_ in enumerate(mut_ids): + start = current_chunk_size * (spec_num + 1) + end = current_chunk_size * (spec_num + 2) + + for index, pval in enumerate(p_all): + t_stat = t_all[index] + specimen_tstats[id_].append(t_stat[start:end]) #just add ll pvals to the row + specimen_pvals[id_].append(pval[start:end]) - line_level_pvals.append(p_line) - line_level_tvals.append(t_line) + else: + # Get the specimen-level statistics + # Each chunk of results has the line -level results at the start + p_line = p_all[:current_chunk_size] + t_line = t_all[:current_chunk_size] - # Get the specimen-level statistics - mut_ids = self.input_.mutant_ids() + line_level_pvals.append(p_line) + line_level_tvals.append(t_line) + mut_ids = self.input_.mutant_ids() - for spec_num, id_ in enumerate(mut_ids): - # After the line level result, the specimen-level results are appended to the result chunk - start = current_chunk_size * (spec_num + 1) - end = current_chunk_size * (spec_num + 2) + for spec_num, id_ in enumerate(mut_ids): + # After the line level result, the specimen-level results are appended to the result chunk + start = current_chunk_size * (spec_num + 1) + end = current_chunk_size * (spec_num + 2) - specimen_tstats[id_].append(t_all[start:end]) - specimen_pvals[id_].append(p_all[start:end]) + specimen_tstats[id_].append(t_all[start:end]) + specimen_pvals[id_].append(p_all[start:end]) # Stack the results chunks column-wise to get back to orginal shape line_pvals_array = np.hstack(line_level_pvals) line_tvals_array = np.hstack(line_level_tvals) - self.line_pvalues = line_pvals_array - self.line_qvals = fdr(line_pvals_array) + if self.two_way: + # split pvals and perform sperate fdr (multiple comparisons compensated within the lm) + pval_split = np.array_split(line_pvals_array, 3) + line_qvals = [] + + for index, array in enumerate(pval_split): + line_qvals.append(fdr(array)) + + # restack qvals + self.line_qvals = np.hstack(line_qvals) + + self.line_tstats = line_tvals_array - self.line_tstats = line_tvals_array + # 
Join up the results chunks for the specimen-level analysis. Do FDR correction on the pvalues + self.specimen_results = addict.Dict() + try: + for id_, p in list(specimen_pvals.items()): + p = np.hstack(p) + spec_p_split = np.array_split(p, 3) + q = [] - # Join up the results chunks for the specimen-level analysis. Do FDR correction on the pvalues - self.specimen_results = addict.Dict() + for index, array in enumerate(spec_p_split): + # perform seperate fdr for g, e and int - this is compensated within the lm + q.append(fdr(array)) - try: - for id_, p in list(specimen_pvals.items()): - p = np.hstack(p) - q = fdr(p) - t = np.hstack(specimen_tstats[id_]) - self.specimen_results[id_]['histogram'] = np.histogram(p, bins=100)[0] - self.specimen_results[id_]['q'] = q - self.specimen_results[id_]['t'] = t - self.specimen_results[id_]['p'] = p - except Exception as e: - logging.info(p) + #restack q + q = np.hstack(q) + t = np.hstack(specimen_tstats[id_]) + self.specimen_results[id_]['histogram'] = np.histogram(p, bins=100)[0] + self.specimen_results[id_]['q'] = q + self.specimen_results[id_]['t'] = t + self.specimen_results[id_]['p'] = p + except Exception as e: + logging.info(p) + + else: + self.line_qvals = fdr(line_pvals_array) + + self.line_tstats = line_tvals_array + + # Join up the results chunks for the specimen-level analysis. Do FDR correction on the pvalues + self.specimen_results = addict.Dict() + try: + for id_, p in list(specimen_pvals.items()): + p = np.hstack(p) + q = fdr(p) + t = np.hstack(specimen_tstats[id_]) + self.specimen_results[id_]['histogram'] = np.histogram(p, bins=100)[0] + self.specimen_results[id_]['q'] = q + self.specimen_results[id_]['t'] = t + self.specimen_results[id_]['p'] = p + + except Exception as e: + logging.info(p) class Intensity(Stats): def __init__(self, *args): @@ -197,6 +256,3 @@ def fdr(pvals: np.ndarray) -> np.ndarray: os.remove(pval_file) return result - - - diff --git a/lama/tests/__init__.py b/lama/tests/__init__.py index 8600fcb4..c2388ba1 100644 --- a/lama/tests/__init__.py +++ b/lama/tests/__init__.py @@ -5,8 +5,13 @@ # When running lama,. 
first set the environment up using 'source lama_env.sh' # But for the tests, we shall just set it here import sys +import yaml + lama_dir = Path.cwd().parent sys.path.insert(0, lama_dir) +from yaml import FullLoader + + current_dir = Path(__file__).parent test_data_root = current_dir / 'test_data' @@ -18,7 +23,7 @@ stats_test_data_dir = test_data_root / 'stats_test_data' stats_output_dir = stats_test_data_dir / 'test_output' -stats_config_dir = stats_test_data_dir / 'config_files' +stats_config_dir = current_dir / 'configs' permutation_stats_dir = stats_test_data_dir / 'permutation_stats' qc_flags_dir = registration_root / 'qc_flag_files' @@ -33,4 +38,42 @@ population_test_dir = test_data_root / 'population_average_data' +#for Kyle's perm testing + +cfg_path = Path( + "C:/LAMA/lama/tests/configs/permutation_stats/perm_no_qc.yaml") +def p(path): + if path is None: + return + + cfg_dir = Path(cfg_path).parent + + resolved = (cfg_dir / path).resolve() + + print(resolved) + if not resolved.exists(): + raise FileNotFoundError(f'Cannot find: {resolved}') + return resolved + + +with open(cfg_path, 'r') as fh: + cfg = yaml.load(fh, Loader=FullLoader) + +wt_dir = p(cfg['wildtype_dir']) +mut_dir = p(cfg['mutant_dir']) + +out_dir = p(cfg.get('output_dir', Path(cfg_path).parent)) + + + +n_perm = int(cfg.get('n_permutations', 1000)) +label_meta = p(cfg.get('label_metadata')) +label_map = p(cfg.get('label_map')) +wev_norm = bool(cfg.get('norm_to_whole_embryo_vol', True)) +qc_file = p(cfg.get('qc_file')) +voxel_size = float(cfg.get('voxel_size', 1.0)) + +treat_dir = p(cfg['treatment_dir']) +inter_dir = p(cfg['interaction_dir']) +two_way = bool(cfg.get('two_way', False)) diff --git a/lama/tests/configs/lama_radiomics/radiomics_config.toml b/lama/tests/configs/lama_radiomics/radiomics_config.toml new file mode 100644 index 00000000..607e4381 --- /dev/null +++ b/lama/tests/configs/lama_radiomics/radiomics_config.toml @@ -0,0 +1,21 @@ +# Radiomics config v1 - Kyle Drover + +target_dir = 'E:\220204_BQ_dataset\scans_for_sphere_creation' + +# labels of interest +labs_of_int = '1' + +norm_methods = 'subtraction' + +norm_label = true + +#spherify = 0 + +ref_vol_path = 'E:\220204_BQ_dataset\scans_for_sphere_creation\rigids\200721_MPTLVo3_GFSeeds_4T1R_4T1R_D7_C1_002.nrrd' + + +stage_dir = 'stage_labels' + +scan_dir = 'imgs' + +tumour_dir = 'sphere_15' diff --git a/lama/tests/configs/lama_radiomics/radiomics_config_gina.toml b/lama/tests/configs/lama_radiomics/radiomics_config_gina.toml new file mode 100644 index 00000000..377e4e7b --- /dev/null +++ b/lama/tests/configs/lama_radiomics/radiomics_config_gina.toml @@ -0,0 +1,20 @@ +# Radiomics config v1 - Kyle Drover + +target_dir = 'C:\gina\g_by_back_data' + +# labels of interest +labs_of_int = '6, 7, 15' + +norm_methods = 'histogram' + +norm_label = false + +#spherify = 0 + +#ref_vol_path = 'C:\gina\target\E17.5_Weighted_Average-16bit_cropped_BSpline.nrrd' + + +#mask_dir = '' + +#scan_dir = '' + diff --git a/lama/tests/configs/permutation_stats/__init__.py b/lama/tests/configs/permutation_stats/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lama/tests/configs/permutation_stats/perm_no_qc.yaml b/lama/tests/configs/permutation_stats/perm_no_qc.yaml new file mode 100644 index 00000000..c0a65e26 --- /dev/null +++ b/lama/tests/configs/permutation_stats/perm_no_qc.yaml @@ -0,0 +1,23 @@ +# Test config for for lama permutation stats +# All paths are resolved relative to this config file + +# Required parameters +wildtype_dir : 
E:/221122_two_way/g_by_back_data/baseline +mutant_dir : E:/221122_two_way/g_by_back_data/mutants +treatment_dir : E:/221122_two_way/g_by_back_data/treatment +interaction_dir : E:/221122_two_way/g_by_back_data/mut_treat +output_dir: E:/221122_two_way/permutation_stats/rad_perm_all_feats + +# Optional parameters +n_permutations: 20000 # default 1000 +label_metadata: E:/221122_two_way/target/E14_5_atlas_v24_43_label_info_v2.csv +label_map: E:/221122_two_way/target/210713_C3H_atlas_n18.nrrd +norm_to_whole_embryo_vol: True # default true +qc_file: null # Path to qc file, null or omit parameter +# For calculating organ volumes in real values +voxel_size: 40.0 +two_way: True # makes the two-way perm test + +rad_dir: E:/221122_two_way/g_by_back_data/radiomics_output_for_text + +spec_fdr: 0.2 \ No newline at end of file diff --git a/lama/tests/configs/permutation_stats/perm_no_qc_just_ovs.yaml b/lama/tests/configs/permutation_stats/perm_no_qc_just_ovs.yaml new file mode 100644 index 00000000..a0a3696b --- /dev/null +++ b/lama/tests/configs/permutation_stats/perm_no_qc_just_ovs.yaml @@ -0,0 +1,23 @@ +# Test config for lama permutation stats +# All paths are resolved relative to this config file + +# Required parameters +wildtype_dir : E:/221122_two_way/g_by_back_data/baseline +mutant_dir : E:/221122_two_way/g_by_back_data/mutants +treatment_dir : E:/221122_two_way/g_by_back_data/treatment +interaction_dir : E:/221122_two_way/g_by_back_data/mut_treat +output_dir: E:/221122_two_way/permutation_stats/test_just_ovs + +# Optional parameters +n_permutations: 20000 # default 1000 +label_metadata: E:/221122_two_way/target/E14_5_atlas_v24_43_label_info.csv +label_map: E:/221122_two_way/target/210713_C3H_atlas_n18.nrrd +norm_to_whole_embryo_vol: True # default true +qc_file: null # Path to qc file, null or omit parameter +# For calculating organ volumes in real values +voxel_size: 40.0 +two_way: True # makes the two-way perm test + +#rad_dir: E:/221122_two_way/g_by_back_data/radiomics_output_for_text + +#spec_fdr: 0.2 \ No newline at end of file diff --git a/lama/tests/configs/permutation_stats/perm_qc.yaml b/lama/tests/configs/permutation_stats/perm_qc.yaml new file mode 100644 index 00000000..bf5da6a8 --- /dev/null +++ b/lama/tests/configs/permutation_stats/perm_qc.yaml @@ -0,0 +1,16 @@ +# Test config for lama permutation stats +# All paths are resolved relative to this config file + +# Required parameters +wildtype_dir: ../../test_data/stats_test_data/permutation_stats/registration_data/baseline/output +mutant_dir: ../../test_data/stats_test_data/permutation_stats/registration_data/mutant/output +output_dir: ../../test_data/stats_test_data/permutation_stats/output_with_hits + +# Optional parameters +n_permutations: 8 # default 1000 +label_metadata: ../../test_data/mutant_and_baseline_data/target/label_info.csv +label_map: ../../test_data/mutant_and_baseline_data/target/labels.nrrd +norm_to_whole_embryo_vol: true # default true +qc_file: null +# For calculating organ volumes in real values +voxel_size: 27.0 diff --git a/lama/tests/configs/permutation_stats/temp.yaml b/lama/tests/configs/permutation_stats/temp.yaml new file mode 100644 index 00000000..9419f86f --- /dev/null +++ b/lama/tests/configs/permutation_stats/temp.yaml @@ -0,0 +1,9 @@ +label_map: ../../test_data/mutant_and_baseline_data/target/labels.nrrd +label_metadata: ../../test_data/mutant_and_baseline_data/target/label_info.csv +mutant_dir: ../../test_data/stats_test_data/permutation_stats/registration_data/mutant/output +n_permutations:
8 +norm_to_whole_embryo_vol: true +output_dir: ../../test_data/stats_test_data/permutation_stats/qc_5 +qc_file: /home/neil/git/lama_phenotype_detection/lama/tests/test_data/mutant_and_baseline_data/qc_files/qc_5.csv +voxel_size: 27.0 +wildtype_dir: ../../test_data/stats_test_data/permutation_stats/registration_data/baseline/output diff --git a/lama/tests/configs/permutation_stats/tempnotdo/perm.yaml b/lama/tests/configs/permutation_stats/tempnotdo/perm.yaml new file mode 100644 index 00000000..4ea0eb4f --- /dev/null +++ b/lama/tests/configs/permutation_stats/tempnotdo/perm.yaml @@ -0,0 +1,16 @@ +# Test config for lama permutation stats +# All paths are resolved relative to this config file + +# Required parameters +wildtype_dir: ../../test_data/mutant_and_baseline_data/baseline/output +mutant_dir: ../../test_data/mutant_and_baseline_data/mutant/output +output_dir: ../../test_data/stats_test_data/permutation_stats/output_with_hits + +# Optional parameters +n_permutations: 20 # default 1000 +label_metadata: ../../test_data/mutant_and_baseline_data/target/label_info.csv +label_map: ../../test_data/mutant_and_baseline_data/target/labels.nrrd +norm_to_whole_embryo_vol: true # default true +qc_file: null # Path to qc file, null or omit parameter +# For calculating organ volumes in real values +voxel_size: 27.0 diff --git a/lama/tests/configs/standard_stats/generate_data.toml b/lama/tests/configs/standard_stats/generate_data.toml new file mode 100644 index 00000000..04e4fd4c --- /dev/null +++ b/lama/tests/configs/standard_stats/generate_data.toml @@ -0,0 +1,167 @@ +### +target_folder = "E:/Bl6_data/211014_g_by_back/target" +threads = 8 +filetype = "nrrd" +fixed_volume = "210602_C3H_avg_n18.nrrd" +fixed_mask = "fixed_mask.nrrd" +stats_mask = "stats_mask.nrrd" +label_map = "210713_C3H_atlas_n18.nrrd" +label_info = "E14_5_atlas_v24_43_label_info.csv" +generate_new_target_each_stage = false +skip_transform_inversion = false +staging = "embryo_volume" +label_propagation = 'reverse_registration' +skip_forward_registration = false +fix_folding = true + +[generate_deformation_fields] +160_to_6 = ["deformable_160", "deformable_80", "deformable_40", "deformable_20", "deformable_12", "deformable_8", "deformable_6"] + + +[[registration_stage_params]] +stage_id = "rigid" + +[registration_stage_params.elastix_parameters] +Metric = "AdvancedNormalizedCorrelation" +Registration = "MultiResolutionRegistration" +MaximumNumberOfIterations = 400 +NumberOfResolutions = 2 +NumberOfSpatialSamples = 100000 +Transform = "EulerTransform" +SP_a = [ 1000.0, 1000.0, 500.0, 500.0,] +SP_alpha = 0.602 +SP_A = 50.0 +FixedLimitRangeRatio = 0.0 +MovingLimitRangeRatio = 0.0 +FixedKernelBSplineOrder = 1 +MovingKernelBSplineOrder = 3 +UseDifferentiableOverlap = "false" +[[registration_stage_params]] +stage_id = "affine" + +[registration_stage_params.elastix_parameters] +Registration = "MultiResolutionRegistration" +NumberOfResolutions = 2 +Transform = "AffineTransform" +Metric = "AdvancedNormalizedCorrelation" +MaximumNumberOfIterations = 500 +NumberOfSpatialSamples = 1000000 + +### def1 +[[registration_stage_params]] +stage_id = "deformable_160" +[registration_stage_params.elastix_parameters] +Registration = "MultiResolutionRegistration" +NumberOfResolutions = 1 +NumberOfSpatialSamples = 20000 +MaximumStepLength = 3.0 +NumberOfGradientMeasurements = 10 +NumberOfSamplesForExactGradient = 10000 +NumberOfJacobianMeasurements = 4000 +MaximumNumberOfIterations = 1000 +AutomaticParameterEstimation = "true" +UseAdaptiveStepSizes =
"true" +#ASGDParameterEstimationMethod = "DisplacementDistribution" +Transform = "BSplineTransform" +Metric = "AdvancedMattesMutualInformation" +FinalGridSpacingInVoxels = 160 +FixedImagePyramidSchedule = [6] +MovingImagePyramidSchedule = [6] + +##def3 +[[registration_stage_params]] +stage_id = "deformable_80" +inherit_elx_params = "deformable_160" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +FinalGridSpacingInVoxels = 80 +FixedImagePyramidSchedule = [5] +MovingImagePyramidSchedule = [5] + +##def4 +[[registration_stage_params]] +stage_id = "deformable_40" +inherit_elx_params = "deformable_80" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +MaximumStepLength = 2.0 +FinalGridSpacingInVoxels = 40 +FixedImagePyramidSchedule = [4] +MovingImagePyramidSchedule = [4] + +##def5 +[[registration_stage_params]] +stage_id = "deformable_20" +inherit_elx_params = "deformable_40" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +Metric = "AdvancedMattesMutualInformation" +Registration = "MultiResolutionRegistration" +FinalGridSpacingInVoxels = 20 +MaximumStepLength = 1.0 +FixedImagePyramidSchedule = [3] +MovingImagePyramidSchedule = [3] + + +##def6 +[[registration_stage_params]] +stage_id = "deformable_12" +inherit_elx_params = "deformable_20" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +MaximumStepLength = 1.0 +FinalGridSpacingInVoxels = 12 +FixedImagePyramidSchedule = [2] +MovingImagePyramidSchedule = [2] + +##def7 +[[registration_stage_params]] +stage_id = "deformable_8" +inherit_elx_params = "deformable_12" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +MaximumStepLength = 1.0 +FinalGridSpacingInVoxels = 8 +FixedImagePyramidSchedule = [1] +MovingImagePyramidSchedule = [1] + +##def8 +[[registration_stage_params]] +stage_id = "deformable_6" +inherit_elx_params = "deformable_12" +[registration_stage_params.elastix_parameters] +NumberOfResolutions = 1 +MaximumStepLength = 0.8 +FinalGridSpacingInVoxels = 6 +FixedImagePyramidSchedule = [1] +MovingImagePyramidSchedule = [1] + +[global_elastix_params] +FixedInternalImagePixelType = "float" +MovingInternalImagePixelType = "float" +FixedImageDimension = 3 +MovingImageDimension = 3 +UseDirectionCosines = "true" +FixedImagePyramid = "FixedSmoothingImagePyramid" +MovingImagePyramid = "MovingSmoothingImagePyramid" +ResultImagePixelType = "float" +ResultImageFormat = "nrrd" +CompressResultImage = "true" +Interpolator = "BSplineInterpolator" +ResampleInterpolator = "FinalBSplineInterpolator" +Resampler = "DefaultResampler" +NumberOfHistogramBins = 32 +HowToCombineTransforms = "Compose" +NewSamplesEveryIteration = "true" +ImageSampler = "RandomCoordinate" +FinalBSplineInterpolationOrder = 3 +BSplineInterpolationOrder = 3 +DefaultPixelValue = 0 +WriteTransformParametersEachIteration = "false" +WriteResultImage = "false" +WriteResultImageAfterEachResolution = "false" +AutomaticScalesEstimation = "true" +AutomaticTransformInitialization = "true" +Optimizer = "AdaptiveStochasticGradientDescent" +UseRandomSampleRegion = "true" +MaximumNumberOfSamplingAttempts = 10 diff --git a/lama/tests/configs/standard_stats/stats.toml b/lama/tests/configs/standard_stats/stats.toml new file mode 100644 index 00000000..6309e549 --- /dev/null +++ b/lama/tests/configs/standard_stats/stats.toml @@ -0,0 +1,29 @@ +# This is the new (281118) stats config for the standard stats pipeline +# Modified by Kyle for two-way studies +stats_types = [ +'intensity', 
+'jacobians', +'organ_volumes' +] + +# This is the final folder in the registration scheme +reg_folder = 'deformable_192_to_10' +# The final Jacobian determinant folder +jac_folder = '192_to_10' + +# Tight mask for restricting the analysis to +mask = 'stats_mask.nrrd' +label_info = 'E14_5_atlas_v24_40_label_info_nouse.csv' +label_map = '210223_E14.5_C3H_atlas.nrrd' +blur_fwhm = 100 +voxel_size = 40.0 # this may need changing +invert_stats = false + +# Linearly normalise intensity data to the mean intensity within the mask +normalise = 'histogram' + +# Have whole embryo volume in the linear model to account for developmental substage +use_staging = true + +# Enable Two-way study for interaction effects +two_way = true diff --git a/lama/tests/debug.py b/lama/tests/debug.py new file mode 100644 index 00000000..2987c178 --- /dev/null +++ b/lama/tests/debug.py @@ -0,0 +1,51 @@ +""" +For development and debugging, this test can be run. It will use the config debug.toml + +Usage: pytest -qq -m "not notest" debug.py +""" + +from pathlib import Path +from lama.registration_pipeline import run_lama +import logzero + +import os +import shutil +import pytest +import logging +from lama.registration_pipeline import run_lama +from lama.tests import (registration_root, mut_registration_dir, wt_registration_dir, test_data_root) + + +# def delete_previous_files(): +# """ +# Remove the output generated from previous tests. This does not occur directly after the test as we may want to +# look at the results. +# """ +# def delete(root: Path): +# shutil.rmtree(root / 'output', ignore_errors=True) +# for p in root.iterdir(): +# if str(p).endswith(('.log', 'jobs.csv', 'csv.lock', '.yaml')): +# p.unlink() +# +# delete(wt_registration_dir) +# delete(mut_registration_dir) + + +# @pytest.mark.notest +def test_lama_job_runner(): + """ + Test the lama job runner which was made to utilise multiple machines or the grid. + This test just uses one machine for the tests at the moment. + test_make_jobs_file() should run before this to create a jobs file that can be consumed. + This test should be run before the stats test as it creates data that the stats test needs. + + + NOTE this test should be at the bottom of the file as it should be run last + The outputs of these tests are consumed by the stats test.
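# [Editor's aside, not part of the diff] The stats.toml added above configures the two-way
# standard stats run (normalisation, staging, the two_way flag). A minimal sketch of loading
# and sanity-checking such a config before handing it to the pipeline; the required-key list
# and the False default for 'two_way' are assumptions made for illustration:
import toml

def load_stats_cfg(path) -> dict:
    with open(path) as fh:
        cfg = toml.load(fh)
    for key in ('stats_types', 'mask', 'label_map', 'label_info'):
        if key not in cfg:
            raise KeyError(f'stats config is missing required key: {key}')
    cfg.setdefault('two_way', False)  # two-way interaction modelling is opt-in
    return cfg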
+ """ + + config = test_data_root / 'debugging' / 'debug.toml' + + run_lama.run(config) + + assert True \ No newline at end of file diff --git a/lama/tests/pytest.ini b/lama/tests/pytest.ini new file mode 100644 index 00000000..92217e39 --- /dev/null +++ b/lama/tests/pytest.ini @@ -0,0 +1,3 @@ +# content of pytest.ini +[pytest] +norecursedirs = archive working_on diff --git a/lama/tests/run_tests.sh b/lama/tests/run_tests.sh old mode 100644 new mode 100755 index a1d180c9..a04e731d --- a/lama/tests/run_tests.sh +++ b/lama/tests/run_tests.sh @@ -1 +1,7 @@ - pytest -qq -m "not notest" $[0]/test_data_generation.py \ No newline at end of file +#!/usr/bin/env bash + +script_dir=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) + +pytest -m "not notest" "$script_dir"/test_data_generation.py; # Run this first as the stats tests need the output +pytest -m "not notest" "$script_dir"/test_standard_stats.py; +pytest -m "not notest" "$script_dir"/test_permutation_stats.py; \ No newline at end of file diff --git a/lama/tests/test_data_generation.py b/lama/tests/test_data_generation.py index 10e4fc41..3036d89d 100644 --- a/lama/tests/test_data_generation.py +++ b/lama/tests/test_data_generation.py @@ -7,16 +7,16 @@ from pathlib import Path from lama.registration_pipeline import run_lama +import logzero import os import shutil import pytest - +import logging from lama.scripts import lama_job_runner from lama.tests import (registration_root, mut_registration_dir, wt_registration_dir) -@pytest.fixture def delete_previous_files(): """ Remove the output generated from previous tests. This does not occur directly after the test as we may want to @@ -31,49 +31,6 @@ def delete(root: Path): delete(wt_registration_dir) delete(mut_registration_dir) -# @pytest.mark.notest -def test_make_jobs_file(): - - - config_file = registration_root / 'registration_config.toml' - - lama_job_runner.lama_job_runner(config_file, wt_registration_dir, make_job_file=True) - lama_job_runner.lama_job_runner(config_file, mut_registration_dir, make_job_file=True) - -@pytest.mark.notest -def test_lama_job_runner_reverse_reg_only(): - """ - Tests out doing only the propagation of atlas to input images. The first stage of the noraml foraward registration - of input the pop abg is carried out. This creates the rigid registered input which is then used as target to - map atlas to. - """ - config_file = registration_root / 'registration_config_reverse_reg_only.toml' - assert lama_job_runner.lama_job_runner(config_file, mut_registration_dir) is True - -@pytest.mark.notest -def test_lama_job_runner_pyramid(): - """ - map atlas to. - """ - config_file = registration_root / 'registration_config.toml' - assert lama_job_runner.lama_job_runner(config_file, mut_registration_dir) is True - - -@pytest.mark.notest -def test_lama_reg(): - """ - Test using the lama registration script without jaob runner wrapepr - Returns - ------- - - """ - # delete_previous_files() - config_file = registration_root / 'registration_config.toml' - # Needs to be in same folder as inputs - dest = wt_registration_dir / 'registration_config.toml' - # shutil.copyfile(config_file, dest) - assert run_lama.run(dest) is True - # @pytest.mark.notest def test_lama_job_runner(): @@ -87,20 +44,83 @@ def test_lama_job_runner(): NOTE this test should be at bottom of file as it should be ru last The oututs of these tests are consumed by the stats test. 
""" - # config_file = registration_root / 'registration_config.toml' - config_file = registration_root / 'registration_config.toml' - assert lama_job_runner.lama_job_runner(config_file, wt_registration_dir) is True - assert lama_job_runner.lama_job_runner(config_file, mut_registration_dir) is True + configs = registration_root.glob('*.toml') -@pytest.mark.notest -def test_lama_job_runner_secondary_segmentation(): - """ - Tests out doing only the propagation of atlas to input images. The first stage of the noraml foraward registration - of input the pop abg is carried out. This creates the rigid registered input which is then used as target to - map atlas to. - """ - config_file = registration_root / 'registration_config_reverse_reg_only.toml' - assert lama_job_runner.lama_job_runner(config_file, mut_registration_dir) is True + for cfg in configs: + delete_previous_files() + + print(f"\n{'#'*8} Doing config {cfg.name} {'#'*8}") + + lama_job_runner.lama_job_runner(cfg, wt_registration_dir, make_job_file=True, log_level=logging.ERROR) + lama_job_runner.lama_job_runner(cfg, wt_registration_dir, log_level=logging.ERROR) + + lama_job_runner.lama_job_runner(cfg, mut_registration_dir, make_job_file=True, log_level=logging.ERROR) + lama_job_runner.lama_job_runner(cfg, mut_registration_dir, log_level=logging.ERROR) + # return # Just do the first + + + +#TODO +# QC subset test + + + +# +# @pytest.mark.notest +# def test_make_jobs_file(): +# +# +# config_file = registration_root / 'registration_config.toml' +# +# lama_job_runner.lama_job_runner(config_file, wt_registration_dir, make_job_file=True) +# lama_job_runner.lama_job_runner(config_file, mut_registration_dir, make_job_file=True) +# +# @pytest.mark.notest +# def test_lama_job_runner_reverse_reg_only(): +# """ +# Tests out doing only the propagation of atlas to input images. The first stage of the noraml foraward registration +# of input the pop abg is carried out. This creates the rigid registered input which is then used as target to +# map atlas to. +# """ +# config_file = registration_root / 'registration_config_reverse_reg_only.toml' +# assert lama_job_runner.lama_job_runner(config_file, mut_registration_dir) is True +# +# @pytest.mark.notest +# def test_lama_job_runner_pyramid(): +# """ +# map atlas to. +# """ +# config_file = registration_root / 'registration_config.toml' +# assert lama_job_runner.lama_job_runner(config_file, mut_registration_dir) is True +# +# +# @pytest.mark.notest +# def test_lama_reg(): +# """ +# Test using the lama registration script without jaob runner wrapepr +# Returns +# ------- +# +# """ +# # delete_previous_files() +# config_file = registration_root / 'registration_config.toml' +# # Needs to be in same folder as inputs +# dest = wt_registration_dir / 'registration_config.toml' +# # shutil.copyfile(config_file, dest) +# assert run_lama.run(dest) is True +# +# +# +# +# @pytest.mark.notest +# def test_lama_job_runner_secondary_segmentation(): +# """ +# Tests out doing only the propagation of atlas to input images. The first stage of the noraml foraward registration +# of input the pop abg is carried out. This creates the rigid registered input which is then used as target to +# map atlas to. 
+# """ +# config_file = registration_root / 'registration_config_reverse_reg_only.toml' +# assert lama_job_runner.lama_job_runner(config_file, mut_registration_dir) is True diff --git a/lama/tests/test_distributions.py b/lama/tests/test_distributions.py new file mode 100644 index 00000000..e475b0cc --- /dev/null +++ b/lama/tests/test_distributions.py @@ -0,0 +1,10 @@ +import pandas as pd +from lama.stats.permutation_stats.distributions import generate_random_combinations + +def test_generate_random_combinations(): + df = pd.read_csv('/home/neil/Desktop/data.csv', index_col=0) + generate_random_combinations(df,30) + + +if __name__ == '__main__': + test_generate_random_combinations() \ No newline at end of file diff --git a/lama/tests/test_permutation_stats.py b/lama/tests/test_permutation_stats.py new file mode 100644 index 00000000..7224dca6 --- /dev/null +++ b/lama/tests/test_permutation_stats.py @@ -0,0 +1,223 @@ +""" +Test the permutation-based stats pipeline + +Usage +----- + +These functions test the lama registration pipeline permutation stats module + +Usage: pytest -q -x -s -m "not notest" --tb=short test_run_permutation_stats.py +""" + +import shutil +from pathlib import Path +import pytest +import pandas as pd +import tempfile + +import yaml + +# from lama.stats.permutation_stats import run_permutation_stats +from lama.scripts import lama_permutation_stats +from lama.tests import (test_data_root, registration_root, wt_registration_dir, mut_registration_dir, + permutation_stats_dir, qc_flags_dir, stats_config_dir) +from lama.common import LamaDataException, read_spec_csv + + +# @pytest.fixture(scope="session", autouse=True) +# def remove_previous_output(): +# """ +# Remove previous output from this test module. We do not do this after each test as manual inspection of the output +# may be necessary. this runs before all other tests. +# """ +# shutil.rmtree(permutation_stats_dir, ignore_errors=True) +# permutation_stats_dir.mkdir() + + +# @pytest.mark.notest +# def test_permutation_stats_no_hits(): +# """ +# Run the whole permutation based stats pipeline. +# Currently this just checks to see if the pipeline completes without any errors. +# """ +# outdir = permutation_stats_dir / 'output_no_hits' +# outdir.mkdir() +# num_perms = 100 # Would do 1000 or more normally +# label_info = registration_root / 'target' / 'label_info.csv' +# label_map = registration_root / 'target' / 'labels.nrrd' +# +# run_permutation_stats.run(wt_registration_dir / 'output', mut_registration_dir / 'output', outdir, num_perms, +# label_info=label_info, label_map_path=label_map) +# # Without label meta file +# run_permutation_stats.run(wt_registration_dir / 'output', mut_registration_dir / 'output', outdir, num_perms, +# +# label_map_path=label_map) +@pytest.fixture(scope="session", autouse=True) +def copy_data(): + """ + Copy data over from the registration test output. 
+ Modify some data so we get some hits + """ + outdir = permutation_stats_dir / 'output_with_hits' + reg_root = permutation_stats_dir / 'registration_data' + outdir.mkdir(exist_ok=True) + reg_root.mkdir(exist_ok=True) + wt_dir = reg_root / wt_registration_dir.name + mut_dir = reg_root / mut_registration_dir.name + shutil.rmtree(wt_dir, ignore_errors=True) + shutil.rmtree(mut_dir, ignore_errors=True) + shutil.copytree(wt_registration_dir, wt_dir ) + shutil.copytree(mut_registration_dir, mut_dir) + + # Now alter the organ volumes so we get some hits + for ov_file in mut_dir.rglob('organ_volumes.csv'): + df = read_spec_csv(ov_file) + # Make organ 1 larger and organ 2 smaller + df[['1']] *= 100 + df[['2']] /= 200 + df.to_csv(ov_file) + + +@pytest.mark.notest +def test_permutation_stats_with_qc(): + """ + Run the permutation stats pipeline once for each QC file in the registration test data. + """ + + cfg_dir = stats_config_dir / 'permutation_stats' + cfg_file = cfg_dir / 'perm_qc.yaml' + + qc_file_dir = registration_root / 'qc_files' + + for qc_file in qc_file_dir.iterdir(): + # if not qc_file.name.endswith('4.csv'): + # continue + if 'temp' in qc_file.name: + continue + + # if '5' not in qc_file.name: + # continue # Debug + + + with open(cfg_file, 'r') as fh: + cfg = yaml.load(fh, Loader=yaml.FullLoader) + cfg['qc_file'] = str(qc_file) + outdir = Path(cfg['output_dir']).parent / qc_file.stem + cfg['output_dir'] = str(outdir) + + resolved = (cfg_dir / outdir).resolve() + resolved.mkdir(exist_ok=True) + # Can't use tempfile here as the paths in the config are resolved using the config dir parent + temp = cfg_dir / 'temp.yaml' + with open(temp, 'w') as fh: + fh.write(yaml.dump(cfg)) + + lama_permutation_stats.run(temp) + + + +@pytest.mark.notest +def test_permutation_stats(): + """ + Run the whole permutation based stats pipeline. + Copy the output from a LAMA registration test run, and increase or decrease the volume of the mutants so we get + some hits + + """ + + # cfg_dir = stats_config_dir / 'permutation_stats' + # + # for cfg_file in cfg_dir.iterdir(): + # if not cfg_file.name.endswith('.yaml'): + # continue + # lama_permutation_stats.run(cfg_file) + + + + cfg_file = stats_config_dir / 'permutation_stats' / 'perm_no_qc.yaml' + + + lama_permutation_stats.run(cfg_file) + + + + # # Without label meta file + # output_no_metdata = permutation_stats_dir / 'output_with_hits_no_metadata' + # output_no_metdata.mkdir(exist_ok=True) + # lama_permutation_stats.run(wt_registration_dir / 'output', mut_registration_dir / 'output', output_no_metdata, num_perms, + # label_map_path=label_map) + +# @pytest.mark.notest +# def test_permutation_stats_with_qc_flaggs(): +# """ +# Run the permutations stats but include a specimen/organ-level qc file to exclude qc-flagged organs +# """ +# num_perms = 5 # Would do 1000 or more normally +# label_info = registration_root / 'target' / 'label_info.csv' +# label_map = registration_root / 'target' / 'labels.nrrd' +# +# for qc_file in qc_flags_dir.iterdir(): +# +# out_dir = permutation_stats_dir / qc_file.stem # Intermediate results go here. Permutation distributions etc.
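# [Editor's aside, not part of the diff] On the "Can't use tempfile here" comment in
# test_permutation_stats_with_qc above: tempfile can be pointed at the config directory with
# dir=, which keeps the temporary YAML next to perm_qc.yaml so its relative paths still
# resolve. A sketch, assuming the same cfg dict and cfg_dir variables as in that test:
import tempfile
import yaml
from pathlib import Path

def write_temp_cfg(cfg: dict, cfg_dir: Path) -> Path:
    # delete=False so the permutation stats run can reopen the file by path afterwards
    with tempfile.NamedTemporaryFile('w', dir=cfg_dir, suffix='.yaml', delete=False) as fh:
        yaml.dump(cfg, fh)
        return Path(fh.name)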
+# out_dir.mkdir() +# +# if 'error' in str(qc_file): +# # These qc flag files with errors in should raise a LamaDataError +# with pytest.raises(LamaDataException): +# run_permutation_stats.run(wt_registration_dir, mut_registration_dir, out_dir, num_perms, +# label_info=label_info, label_map_path=label_map, qc_file=qc_file) +# +# else: +# run_permutation_stats.run(wt_registration_dir, mut_registration_dir, out_dir, num_perms, +# label_info=label_info, label_map_path=label_map, qc_file=qc_file) + + +# @pytest.mark.notest +# def test_p_thresholds(): +# """ +# Testing the p_thresholds calculation +# ------- +# TODO: Add more tests for different cases +# """ +# import copy +# import pandas as pd +# +# # These simulate null distributions from 100 baselines for two organs +# null = pd.DataFrame.from_dict({ +# '1': np.linspace(0.01, 1, 100), +# '2': np.linspace(0.01, 1, 100)}) +# +# # These simulate alternative distributions for two organs from 40 lines +# alt = pd.DataFrame.from_dict({ +# '1': np.linspace(0.00000001, 0.1, 40), +# '2': np.linspace(0.9, 1, 40)}) +# +# thresh = p_thresholds.get_thresholds(null, alt) +# +# assert thresh.loc[1, 'p_thresh'] == 0.02 # Gives a p-value threshold of 0.02 +# assert thresh.loc[2, 'p_thresh'] == 1.0 # Gives a p-value threshold of 1.0 as there are no low p-values in the alt distribution + + +# @pytest.mark.notest +# def test_annotate(): +# # Lines +# alt_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/alt_line_dist_pvalues.csv') +# thresholds_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/line_organ_p_thresholds.csv') +# mutant_dir = Path('/home/neil/git/lama/tests/test_data/registration_test_data/mutant') +# +# thresholds = pd.read_csv(thresholds_file, index_col=0) +# alt = pd.read_csv(alt_file, index_col=0) +# +# run_permutation_stats.annotate(thresholds, alt, mutant_dir) +# +# # # Specimens +# alt_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/alt_specimen_dist_pvalues.csv') +# thresholds_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/specimen_organ_p_thresholds.csv') +# mutant_dir = Path('/home/neil/git/lama/tests/test_data/registration_test_data/mutant') +# +# thresholds = pd.read_csv(thresholds_file, index_col=0) +# alt = pd.read_csv(alt_file, index_col=0) +# +# run_permutation_stats.annotate(thresholds, alt, mutant_dir) + diff --git a/lama/tests/working_on/test_standard_stats.py b/lama/tests/test_standard_stats.py similarity index 98% rename from lama/tests/working_on/test_standard_stats.py rename to lama/tests/test_standard_stats.py index 0a79d9ad..7d0b7f4e 100644 --- a/lama/tests/working_on/test_standard_stats.py +++ b/lama/tests/test_standard_stats.py @@ -1,4 +1,5 @@ """ + Currently just running and making sure there's no uncaught exceptions. TODO: check that the correct output is generated too @@ -24,6 +25,7 @@ 'jacobians' ], normalise_organ_vol_to_mask=True, + use_log_jacobians = True, reg_folder='similarity', jac_folder='similarity', mask='mask_tight.nrrd', @@ -61,6 +63,10 @@ def make_config(config_updates={}): return make_config + + + +@pytest.mark.skip def test_all(get_config): """ Run the stats module. 
The data required for this to work must be initially made diff --git a/lama/tests/working_on/test_label_propagation.py b/lama/tests/working_on/test_label_propagation.py new file mode 100644 index 00000000..d36aa808 --- /dev/null +++ b/lama/tests/working_on/test_label_propagation.py @@ -0,0 +1,20 @@ +""" + +Usage: pytest -qq -m "not notest" test_data_generation.py +The use of -m "not notest" is to be able to omit certain tests with the @pytest.mark.notest decorator +""" + +from pathlib import Path +from lama.registration_pipeline import run_lama +import logzero + +import os +import shutil +import pytest +import logging +from lama.scripts import lama_job_runner +from lama.tests import (registration_root, mut_registration_dir, wt_registration_dir) + + + + diff --git a/lama/tests/working_on/test_lm_r.py b/lama/tests/working_on/test_lm_r.py new file mode 100644 index 00000000..8f9a24e8 --- /dev/null +++ b/lama/tests/working_on/test_lm_r.py @@ -0,0 +1,5 @@ +""" +The R script that runs the linear models +""" + + diff --git a/lama/tests/working_on/test_radiomics_scripts.py b/lama/tests/working_on/test_radiomics_scripts.py new file mode 100644 index 00000000..bc9ac149 --- /dev/null +++ b/lama/tests/working_on/test_radiomics_scripts.py @@ -0,0 +1,824 @@ + +from pathlib import Path + +import joblib + +from lama.lama_radiomics.radiomics import radiomics_job_runner +from lama import common +import os +from lama.common import cfg_load +from lama.img_processing import normalise +from logzero import logger as logging +import seaborn as sns +import pandas as pd +from sklearn.manifold import TSNE +import matplotlib.pyplot as plt +import pytest +import numpy as np +from lama.lama_radiomics import feature_reduction +from lama.scripts import lama_machine_learning +import pacmap +from lama.scripts import lama_permutation_stats +from lama.lama_radiomics import radiomics + +import SimpleITK as sitk +stats_cfg = Path( + "C:/LAMA/lama/tests/configs/permutation_stats/perm_no_qc.yaml") + +from lama.stats.permutation_stats.run_permutation_stats import get_radiomics_data + + +stats_cfg_v2 = Path( + "C:/LAMA/lama/tests/configs/permutation_stats/perm_no_qc_just_ovs.yaml") + + + +def test_denoising(): + file_path = Path("E:/220204_BQ_dataset/221218_BQ_run/registrations/rigid/flipped/200721_MPTLVo3_CT_4T1_Ms_D7_C1_002.nrrd") + + img = common.LoadImage(file_path).img + f_out = "E:/220204_BQ_dataset/221218_BQ_run/test.nrrd" + + result = radiomics.denoise(img) + sitk.WriteImage(result, f_out) + + + +def test_radiomics(): + cpath = Path('C:/LAMA/lama/tests/configs/lama_radiomics/radiomics_config.toml') + c = cfg_load(cpath) + + target_dir = Path(c.get('target_dir')) + + labs_of_int = c.get('labs_of_int') + + norm_methods = c.get('norm_methods') + + norm_label = c.get('norm_label') + + spherify = c.get('spherify') + + ref_vol_path = Path(c.get('ref_vol_path')) if c.get('ref_vol_path') is not None else None + + norm_dict = { + "histogram": normalise.IntensityHistogramMatch(), + "N4": normalise.IntensityN4Normalise(), + "subtraction": normalise.NonRegMaskNormalise(), + "none": None + } + try: + norm_meths = [norm_dict[str(x)] for x in norm_methods] + + except KeyError as e: + print(e) + + norm_meths = None + logging.info("Starting Radiomics") + radiomics_job_runner(target_dir, labs_of_int=labs_of_int, norm_method=normalise.IntensityHistogramMatch(), norm_label=norm_label, + spherify=spherify, ref_vol_path=ref_vol_path, make_job_file=True, scan_dir='imgs', tumour_dir='sphere_15', stage_dir='stage_labels') + + +def 
test_permutation_stats_just_ovs(): + """ + Run the whole permutation based stats pipeline. + Copy the output from a LAMA registrations test run, and increase or decrease the volume of the mutants so we get + some hits + + """ + lama_permutation_stats.run(stats_cfg_v2) + + + +def test_radiomic_plotting(): + _dir = Path("E:/220607_two_way/g_by_back_data/radiomics_output/features/") + + file_names = [spec for spec in common.get_file_paths(folder=_dir, extension_tuple=".csv")] + file_names.sort() + + data = [pd.read_csv(spec, index_col=0).dropna(axis=1) for spec in file_names] + + abnormal_embs = ['22300_e8','22300_e6', '50_e5'] + + for i, df in enumerate(data): + df.index.name = 'org' + df.name = str(file_names[i]).split(".")[0].split("/")[-1] + df['genotype'] = 'HET' if 'het' in str(file_names[i]) else 'WT' + df['background'] = 'C57BL6N' if (('b6ku' in str(file_names[i]))|('BL6' in str(file_names[i]))) else \ + 'F1' if ('F1' in str(file_names[i])) else 'C3HHEH' + + df['HPE'] = 'abnormal' if any(map(str(file_names[i]).__contains__, abnormal_embs)) else 'normal' + + data = pd.concat( + data, + ignore_index=False, keys=[os.path.splitext(os.path.basename(spec))[0] for spec in file_names], + names=['specimen', 'org']) + + line_file = _dir.parent / "full_results.csv" + + org_dir =_dir.parent / "organs" + + os.makedirs(org_dir, exist_ok=True) + print(data.columns) + + for org in data.index.get_level_values('org').unique(): + data[data.index.get_level_values('org') == org].to_csv(str(org_dir)+"/results_" + str(org)+ ".csv") + + data.to_csv(line_file) + + data_subset = data.select_dtypes(include=np.number) + + data_subset = data_subset.apply(lambda x: (x - x.mean()) / x.std(), axis=0) + data_subset = data_subset.apply(lambda x: (x - x.mean()) / x.std(), axis=1) + + embedding = pacmap.PaCMAP(n_components=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0, num_iters=20000, verbose=1) + + + + #print(data_subset.dropna(axis='columns')) + + results = embedding.fit_transform(data_subset.dropna(axis='columns')) + + color_class = data.index.get_level_values('org') + + # fig, ax = plt.subplots(figsize=[55, 60]) + # cluster.tsneplot(score=tsne_results, show=True, theme='dark', colorlist=color_class) + + data['PaCMAP-2d-one'] = results[:, 0] + data['PaCMAP-2d-two'] = results[:, 1] + data['org'] = data.index.get_level_values('org') + data['specimen'] = data.index.get_level_values('specimen') + data['condition'] = data['genotype'] + "_" + data['background'] + + + fig, ax = plt.subplots(figsize=[56, 60]) + #data = data[data['condition'] == 'WT_C3HHEH'] + + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=data, + #col_order=['normal', 'abnormal'], + col='condition', + col_wrap=2, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two'])-10, np.max(data['PaCMAP-2d-two'])+10), + xlim=(np.min(data['PaCMAP-2d-one'])-10, np.max(data['PaCMAP-2d-one'])+10)) + + + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_all_cond_v2.png") + plt.close() + + fig, ax = plt.subplots(figsize=[56, 60]) + wt_c3h_data = data[data['condition'] == 'WT_C3HHEH'] + + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=wt_c3h_data, + # col_order=['normal', 'abnormal'], + col='org', + col_wrap=5, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + 
plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_C3H_wt_org_v2.png") + plt.close() + + fig, ax = plt.subplots(figsize=[56, 60]) + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=wt_c3h_data, + # col_order=['normal', 'abnormal'], + #col='specimen', + #col_wrap=5, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_C3H_map_v2.png") + plt.close() + + het_c3h_data = data[data['condition'] == 'HET_C3HHEH'] + fig, ax = plt.subplots(figsize=[56, 60]) + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=het_c3h_data, + # col_order=['normal', 'abnormal'], + col='specimen', + col_wrap=5, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_C3H_hets_v2.png") + plt.close() + + wt_b6_data = data[data['condition'] == 'WT_C57BL6N'] + fig, ax = plt.subplots(figsize=[56, 60]) + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=wt_b6_data, + # col_order=['normal', 'abnormal'], + #col='specimen', + #col_wrap=5, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_b6_map_v2.png") + plt.close() + + wt_f1_data = data[data['condition'] == 'WT_C57BL6N'] + fig, ax = plt.subplots(figsize=[56, 60]) + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=wt_f1_data, + # col_order=['normal', 'abnormal'], + # col='specimen', + # col_wrap=5, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_f1_map_v2.png") + plt.close() + + fig, ax = plt.subplots(figsize=[56, 60]) + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=wt_b6_data, + # col_order=['normal', 'abnormal'], + col='specimen', + col_wrap=5, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_b6_specs_v2.png") + plt.close() + + het_b6_data = data[data['condition'] == 'HET_C57BL6N'] + fig, ax = plt.subplots(figsize=[56, 60]) + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=het_b6_data, + # col_order=['normal', 'abnormal'], + col='specimen', + col_wrap=5, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + 
plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_b6_hets_v2.png") + plt.close() + + het_f1_data = data[data['condition'] == 'F1'] + fig, ax = plt.subplots(figsize=[56, 60]) + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=het_f1_data, + # col_order=['normal', 'abnormal'], + col='specimen', + col_wrap=5, + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_f1_hets_v2.png") + plt.close() + + fig, ax = plt.subplots(figsize=[56, 60]) + g = sns.lmplot( + x="PaCMAP-2d-one", y="PaCMAP-2d-two", + data=data, + col_order=['normal', 'abnormal'], + col='HPE', + row='condition', + hue="org", + palette='husl', + fit_reg=False) + g.set(ylim=(np.min(data['PaCMAP-2d-two']) - 10, np.max(data['PaCMAP-2d-two']) + 10), + xlim=(np.min(data['PaCMAP-2d-one']) - 10, np.max(data['PaCMAP-2d-one']) + 10)) + + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/features/radiomics_2D_PaCMAP_HPE_v2.png") + plt.close() + + + + #fig, ax = plt.subplots((len(data.index.levels[-1]) // 5) + 1, 5, figsize=[60, 80], sharex=True, sharey=True) + + #for i, row in enumerate(data.index.levels[-1]): + # fig, ax = plt.subplots(figsize=[27, 30]) + # print(row) + # sns.lmplot( + # x="tsne-3d-one", y="tsne-2d-two", + # hue="org", + # col="condition", + # palette="husl", + # data=data.loc[row], + # legend="full", + # fit_reg=False, + # legend_out=True) + # file_name = "E:/220606_two_way/g_by_back_data/radiomics_output/sample_features/" + str(row) + ".png" + # plt.savefig(file_name) + # plt.close() + + # sns.relplot( + # x="tsne-3d-one", y="tsne-2d-two", + # data=data, + # hue="org", + # palette='husl', + # alpha=-1.3, + # ax=ax[(i+0)//5, (i+1)%5]) + + + #fig.savefig("E:/220720_Amrit_radiomics/radiomics_2D_tsne_overlay.png") + #plt.close() + + # remove diagnostics + # data.index = data['specimen'] + # print(data.index.str.rsplit('_', 1)) + # data = data[data.columns.drop(list(data.filter(regex="diagnostics")))] + + # _metadata = pd.DataFrame(data.index.str.rsplit('_', 1)) + + # print(_metadata) + + # _metadata[['Embryo','Genotype']] = pd.DataFrame(_metadata.specimen.tolist(), index=_metadata.index) + + # print(_metadata) + + # _metadata = _metadata.drop(columns=['specimen']) + + # _metadata.reset_index(inplace=True, drop=True) + # data.reset_index(inplace=True, drop=True) + + # data=data.drop(columns=['specimen']) + + # print(data) + # umap_organs(data, Path("E:/Bl5_data/211014_g_by_back/umap.png"), _metadata=_metadata) + + + data.drop(['org', 'condition', 'HPE'], axis=1, inplace=True) + + data = data.select_dtypes(include=np.number) + + #data = data.apply(lambda x: (x - x.mean()) / x.std(), axis=0) + + data = data.apply(lambda x: (x - x.mean()) / x.std(), axis=1) + + + #data = data.apply(lambda x: (x - x.mean()) / x.std(), axis=0) + + #sns.set(font_scale=0.5) + + print("Data after drop", data) + + print(data.index.get_level_values('org'), len(data.index.get_level_values('org'))) + + #data = data[~np.isin(data.index.get_level_values('org'), 27.0)] + + print(data) + data.drop(27.0, level=1, axis=0, inplace=True) + print(data, any(np.isin(data.index.get_level_values('org'), 27.0))) + print(data.index.get_level_values('org')) + + for i, org in enumerate(data.index.levels[1]): + fig, ax = plt.subplots(figsize=[14, 15]) + 
#sns.set(font_scale=0.5) + o_data = data[np.isin(data.index.get_level_values('org'), org)] + o_data.dropna(axis=1, inplace=True) + + if org == 27.0: + continue + + print(org) + + o_data.drop(o_data.std()[(o_data.std() == 0)].index, axis=1, inplace=True) + + + #o_data.to_csv("E:/org_df.csv") + + #import scipy.spatial.distance as ssd + + + sns.clustermap(o_data.T, + figsize=[148.5, 210], + dendrogram_ratio=0.1, + colors_ratio=0.1, + #z_score=0, + metric="correlation", + #cmap=sns.diverging_palette(250, 15, l=70, s=400, sep=40, n=512, center="light", as_cmap=True), + #cbar_kws={'Genotype': 'Background'}, + square=True, + xticklabels=True, + yticklabels=True) + plt.tight_layout() + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/wt_C3H_heatmap_"+str(org)+".png") + plt.close() + + +def test_BQ_concat(): + _dir = Path("Z:/jcsmr/ROLab/Experimental data/Radiomics/Workflow design and trial results/Kyle Drover analysis/220617_BQ_norm_stage_full/sub/sub_normed_features.csv") + #_dir = Path("E:/220913_BQ_tsphere/inputs/features/") + + # file_names = [spec for spec in common.get_file_paths(folder=_dir, extension_tuple=".csv")] + # file_names.sort() + # + # data = [pd.read_csv(spec, index_col=0).dropna(axis=1) for spec in file_names] + # + # data = pd.concat( + # data, + # ignore_index=False, keys=[os.path.splitext(os.path.basename(spec))[0] for spec in file_names], + # names=['specimen', 'label']) + # + # data['specimen'] = data.index.get_level_values('specimen') + # + # _metadata = data['specimen'].str.split('_', expand=True) + # + # + # + # _metadata.columns = ['Date', 'Exp', 'Contour_Method', 'Tumour_Model', 'Position', 'Age', + # 'Cage_No.', 'Animal_No.'] + # + # + # + # + # _metadata.reset_index(inplace=True, drop=True) + # data.reset_index(inplace=True, drop=True) + # features = pd.concat([_metadata, data], axis=1) + # + # features.index.name = 'scanID' + # + # print(features) + # + # print(str(_dir.parent / "full_results.csv")) + # + # features.to_csv(str(_dir.parent / "full_results.csv")) + + features = pd.read_csv(_dir) + features = features[features.columns.drop(list(features.filter(regex="diagnostics")))] + features.drop(["scanID"], axis=1, inplace=True) + feature_reduction.main(features, org = None, rad_file_path = Path(_dir.parent / "full_results.csv")) + +def test_BQ_mach_learn(): + _dir = Path("C:/test/features/") + + file_names = [spec for spec in common.get_file_paths(folder=_dir, extension_tuple=".csv")] + file_names.sort() + + data = [pd.read_csv(spec, index_col=0).dropna(axis=1) for spec in file_names] + + data = pd.concat( + data, + ignore_index=False, keys=[os.path.splitext(os.path.basename(spec))[0] for spec in file_names], + names=['specimen', 'label']) + + data['specimen'] = data.index.get_level_values('specimen') + + _metadata = data['specimen'].str.split('_', expand=True) + + + + _metadata.columns = ['Date', 'Exp', 'Contour_Method', 'Tumour_Model', 'Position', 'Age', + 'Cage_No.', 'Animal_No.'] + + + + + _metadata.reset_index(inplace=True, drop=True) + data.reset_index(inplace=True, drop=True) + features = pd.concat([_metadata, data], axis=1) + + features.index.name = 'scanID' + + print(features) + + print(str(_dir.parent / "full_results.csv")) + + features.to_csv(str(_dir.parent / "full_results.csv")) + + feature_reduction.main(features, org = None, rad_file_path = Path(_dir.parent / "full_results.csv")) + + +def test_BQ_mach_learn_non_tum(): + _dir = Path("E:/220919_non_tum/features/") + + file_names = [spec for spec in common.get_file_paths(folder=_dir, 
extension_tuple=".csv")] + file_names.sort() + + data = [pd.read_csv(spec, index_col=0).dropna(axis=1) for spec in file_names] + + data = pd.concat( + data, + ignore_index=False, keys=[os.path.splitext(os.path.basename(spec))[0] for spec in file_names], + names=['specimen', 'label']) + + data['specimen'] = data.index.get_level_values('specimen') + + _metadata = data['specimen'].str.split('_', expand=True) + + + + _metadata.columns = ['Date', 'Exp', 'Contour_Method', 'Tumour_Model', 'Position', 'Age', + 'Cage_No.', 'Animal_No.'] + + + + + _metadata.reset_index(inplace=True, drop=True) + data.reset_index(inplace=True, drop=True) + features = pd.concat([_metadata, data], axis=1) + + features.index.name = 'scanID' + + print(features) + + print(str(_dir.parent / "full_results.csv")) + + features.to_csv(str(_dir.parent / "full_results.csv")) + + feature_reduction.main(features, org = None, rad_file_path = Path(_dir.parent / "full_results.csv")) + + + +def test_BQ_mach_learn_batch_sp(): + _dir = Path("E:/220913_BQ_tsphere/inputs/features/") + + file_names = [spec for spec in common.get_file_paths(folder=_dir, extension_tuple=".csv")] + file_names.sort() + + data = [pd.read_csv(spec, index_col=0).dropna(axis=1) for spec in file_names] + + data = pd.concat( + data, + ignore_index=False, keys=[os.path.splitext(os.path.basename(spec))[0] for spec in file_names], + names=['specimen', 'label']) + + data['specimen'] = data.index.get_level_values('specimen') + + _metadata = data['specimen'].str.split('_', expand=True) + + + + _metadata.columns = ['Date', 'Exp', 'Contour_Method', 'Tumour_Model', 'Position', 'Age', + 'Cage_No.', 'Animal_No.'] + + + + + _metadata.reset_index(inplace=True, drop=True) + data.reset_index(inplace=True, drop=True) + features = pd.concat([_metadata, data], axis=1) + + features.index.name = 'scanID' + + print(features) + + print(str(_dir.parent / "full_results.csv")) + + features.to_csv(str(_dir.parent / "full_results.csv")) + + feature_reduction.main(features, org = None, rad_file_path = Path(_dir.parent / "full_results.csv"), batch_test=True) + + +def test_BQ_concat_batch(): + _dir = Path("Z:/jcsmr/ROLab/Experimental data/Radiomics/Workflow design and trial results/Kyle Drover analysis/220617_BQ_norm_stage_full/sub_normed_features.csv") + #_dir = Path("E:/220913_BQ_tsphere/inputs/features/") + + # file_names = [spec for spec in common.get_file_paths(folder=_dir, extension_tuple=".csv")] + # file_names.sort() + # + # data = [pd.read_csv(spec, index_col=0).dropna(axis=1) for spec in file_names] + # + # data = pd.concat( + # data, + # ignore_index=False, keys=[os.path.splitext(os.path.basename(spec))[0] for spec in file_names], + # names=['specimen', 'label']) + # + # data['specimen'] = data.index.get_level_values('specimen') + # + # _metadata = data['specimen'].str.split('_', expand=True) + # + # + # + # _metadata.columns = ['Date', 'Exp', 'Contour_Method', 'Tumour_Model', 'Position', 'Age', + # 'Cage_No.', 'Animal_No.'] + # + # + # + # + # _metadata.reset_index(inplace=True, drop=True) + # data.reset_index(inplace=True, drop=True) + # features = pd.concat([_metadata, data], axis=1) + # + # features.index.name = 'scanID' + # + # print(features) + # + # print(str(_dir.parent / "full_results.csv")) + # + # features.to_csv(str(_dir.parent / "full_results.csv")) + + features = pd.read_csv(_dir) + features = features[features.columns.drop(list(features.filter(regex="diagnostics")))] + features.drop(["scanID"], axis=1, inplace=True) + feature_reduction.main(features, org = None, 
rad_file_path = Path(_dir.parent / "full_results.csv"), batch_test=True) + + + + + + +@pytest.mark.skip +def test_feat_reduction(): + feature_reduction.main() + +def test_mach_learn_pipeline(): + lama_machine_learning.ml_job_runner("E:/230129_bq_tester/norm_methods/", n_sample=True) + + +@pytest.mark.skip +def test_radiomic_org_plotting(): + _dir = Path("E:/220607_two_way/g_by_back_data/radiomics_output/features/") + + file_names = [spec for spec in common.get_file_paths(folder=_dir, extension_tuple=".csv")] + file_names.sort() + + data = [pd.read_csv(spec, index_col=0).dropna(axis=1) for spec in file_names] + + abnormal_embs = ['22300_e8', '22300_e6', '50_e5'] + + for i, df in enumerate(data): + df.index.name = 'org' + df.name = str(file_names[i]).split(".")[0].split("/")[-1] + df['genotype'] = 'HET' if 'het' in str(file_names[i]) else 'WT' + df['background'] = 'C56BL6N' if (('b6ku' in str(file_names[i])) | ('BL6' in str(file_names[i]))) else 'C3HHEH' + df['HPE'] = 'abnormal' if any(map(str(file_names[i]).__contains__, abnormal_embs)) else 'normal' + + data = pd.concat( + data, + ignore_index=False, keys=[os.path.splitext(os.path.basename(spec))[0] for spec in file_names], + names=['specimen', 'org']) + + line_file = _dir.parent / "full_results.csv" + + data.to_csv(line_file) + + #data_subset = data.select_dtypes(include=np.number) + + for i, org in enumerate(data.index.levels[1]): + fig, ax = plt.subplots(1, 1, figsize=[56, 60]) + #sns.set(font_scale=0.5) + o_data = data[np.isin(data.index.get_level_values('org'), org)] + + o_data_subset = o_data.select_dtypes(include=np.number) + #o_data_subset = o_data_subset.apply(lambda x: (x - x.mean()) / x.std(), axis=0) + o_data_subset = o_data_subset.apply(lambda x: (x - x.mean()) / x.std(), axis=1) + + tsne = TSNE(perplexity=30, + n_components=2, + random_state=0, + early_exaggeration=250, + n_iter=1000, + verbose=1) + + tsne_results = tsne.fit_transform(o_data_subset.dropna(axis='columns')) + + o_data['tsne-2d-one'] = tsne_results[:, 0] + o_data['tsne-2d-two'] = tsne_results[:, 1] + o_data['org'] = o_data.index.get_level_values('org') + o_data['specimen'] = o_data.index.get_level_values('specimen') + + o_data['condition'] = o_data['genotype'] + "_" + o_data['background'] + + fig, ax = plt.subplots() + o_data = o_data[o_data['condition'] == 'WT_C3HHEH'] + g = sns.lmplot( + x="tsne-2d-one", y="tsne-2d-two", + data=o_data, + # col_order=['WT_C3HHEH','HET_C3HHEH','WT_C57BL6N','HET_C57BL6N'], + #col='specimen', + #col_wrap=5, + hue="specimen", + palette='husl', + fit_reg=False) + + def label_point(x, y, val, ax): + a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1) + for i, point in a.iterrows(): + ax.text(point['x'] + .02, point['y'], str(point['val']), fontsize='xx-small') + + label_point(o_data['tsne-2d-one'], o_data['tsne-2d-two'], o_data['specimen'], plt.gca()) + + + + #g.set(ylim=(np.min(o_data['tsne-2d-two']) - 10, np.max(o_data['tsne-2d-two']) + 10), + # xlim=(np.min(o_data['tsne-2d-one']) - 10, np.max(o_data['tsne-2d-one']) + 10)) + plt.savefig("E:/220607_two_way/g_by_back_data/radiomics_output/radiomics_2D_tsne_C3H_wt_" + str(org) + ".png") + plt.close() + + +def test_get_rad_data_for_perm(): + _dir = Path("E:/221122_two_way/g_by_back_data/radiomics_output") + + wt_dir = Path("E:/221122_two_way/g_by_back_data/baseline") + + mut_dir = Path("E:/221122_two_way/g_by_back_data/mutants") + + treat_dir = Path("E:/221122_two_way/g_by_back_data/treatment") + + inter_dir = Path("E:/221122_two_way/g_by_back_data/mut_treat") + + results = 
get_radiomics_data(_dir, wt_dir, mut_dir, treat_dir, inter_dir) + + results.to_csv(str(_dir/"test_dataset.csv")) + +def test_permutation_stats(): + """ + Run the whole permutation based stats pipeline. + Copy the output from a LAMA registrations test run, and increase or decrease the volume of the mutants so we get + some hits + + """ + lama_permutation_stats.run(stats_cfg) + +def test_sns_clustermap(): + url = "https://raw.githubusercontent.com/dorkylever/LAMA/master/lama/tests/clustermap_data.csv" + X = pd.read_csv(url, index_col=0) + X.dropna(how='all', inplace=True) + X.fillna(1, inplace=True) + # replace missing values with 0 + # replace infinite values with 0 + + print("Missing values:", X.isnull().sum().sum()) + print("Infinite values:", np.isposinf(X).sum().sum() + np.isneginf(X).sum().sum()) + + # mean_data = X.apply(np.mean, axis=1) + + # std_data = X.apply(np.std, axis=1) + + # constant_rows = X.apply(lambda row: row.nunique() == 1, axis=1) + + # X = X[~constant_rows] + + # na_mean = mean_data[mean_data.isna().any()] + + # na_std = std_data[std_data.iszero().any()] + + cg = sns.clustermap(X, + metric="euclidean", + cmap=sns.diverging_palette(250, 15, l=70, s=400, sep=1, n=512, center="light", + as_cmap=True), + cbar_kws={'label': 'mean volume ratio'}, square=True, + center=1, + figsize=[30, len(X) * 0.3]) + + # Calculate the mean and standard deviation of each variable + # X.columns = [col.rsplit('_', 3)[-1] for col in X.columns] + + plt.tight_layout() + + plt.savefig("E:/221122_two_way/permutation_stats/rad_perm_output/two_way/easy_fig.png") + plt.close() + + X = X.to_numpy() + + print(np.isnan(X).any() | np.isinf(X).any()) + # + print(X) + mu = np.mean(X, axis=0) + sigma = np.std(X, axis=0) + # + # + print(np.all(sigma)) + # + # # Calculate the z-score matrix + Z = (X - mu) / sigma + print(Z) + # + # # Calculate the Euclidean distance matrix + d = np.zeros((Z.shape[0], Z.shape[0])) + for i in range(Z.shape[0]): + for j in range(Z.shape[0]): + d[i, j] = np.sqrt(np.sum((Z[i, :] - Z[j, :]) ** 2)) + # + print(d) + print(np.isnan(d).any() | np.isinf(d).any()) \ No newline at end of file diff --git a/lama/tests/working_on/test_run_permutation_stats.py b/lama/tests/working_on/test_run_permutation_stats.py deleted file mode 100644 index a5b5d130..00000000 --- a/lama/tests/working_on/test_run_permutation_stats.py +++ /dev/null @@ -1,119 +0,0 @@ -""" -Test the permutation-based stats pipeline - -Usage ------ - -These functions test the lama registration pipeline permutation stats module - -Usage: pytest -q -x -s -m "not notest" --tb=short test_run_permutation_stats.py -""" - -import shutil - -import pytest - -from lama.stats.permutation_stats import run_permutation_stats -from lama.tests import (test_data_root, registration_root, wt_registration_dir, mut_registration_dir, - permutation_stats_dir, qc_flags_dir) -from lama.common import LamaDataException - - -@pytest.fixture(scope="session", autouse=True) -def remove_previous_output(): - """ - Remove previous output from this test module. We do not do this after each test as manual inspection of the output - may be necessary. this runs before all other tests. - """ - shutil.rmtree(permutation_stats_dir, ignore_errors=True) - permutation_stats_dir.mkdir() - - -# @pytest.mark.notest -def test_permutation_stats(): - """ - Run the whole permutation based stats pipeline. - Currently this just checks to see if the pipeline completes without any errors. 
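# [Editor's aside, not part of the diff] The nested loops at the end of test_sns_clustermap
# above build a pairwise Euclidean distance matrix from the z-scored array Z. scipy computes
# the same matrix in one call; a sketch, assuming Z is an (n_samples, n_features) ndarray:
import numpy as np
from scipy.spatial.distance import pdist, squareform

def euclidean_distance_matrix(Z: np.ndarray) -> np.ndarray:
    # pdist returns the condensed distance vector; squareform expands it to an n x n matrix
    return squareform(pdist(Z, metric='euclidean'))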
- """ - outdir = permutation_stats_dir / 'output' - outdir.mkdir() - num_perms = 5 # Would do 1000 or more normally - label_info = registration_root / 'target' / 'label_info.csv' - label_map = registration_root / 'target' / 'labels.nrrd' - - run_permutation_stats.run(wt_registration_dir, mut_registration_dir, outdir, num_perms, - label_info=label_info, label_map_path=label_map) - -@pytest.mark.notest -def test_permutation_stats_with_qc_flaggs(): - """ - Run the permutations stats but include a specimen/organ-level qc file to exclude qc-flagged organs - """ - num_perms = 5 # Would do 1000 or more normally - label_info = registration_root / 'target' / 'label_info.csv' - label_map = registration_root / 'target' / 'labels.nrrd' - - for qc_file in qc_flags_dir.iterdir(): - - out_dir = permutation_stats_dir / qc_file.stem # Intermediate results go here. Permutation distributions etc. - out_dir.mkdir() - - if 'error' in str(qc_file): - # These qc flag files with errors in should raise a LamaDataError - with pytest.raises(LamaDataException): - run_permutation_stats.run(wt_registration_dir, mut_registration_dir, out_dir, num_perms, - label_info=label_info, label_map_path=label_map, qc_file=qc_file) - - else: - run_permutation_stats.run(wt_registration_dir, mut_registration_dir, out_dir, num_perms, - label_info=label_info, label_map_path=label_map, qc_file=qc_file) - - -# @pytest.mark.notest -# def test_p_thresholds(): -# """ -# Testing the p_thresholds calculation -# ------- -# TODO: Add more tests for different cases -# """ -# import copy -# import pandas as pd -# -# # These simulate null distributions from 100 baselines for two organs -# null = pd.DataFrame.from_dict({ -# '1': np.linspace(0.01, 1, 100), -# '2': np.linspace(0.01, 1, 100)}) -# -# # These simulate alternative distributions for two organs from 40 lines -# alt = pd.DataFrame.from_dict({ -# '1': np.linspace(0.00000001, 0.1, 40), -# '2': np.linspace(0.9, 1, 40)}) -# -# thresh = p_thresholds.get_thresholds(null, alt) -# -# assert thresh.loc[1, 'p_thresh'] == 0.02 # Gives a p-value threshold of 0.02 -# assert thresh.loc[2, 'p_thresh'] == 1.0 # Gives a p-value threshold of 1.0 as there are no low p-values in the alt distribution - - -# @pytest.mark.notest -# def test_annotate(): -# # Lines -# alt_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/alt_line_dist_pvalues.csv') -# thresholds_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/line_organ_p_thresholds.csv') -# mutant_dir = Path('/home/neil/git/lama/tests/test_data/registration_test_data/mutant') -# -# thresholds = pd.read_csv(thresholds_file, index_col=0) -# alt = pd.read_csv(alt_file, index_col=0) -# -# run_permutation_stats.annotate(thresholds, alt, mutant_dir) -# -# # # Specimens -# alt_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/alt_specimen_dist_pvalues.csv') -# thresholds_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/specimen_organ_p_thresholds.csv') -# mutant_dir = Path('/home/neil/git/lama/tests/test_data/registration_test_data/mutant') -# -# thresholds = pd.read_csv(thresholds_file, index_col=0) -# alt = pd.read_csv(alt_file, index_col=0) -# -# run_permutation_stats.annotate(thresholds, alt, mutant_dir) - diff --git a/lama/tests/working_on/test_two_way_perm.py b/lama/tests/working_on/test_two_way_perm.py new file mode 100644 index 00000000..f41bf15f --- /dev/null +++ 
b/lama/tests/working_on/test_two_way_perm.py @@ -0,0 +1,329 @@ +# from lama.stats.permutation_stats.run_permutation_stats import run +import pytest +from pathlib import Path +from lama.scripts import lama_permutation_stats +from lama.stats.linear_model import lm_sm +from lama.stats.permutation_stats.run_permutation_stats import * +import lama.scripts.lama_permutation_stats + +import random +from pathlib import Path +from datetime import date +import itertools +import pandas as pd +import numpy as np +from scipy.stats import zmap +from logzero import logger as logging +import yaml + +from lama import common +from lama.stats.permutation_stats import distributions +from lama.stats.permutation_stats import p_thresholds +from lama.paths import specimen_iterator, get_specimen_dirs, LamaSpecimenData +from lama.qc.organ_vol_plots import make_plots, pvalue_dist_plots +from lama.common import write_array, read_array, init_logging, git_log, LamaDataException +from lama.stats.common import cohens_d +from lama.stats.penetrence_expressivity_plots import heatmaps_for_permutation_stats +from lama.stats.permutation_stats import run_permutation_stats + +from lama.stats.permutation_stats.distributions import two_way_max_combinations, recursive_comb_maker, null_line +from lama.tests import (out_dir, wt_dir, mut_dir, treat_dir, inter_dir, label_meta, n_perm) + +cfg = Path( + "C:/LAMA/lama/tests/configs/standard_stats/generate_data.toml") + +stats_cfg = Path( + "C:/LAMA/lama/tests/configs/permutation_stats/perm_no_qc.yaml") + + +@pytest.mark.skip +def test_data_prep(two_way=True): + """Just test if I can normalise stuff properly""" + np.random.seed(999) + init_logging(out_dir / 'stats.log') + logging.info(git_log()) + logging.info(f'Running {__name__} with following commands\n{common.command_line_agrs()}') + + logging.info('Searching for staging data') + wt_staging = get_staging_data(wt_dir) + mut_staging = get_staging_data(mut_dir) + + logging.info('searching for organ volume data') + wt_organ_vol = get_organ_volume_data(wt_dir) + mut_organ_vol = get_organ_volume_data(mut_dir) + if two_way: + logging.info('Searching for two-way staging and organ volume data') + treat_staging = get_staging_data(treat_dir) + inter_staging = get_staging_data(inter_dir) + treat_organ_vol = get_organ_volume_data(treat_dir) + inter_organ_vol = get_organ_volume_data(inter_dir) + two_way_data = [treat_staging, treat_organ_vol, + inter_staging, inter_organ_vol] + + data = prepare_data(wt_organ_vol, + wt_staging, + mut_organ_vol, + mut_staging, + label_meta=label_meta, + normalise_to_whole_embryo=True, + qc_file=None, + two_way=two_way, + two_way_data=two_way_data) + + # So pandas is a bit stupid as read/write of the file is causing type errors + # How about we just look at + print("Comparison") + print(data) + data.to_csv() + #good_data = pd.read_csv('E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/input_data.csv', index_col=0) + # check that the data is the same as a checked csv file + #print(good_data) + + # print(data.astype(int).equals(data.astype(int))) + + +@pytest.mark.skip +def test_max_two_way_combinations(): + combs = two_way_max_combinations(num_wts=16) + + assert combs == 63063000 + + +@pytest.mark.skip +def test_loop_iteration(): + lst = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + fin_results = [] + results = [] + + for perm in range(10): + full_combination = recursive_comb_maker(lst, 3, i=1, recurs_results=[]) + # print(full_combination) + print(full_combination[1]) + 
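# The expected count in test_max_two_way_combinations above (63,063,000) equals the
# multinomial coefficient 16! / (4!)^4, i.e. the number of ways to split 16 wild types
# into four equal groups of four. A quick arithmetic check (sketch; the real
# two_way_max_combinations may compute this differently):
#
#     import math
#     expected = math.comb(16, 4) * math.comb(12, 4) * math.comb(8, 4) * math.comb(4, 4)
#     assert expected == 63_063_000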
fin_results.append(full_combination) + # reset result list + results = [] + print(fin_results[1]) + + +@pytest.mark.skip +def test_generate_two_way_combinations(): + data = pd.read_csv('E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/input_data.csv', index_col=0) + # can't really test this as of yet + wt_indx_combinations = distributions.generate_random_two_way_combinations(data, 20) + return wt_indx_combinations + + +@pytest.mark.skip +def test_lm_sm(): + data = pd.read_csv("C:/Users/u5823099/Anaconda3/Lib/site-packages/lama/LAMA/lama/tests/working_on/data.csv") + info = pd.read_csv("C:/Users/u5823099/Anaconda3/Lib/site-packages/lama/LAMA/lama/tests/working_on/treat_info.csv") + data = data.to_numpy() + p, t = lm_sm(data=data, info=info, use_staging=True, two_way=True) + print("p = ", p) + print("t = ", t) + + +@pytest.mark.skip +def test_two_way_null_line(): + data = pd.read_csv('E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/input_data.csv', index_col=0) + # can't really test this as of yet + baselines = data[data['line'] == 'baseline'] + wt_indx_combinations = distributions.generate_random_two_way_combinations(data, 20) + results = null_line(wt_indx_combinations, baselines, num_perms=20, two_way=True) + + print(type(results['x3']), results['x3'][0][0], type(results['x3'][0][0])) + + + +def test_two_way_null(): + data = pd.read_csv('E:/221207_gina_perm/perm_stats/perm_output/test_dataset.csv', index_col=0) + + group_info = data['line'] + print("group info:", group_info) + # TODO: think whether to truly put mut_treat in main comparisons + mut_names = group_info[(group_info == 'mutants') | (group_info == 'mut_treat')].index + treat_names = group_info[(group_info == 'treatment') | (group_info == 'mut_treat')].index + print(type(mut_names), mut_names) + + + line_null, specimen_null = distributions.null(input_data=data, num_perm=3, two_way=True) + print(type(line_null['3'][0]), line_null['3'][0][0], type(line_null['3'][0][0])) + print(type(specimen_null['22'][0]), specimen_null['22'][0][0], type(specimen_null['22'][0][0])) + + + +def test_two_way_alt(): + data = pd.read_csv('E:/221207_gina_perm/perm_stats/perm_output/input_data.csv', index_col=0) + line_alt, spec_alt, line_alt_t, spec_alt_t = distributions.alternative(data, two_way=True) + + group_info = data['line'] + print("group info:", group_info) + # TODO: think whether to truly put mut_treat in main comparisons + mut_names = group_info[(group_info == 'mutants') | (group_info == 'mut_treat')].index + treat_names = group_info[(group_info == 'treatment') | (group_info == 'mut_treat')].index + print(type(mut_names), mut_names) + + + specimen_inter_alt = spec_alt[spec_alt['3'].str.len() == 3] + specimen_main_alt = spec_alt[spec_alt['3'].str.len() == 1] + + specimen_geno_alt = specimen_main_alt[specimen_main_alt.index.isin(mut_names)] + specimen_treat_alt = specimen_main_alt[specimen_main_alt.index.isin(treat_names)] + + print(specimen_geno_alt) + print(specimen_treat_alt) + + + + + + + +def test_two_way_p_thresholds(): + """ + Testing the p_thresholds calculation + ------- + TODO: Add more tests for different cases + """ + + def strip_x(dfs): + for df in dfs: + df.columns = [x.strip('x') for x in df.columns] + + return df + + data = pd.read_csv('E:/221207_gina_perm/perm_stats/perm_output/input_data.csv', index_col=0) + + baselines = data[data['line'] == 'baseline'] + wt_indx_combinations = distributions.generate_random_two_way_combinations(data, 100) + results = null_line(wt_indx_combinations, baselines, 
num_perms=100, two_way=True) + line_null = strip_x([results]) + + # line_null, specimen_null = distributions.null(input_data=data, num_perm=10, two_way=True) + + line_alt, spec_alt, line_alt_t, spec_alt_t = distributions.alternative(data, two_way=True) + + thresh = p_thresholds.get_thresholds(line_null, line_alt, two_way=True) + + thresh.to_csv('E:/221207_gina_perm/perm_stats/perm_output/spec_out_threshs.csv') + + +def test_math_import(): + import math + from functools import reduce + print(math.prod([1,2,3,4])) + print(reduce(lambda x, y: x * y, [1,2,3,4])) + +@pytest.mark.skip +def test_two_spec_thresholds(): + two_way = True + data = pd.read_csv('E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/input_data.csv', index_col=0) + line_null, specimen_null = distributions.null(input_data=data, num_perm=3, two_way=True) + + line_alt, spec_alt, line_alt_t, spec_alt_t = distributions.alternative(data, two_way=True) + + # TODO: Don't hard-code this + specimen_inter_nulls = specimen_null[specimen_null['3'].str.len() == 3] + + specimen_main_nulls = specimen_null[specimen_null['3'].str.len() == 1] + specimen_geno_nulls, specimen_treat_nulls = np.vsplit(specimen_main_nulls, 2) + + specimen_inter_alt = spec_alt[spec_alt['3'].str.len() == 3] + specimen_main_alt = spec_alt[spec_alt['3'].str.len() == 1] + + # TODO: Don't hard-code this + + specimen_geno_alt = specimen_main_alt[specimen_main_alt.index.str.contains("het")] + specimen_treat_alt = specimen_main_alt[specimen_main_alt.index.str.contains("b6ku")] + + geno_thresholds = p_thresholds.get_thresholds(specimen_geno_nulls, specimen_geno_alt, two_way=two_way) + treat_thresholds = p_thresholds.get_thresholds(specimen_treat_nulls, specimen_treat_alt, two_way=two_way) + inter_thresholds = p_thresholds.get_thresholds(specimen_inter_nulls, specimen_inter_alt, two_way=two_way) + +@pytest.mark.skip +def test_line_annotate(): + # Lines + alt_file = Path('E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/distributions/specimen_geno_pvals.csv') + thresholds_file = Path( + 'E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/distributions/specimen_geno_p_thresholds.csv') + cond_dir = Path('E:/Bl6_data/211014_g_by_back') + data = pd.read_csv('E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/input_data.csv', index_col=0) + thresholds = pd.read_csv(thresholds_file, index_col=0) + alt = pd.read_csv(alt_file, index_col=0) + + alt = alt.applymap(lambda x: np.array([float(i) for i in x.strip("[]").split()]) if "[" in x else x) + + + #alt = alt.applymap(lambda x: np.array([float(i) for i in x.strip("[]").split()])) + + # run_permutation_stats.annotate(thresholds, alt, cond_dir, two_way=True, organ_volumes=data) + + # Specimens + geno_hits = run_permutation_stats.annotate(thresholds, alt, cond_dir, is_line_level=False, two_way=False, organ_volumes=data, main_of_two_way=True) + print(geno_hits) + +@pytest.mark.skip +def test_add_significance(): + df = pd.read_csv('E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/test_df.csv', index_col=0) + print(df) + add_two_way_significance(df, 0.05) + + +@pytest.mark.skip +def test_two_way_plotting(): + data = pd.read_csv('E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/input_data.csv', index_col=0) + + label_info = Path('E:/Bl6_data/211014_g_by_back/target/E14_5_atlas_v24_43_label_info.csv') + + lines_root_dir = Path('E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output') + + normalise_to_whole_embryo = True + voxel_size = 40 + + data_for_plots = data.copy() + 
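# Note: when normalise_to_whole_embryo is True the label columns hold organ volumes
# expressed as ratios to whole-embryo volume ('staging'), so the loop a few lines
# below multiplies each numeric label column back to an absolute volume before
# plotting, e.g. (hypothetical label column '17'):
#
#     data_for_plots['17'] = data_for_plots['17'] * data_for_plots['staging']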
data_for_plots.columns = [x.strip('x') for x in data_for_plots.columns] # Strip any xs + # If data has been normalised to WEV revert back for plots + if normalise_to_whole_embryo: + for col in data_for_plots.columns: + if col.isdigit(): + data_for_plots[col] = data_for_plots[col] * data_for_plots['staging'] + + make_plots(data_for_plots, label_info, lines_root_dir, voxel_size=voxel_size, two_way=True, skip_no_analysis=True) + + +@pytest.mark.skip +def test_dist_plots(): + line_null = pd.read_csv( + "E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/distributions/null_line_dist_pvalues.csv") + line_alt = pd.read_csv( + "E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/distributions/alt_line_dist_pvalues.csv") + line_organ_thresholds = pd.read_csv( + "E:/Bl6_data/211014_g_by_back/permutation_stats/perm_output/distributions/line_organ_p_thresholds.csv") + dist_plot_root = out_dir / 'distribution_plots' + line_plot_dir = dist_plot_root / 'line_level' + line_plot_dir.mkdir(parents=True, exist_ok=True) + + line_null_vals = pd.DataFrame([x[1:1000] for x in line_null.values]) + + + # line_null_vals.columns = line_null.columns + line_null_vals.columns = line_null.drop(columns='Unnamed: 0').columns + + line_null_vals = line_null_vals.applymap(lambda x: np.array([float(i) for i in x.strip("[]").split()])) + + line_alt_vals = pd.DataFrame([x.strip("[]").split() for x in line_alt.values[0]]) + + line_alt_vals = line_alt_vals.drop(line_alt_vals.index[0]).astype(float).transpose() + + line_alt_vals.columns = line_null_vals.columns + + pvalue_dist_plots(line_null_vals, line_alt_vals, line_organ_thresholds, line_plot_dir, label_meta_file=label_meta, + two_way=True) + + +def test_two_way_heatmaps(): + label_info = Path('E:/Bl6_data/211014_g_by_back/target/E14_5_atlas_v24_43_label_info.csv') + lines_root_dir = Path('E:/220607_two_way/permutation_stats/perm_output') + heatmaps_for_permutation_stats(lines_root_dir, two_way=True,label_info_file=label_info) + + diff --git a/lama/tests/working_on/test_two_way_stats.py b/lama/tests/working_on/test_two_way_stats.py new file mode 100644 index 00000000..50af7e07 --- /dev/null +++ b/lama/tests/working_on/test_two_way_stats.py @@ -0,0 +1,219 @@ +from pathlib import Path + +from lama.registration_pipeline import run_lama +from lama.scripts import lama_stats +from lama.scripts import lama_job_runner +import logging +import pytest +from lama.stats.standard_stats.data_loaders import DataLoader, load_mask +from lama import common +from lama.img_processing.normalise import Normaliser +import logzero +from lama.img_processing import normalise +from lama.stats.standard_stats.results_writer import ResultsWriter +from lama.stats.standard_stats.stats_objects import Stats +from lama.stats.standard_stats.lama_stats_new import invert_heatmaps + +from lama.lama_radiomics.radiomics import radiomics_job_runner + +from lama.stats import linear_model +from lama.common import cfg_load + +# from lama.stats.cluster_plots import umap_organs + +# Import paths from __init__.py +# from lama.tests import (stats_config_dir) + +wt_dir = Path( + "E:/220607_two_way_cp/g_by_back_data/baseline") +mut_dir = Path( + "E:/220607_two_way_cp/g_by_back_data/mutants") +treat_dir = Path( + "E:/220607_two_way_cp/g_by_back_data/treatment") +inter_dir = Path( + "E:/220607_two_way_cp/g_by_back_data/mut_treat") + +cfg = Path( + "E:/220607_two_way_cp/g_by_back_data/generate_data.toml") + +stats_cfg = Path( + "E:/220607_two_way_cp/stats_with_BH_correction/stats.toml") + +target_dir = Path( + 
"E:/220607_two_way_cp/target") + +stats_output = Path("E:/220607_two_way_cp/stats_with_BH_correction/") + +lines_to_process = None + + + +def test_lama_job_runner(): + """ + Test the lama job runner which was made to utilise multiple machines or the grid. + This test just uses one machine for the tests at the moment. + test_make_jobs_file() should run before this to create a jobs file that can be consumed. + This test should be run before the stats test as it creates data that the stats test needs. + NOTE this test should be at bottom of file as it should be ru last + The oututs of these tests are consumed by the stats test. + """ + + print(f"\n{'#' * 8} Doing config {cfg.name} {'#' * 8}") + + #lama_job_runner.lama_job_runner(cfg, wt_dir, make_job_file=True, log_level=logging.ERROR) + #lama_job_runner.lama_job_runner(cfg, wt_dir, log_level=logging.ERROR) + + #lama_job_runner.lama_job_runner(cfg, mut_dir, make_job_file=True, log_level=logging.ERROR) + #lama_job_runner.lama_job_runner(cfg, mut_dir, log_level=logging.ERROR) + + #lama_job_runner.lama_job_runner(cfg, treat_dir, make_job_file=True, log_level=logging.ERROR) + #lama_job_runner.lama_job_runner(cfg, treat_dir, log_level=logging.ERROR) + + lama_job_runner.lama_job_runner(cfg, inter_dir, make_job_file=True, log_level=logging.ERROR) + lama_job_runner.lama_job_runner(cfg, inter_dir, log_level=logging.ERROR) + + +@pytest.mark.skip +def test_g_by_e_reg(): + """ + lama has ony one arg, the config file. Loop over all the configs to test and + run with lama. + """ + + run_lama.run(cfg) + + +@pytest.mark.skip +def test_radiomics(): + c = cfg_load(Path("E:/220607_two_way/radiomics/generate_radiomics.toml")) + + target_dir = Path(c.get('target_dir')) + + labs_of_int = c.get('labs_of_int') + + norm_methods = c.get('norm_methods') + + norm_label = c.get('norm_label') + + spherify = c.get('spherify') + + ref_vol_path = Path(c.get('ref_vol_path')) + + norm_dict = { + "histogram": normalise.IntensityHistogramMatch(), + "N4": normalise.IntensityN4Normalise(), + "subtraction": normalise.NonRegMaskNormalise() + } + + try: + norm_meths = [norm_dict[x] for x in norm_methods] + except KeyError: + norm_meths = None + + radiomics_job_runner(target_dir, labs_of_int=labs_of_int, norm_method=norm_meths, spherify=spherify, + ref_vol_path=ref_vol_path) + + + + + + + +@pytest.mark.skip +def test_two_way_intensities(): + stats_config = common.cfg_load(stats_cfg) + + loader_class = DataLoader.factory("intensity") + + mask = load_mask(target_dir, stats_config['mask']) + label_info_file = target_dir / stats_config.get('label_info') # What if not exists + label_map_file = target_dir / stats_config.get('label_map') + label_map = common.LoadImage(label_map_file).array + + memmap = stats_config.get('memmap') + if memmap: + logging.info('Memory mapping input data') + + baseline_file = stats_config.get('baseline_ids') + if baseline_file: + baseline_file = stats_cfg.parent / baseline_file + + mutant_file = stats_config.get('mutant_ids') + if mutant_file: + mutant_file = stats_cfg.parent / mutant_file + + loader = loader_class(wt_dir, mut_dir, mask, stats_config, label_info_file, lines_to_process=lines_to_process, + baseline_file=baseline_file, mutant_file=mutant_file, memmap=memmap, + treatment_dir=treat_dir, interaction_dir=inter_dir) + + loader.normaliser = Normaliser.factory(stats_config.get('normalise'), "intensity") # move this into subclass + + line_iterator = loader.two_way_iterator() + line_input_data = None + + while True: + try: + line_input_data = 
next(line_iterator) + logging.info(f"Data for line {line_input_data.line} loaded") + common.logMemoryUsageInfo() + + line_id = line_input_data.line + + line_stats_out_dir = stats_output / line_id / "intensity" + + line_stats_out_dir.mkdir(parents=True, exist_ok=True) + line_log_file = line_stats_out_dir / f'{common.date_dhm()}_stats.log' + logzero.logfile(str(line_log_file)) + + stats_class = Stats.factory("intensity") + stats_obj = stats_class(line_input_data, "intensity", stats_config.get('use_staging', True), + stats_config.get('two_way', False)) + + stats_obj.stats_runner = linear_model.lm_r + stats_obj.run_stats() + + logging.info('Statistical analysis finished.') + common.logMemoryUsageInfo() + + logging.info('Writing results...') + + rw = ResultsWriter.factory("intensity") + writer = rw(stats_obj, mask, line_stats_out_dir, "intensity", label_map, label_info_file, + stats_config.get('two_way', False)) + + if stats_config.get('invert_stats'): + if writer.line_heatmap: # Organ vols wil not have this + # How do I now sensibily get the path to the invert.yaml + # get the invert_configs for each specimen in the line + logging.info('Writing heatmaps...') + logging.info('Propogating the heatmaps back onto the input images ') + line_heatmap = writer.line_heatmap + line_reg_dir = mut_dir / 'output' / line_id + invert_heatmaps(line_heatmap, line_stats_out_dir, line_reg_dir, line_input_data) + logging.info('Finished writing heatmaps.') + + logging.info(f"Finished processing line: {line_id} - All done") + common.logMemoryUsageInfo() + + except StopIteration: + if (line_input_data != None): + logging.info(f"Finish iterate through lines") + line_input_data.cleanup() + common.logMemoryUsageInfo() + + break + + + +def test_two_way_stats(): + """ + tests the two_ways_stats component + Returns + For each folder: + intensity (genotype, environment and interaction) + jacobians (genotype, env etc. ) + organ_volumes (geno, env etc.) 
+ ------- + + """ + lama_stats.run(stats_cfg, wt_dir, mut_dir, stats_output, target_dir, treat_dir, inter_dir) diff --git a/lama/utilities/atlas_tools.py b/lama/utilities/atlas_tools.py new file mode 100644 index 00000000..5bb16da3 --- /dev/null +++ b/lama/utilities/atlas_tools.py @@ -0,0 +1,129 @@ +from skimage.measure import regionprops +from pathlib import Path +from dataclasses import dataclass +from typing import Union, List, Iterator +import nrrd +import numpy as np + +@dataclass +class Slice: + axis: int + indices: np.lib.index_tricks.IndexExpression + image: np.ndarray = None + labels: np.ndarray = None + + +rotations = { + # Number of rotations for each axis to get into our standard orientation + 0: 1, + 1: 1, + 2: 3 +} + + +class AtlasProps: + def __init__(self, atlas_path: Union[Path, str], + raw_image_path: Union[Path, str] = None): + + self.atlas_path = atlas_path + self.atlas, self.atlas_head = nrrd.read(atlas_path) + self.atlas = self.atlas.astype(np.int) + + if raw_image_path: + self.image_path = raw_image_path + self.image, self.image_head = nrrd.read(raw_image_path) + else: + self.image = self.image_head = None + + # Created a dict of regions props indexed by label and sorted by area + self.props = {prop['label']: prop for prop in sorted(regionprops(self.atlas), key=lambda x: x.area, reverse=True)} + + def slice_indices(self, label: int, axis=0, pad=0) -> List[np.lib.index_tricks.IndexExpression]: + # get a list of np.s_ slices which can be used to generate 2D slices form the atlas + bbox = self.props[label].bbox + + slices = [] + + # Test + start = bbox[2] - pad + end = bbox[5] + pad + + for i in range(start, end): + slices.append(np.s_[bbox[0]-pad: bbox[3]+pad, bbox[1]-pad: bbox[4]+pad, i]) + + return slices + + # def slices(self, label, n_slices, axis=0, pad=0, return_labels=True, return_image=True) -> Iterator[slice]: + # + # if not return_image and not return_labels: + # raise ValueError('return_labels or return_image must be True') + # + # indices = self.slice_indices(label, axis, pad) + # + # s = Slice() + # + # for idx in indices: + # s.indices = idx + # s.axis = axis + # if return_image: + # s.image = self.image[idx] + # if return_labels: + # s.labels = self.atlas[idx] + def get_roi_slices(self, label, n_slices, axis, padding, main_axis_padding=3, dtype=None): + """ + slice_padding + The ammount of padding applied to each 2D slice + slice_padding_2 + The ammount of padding used for the indices of the main axis + """ + for roi in self.get_roi(label, padding): + m = padding - main_axis_padding + if m < padding: + m = padding + + slices = [] + main_axis_len = roi.shape[axis] + slice_indices = list(np.linspace(0 + m, main_axis_len - m, n_slices, dtype=int)) + + for idx in slice_indices: + + s = np.rot90(np.take(roi, idx, axis=axis), k=rotations[axis]) + if dtype: + s = s.astype(dtype) + slices.append(s) + yield slices + + def get_roi(self, label, padding: int=0, shape=None) -> np.ndarray: + + if not shape: # Use the label bounding box + b = self.props[label].bbox + + else: # Use centroid and shape + c = self.props[label].centroid + b = [int(x) for x in [c[0] - (shape[0] / 2), c[1] - (shape[1] / 2), c[2] - (shape[2] / 2), # bbox starts + c[0] + (shape[0] / 2), c[1] + (shape[1] / 2), c[2] + (shape[2] / 2)] ] # bbox ends + + if padding: + b = [x - padding if i < 3 else x + padding for i, x in enumerate(b)] + b = np.clip(b, 0, max(b)) + atlas_roi = self.atlas[b[0]:b[3], b[1]: b[4], b[2]: b[5]] + image_roi = self.image[b[0]:b[3], b[1]: b[4], b[2]: b[5]] + + 
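# The next line zeroes every other label in the cropped atlas so only the requested
# structure remains; the raw-image ROI is returned untouched. A hypothetical usage
# sketch for this class (placeholder paths and label number):
#
#     props = AtlasProps('atlas.nrrd', 'population_average.nrrd')
#     atlas_roi, image_roi = props.get_roi(label=17, padding=10)
#     for slices_2d in props.get_roi_slices(label=17, n_slices=5, axis=0, padding=10):
#         pass  # each item is a list of rotated 2D numpy slices through the ROI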
atlas_roi[atlas_roi != label] = 0 + + return atlas_roi, image_roi + + +def remove_unconected(image, label_num) -> np.ndarray: + """ + Keep only the largest coonected componenet label with given label num. Zero out all other labels + + """ + from skimage.measure import label, regionprops + image = np.copy(image) + image[image != label_num] = 0 + relab = label(image) + biggest_label = sorted(regionprops(relab), key=lambda x: x.area, reverse=True)[0].label + image[:] = 0 + image[relab == biggest_label] = label_num + return image \ No newline at end of file diff --git a/lama/utilities/body_radiomics_normaliser.py b/lama/utilities/body_radiomics_normaliser.py new file mode 100644 index 00000000..cf657c8a --- /dev/null +++ b/lama/utilities/body_radiomics_normaliser.py @@ -0,0 +1,295 @@ +"""Normalises the radiomics scans by the average intensity of a mask""" +from typing import Union + +from lama.img_processing import normalise + +from logzero import logger as logging +from lama import common +import os +import nrrd +from pathlib import Path + +import numpy as np +from scipy import ndimage +import SimpleITK as sitk + +import radiomics + +from radiomics import featureextractor + +import pandas as pd + + +# each scan in Ben's dataset will need its own mask +def get_images_from_masks(dir): + img_list = [] + spec_name_list = [] + mask_list = [] + scan_paths = [spec_path for spec_path in common.get_file_paths(dir) if ('imgs' in str(spec_path))] + mask_paths = [mask_path for mask_path in common.get_file_paths(dir) if ('labels' in str(mask_path))] + + scan_paths.sort() + mask_paths.sort() + + # enumerate for indexing masks + for i, img_path in enumerate(scan_paths): + + # load image and stage + logging.info(f"Obtaining values from {img_path}") + loader = common.LoadImage(img_path) + img = loader.img + + logging.info(f"Removing values from {mask_paths[i]}") + m_loader = common.LoadImage(mask_paths[i]) + mask = m_loader.img + + # Only get values inside of the mask + + # get the arrays + img_a = sitk.GetArrayFromImage(img) + mask_a = sitk.GetArrayFromImage(mask) + + # remove the stage + img_a[mask_a == 1] = np.min(img_a) + + img_pro = sitk.GetImageFromArray(img_a) + img_pro.CopyInformation(img) + + logging.info(f"Performing Otsu on {scan_paths[i]}") + + Otsu = sitk.OtsuThresholdImageFilter() + + inv_mask = Otsu.Execute(img) + o_mask = sitk.InvertIntensity(inv_mask, 1) + + o_mask = sitk.ConnectedComponent(o_mask != o_mask[0, 0, 0]) + + # sitk.WriteImage(seg, os.path.join(output, name + "_all_connected.nrrd")) + o_mask = sitk.RelabelComponent(o_mask) + o_mask = o_mask == 1 + # sitk.WriteImage(seg, os.path.join(output, name + "_largest_connected.nrrd")) + + # lets see if dilate with a tight kernal fixes getting stupid dots everywhere. 
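# The 1-voxel ball dilation below grows the kept component slightly so that small
# surface pinholes are sealed before the slice-wise hole filling. If speckle still
# remains, a binary closing is one alternative to try (sketch, not used here):
#
#     closer = sitk.BinaryMorphologicalClosingImageFilter()
#     closer.SetKernelRadius([1, 1, 1])
#     o_mask = closer.Execute(o_mask)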
+ dilate = sitk.BinaryDilateImageFilter() + dilate.SetKernelRadius([1, 1, 1]) + dilate.SetKernelType(sitk.sitkBall) + o_mask = dilate.Execute(o_mask) + o_mask.CopyInformation(img) + + # Operation is peformed using scipy so needs to be a numpy array + npa = sitk.GetArrayFromImage(o_mask) + + logging.info("fill holes in first orientation") + npa_hole_filled = fill_image(npa) + + logging.info("fill holes in second orientation") + npa_hole_filled = fill_image(npa_hole_filled, roll=1) + + logging.info("fill holes in third orientation") + npa_hole_filled = fill_image(npa_hole_filled, roll=0) + + # Need to turn the image back into its original orientation + transposed = np.transpose(npa_hole_filled, axes=(0, 2, 1)) + + # Turn np array to image + filled = sitk.GetImageFromArray(transposed) + filled.CopyInformation(o_mask) + + + q_otsu = "otsu" + sitk.WriteImage(filled, str(Path(os.path.dirname(img_path)).parent.absolute() / q_otsu / os.path.basename(img_path))) + + logging.info("Removing values outside of the mask") + + mask_a2 = sitk.GetArrayFromImage(filled) + + img_a2 = sitk.GetArrayFromImage(img_pro) + + img_a2[mask_a2 != 1] = np.min(img_a2) + + img_pro = sitk.GetImageFromArray(img_a2) + img_pro.CopyInformation(img) + + spec_name_list.append(os.path.splitext(img_path.name)[0]) + # print(spec_name_list) + img_list.append(img_pro) + mask_list.append(filled) + return img_list, spec_name_list, mask_list + + +def fill_image(npa, roll=0): + """ Binary hole fill in any orientation + + Go slice by slice and fill any holes of a 3D image. Orientation is chosen by the roll parameter. + + :param array npa: Numpy array of the image to be processed + :param int roll: Which axis to image is to be "rolled" in. If use none, 0 and 1. Should do all axis for a 3D image + [Default 0] + :return array npa_hole_filled: Numpy array of image + """ + + npa_hole_filled = None + + # Change orientation + if roll: + loop = np.rollaxis(npa, roll) + else: + loop = npa + + # loop through each slice + for slice_ in loop: + slice_fill = ndimage.binary_fill_holes(slice_).astype(int) + if npa_hole_filled is None: + npa_hole_filled = slice_fill + else: + npa_hole_filled = np.dstack((npa_hole_filled, slice_fill)) + + return npa_hole_filled + + +def pyr_calc_all_features(dir, normed: bool = False, images: list = None, file_names: list = None): + # get either the normalised or original images + scan_paths = images if normed \ + else [spec_path for spec_path in common.get_file_paths(dir) if ('imgs' in str(spec_path))] + print(scan_paths) + tumour_paths = [spec_path for spec_path in common.get_file_paths(dir) if ('tumour_respaced' in str(spec_path))] + + # debugging - Thanks Neil + scan_paths.sort() + tumour_paths.sort() + + # Get the first order measuremients + full_orders = [] + + for i, img_path in enumerate(scan_paths): + + # logging.info(f"Calculating for {os.path.splitext(os.path.basename(img_path))[0]}") + if normed: # files exist + img = img_path + else: + logging.info(img_path) + logging.info(tumour_paths[i]) + loader = common.LoadImage(img_path) + img = loader.img + + m_loader = common.LoadImage(tumour_paths[i]) + mask = m_loader.img + + #so need to binarise the mask + + mask_arr = sitk.GetArrayFromImage(mask) + + mask_arr[mask_arr > 1] = 1 + + b_mask = sitk.GetImageFromArray(mask_arr) + b_mask.CopyInformation(mask) + + # get all features and append to list + extractor = featureextractor.RadiomicsFeatureExtractor() + + + result = extractor.execute(img, b_mask) + + if file_names is not None: + first_orders = 
pd.DataFrame.from_dict(result, orient='index', + columns=[os.path.splitext(os.path.basename(file_names[i]))[0]]) + else: + first_orders = pd.DataFrame.from_dict(result, orient='index', + columns=[os.path.splitext(os.path.basename(img_path))[0]]) + full_orders.append(first_orders) + + # fixing data format + features = pd.concat(full_orders, axis=1).transpose() + + #_metadata = features.index #.str.split('_', expand=True).to_frame(index=False, + # name=['Date', 'Strain', 'Colony', + # 'Embryo', 'Genotype']) + + #name=['Date', 'Exp', 'Contour_Method', + # 'Tumour_Model', 'Position', 'Age', + # 'Cage_No.', 'Animal_No.']) + #_metadata.reset_index(inplace=True, drop=True) + #features.reset_index(inplace=True, drop=True) + #features = pd.concat([_metadata, features], axis=1) + + #features.index.rename('scanID', inplace=True) + + return features + + +def pyr_normaliser(_dir, _normaliser, scans_imgs, masks, fold: bool = False, ref_vol_path: Path = None): + # create a copy so orginal files aren't overwritten + scans_imgs = scans_imgs.copy() + + # Do the normalisation + if isinstance(_normaliser, normalise.NonRegMaskNormalise): + _normaliser.add_reference(scans_imgs[0], masks[0]) + _normaliser.normalise(scans_imgs, masks, fold=fold, temp_dir=_dir) + elif isinstance(_normaliser, normalise.IntensityHistogramMatch): + if ref_vol_path: + ref_vol = common.LoadImage(ref_vol_path).img + _normaliser.normalise(scans_imgs, ref_vol) + + else: + _normaliser.normalise(scans_imgs, scans_imgs[0]) + + return scans_imgs + + +def main(): + # import argparse + # parser = argparse.ArgumentParser("Run various intensity normalisation methods") + # parser.add_argument('-i', dest='indirs', help='dir with vols, tumour masks and label masks', + # required=True) + + # args = parser.parse_args() + + logging.info("Calculating Original Features") + # _dir = Path(args.indirs) + _dir = Path("E:/220204_BQ_dataset/220530_BQ_norm") + + #ref_path = Path("E:/Bl6_data/211014_g_by_back/target/210602_C3H_avg_n18.nrrd") + + #orig_features = pyr_calc_all_features(_dir) + #orig_features.to_csv(str(_dir / "orig_features.csv")) + + # get the images and masks + logging.info("Getting values from inside the stage") + scans_imgs, scan_names, masks = get_images_from_masks(_dir) + + scan_names.sort() +#logging.info("Normalising to mean of the stage (subtraction)") + + sub_int_normed = pyr_normaliser(_dir, normalise.NonRegMaskNormalise(), scans_imgs, masks) + + # for i, vol in enumerate(sub_int_normed): + # file_name = scan_names[i] + '.nrrd' + # sitk.WriteImage(vol, str(_dir / file_name)) + # logging.info("Recalculating Features") + sub_normed_features = pyr_calc_all_features(_dir, normed=True, images=sub_int_normed, file_names=scan_names) + sub_normed_features.to_csv(str(_dir / "sub_normed_features.csv")) + + #logging.info("Normalising to mean of the stage (fold)") + #fold_int_normed = pyr_normaliser(_dir, normalise.NonRegMaskNormalise(), scans_imgs, masks, fold=True) + #logging.info("Recalculating Features") + #fold_normed_features = pyr_calc_all_features(_dir, normed=True, images=fold_int_normed, file_names=scan_names) + #fold_normed_features.to_csv(str(_dir / "fold_normed_features.csv")) + + #logging.info("Maskless Histogram Intensity Matching") + #histo_normed = pyr_normaliser(_dir, normalise.IntensityHistogramMatch(), scans_imgs, masks, ref_vol_path=ref_path) + #logging.info("Recalculating Features") + #histo_normed_features = pyr_calc_all_features(_dir, normed=True, images=histo_normed, file_names=scan_names) + 
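# Caution: the pd.concat(...) a few lines below still references `orig_features`,
# which is only bound if the commented-out "orig_features = pyr_calc_all_features(_dir)"
# block near the top of main() is re-enabled. One way to keep the concat working
# (sketch) is to reload a previously saved run instead:
#
#     orig_features = pd.read_csv(_dir / "orig_features.csv", index_col=0)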
#histo_normed_features.to_csv(str(_dir / "histo_normed_features.csv")) + + all_features = pd.concat([orig_features, sub_normed_features], + keys=["Raw", "Subtraction"]) + + all_features.index.rename('Norm_Type', inplace=True) + + all_features.to_csv(str(_dir / "all_features.csv")) + + logging.info("DONE") + + +if __name__ == '__main__': + main() diff --git a/lama/utilities/combine_spec_csv.py b/lama/utilities/combine_spec_csv.py new file mode 100644 index 00000000..1652d1b6 --- /dev/null +++ b/lama/utilities/combine_spec_csv.py @@ -0,0 +1,48 @@ +from pathlib import Path +import nrrd +import os +import numpy as np +from logzero import logger as logging +from lama import common +from typing import Union, List, Tuple, Dict +import pandas as pd + + +def main(root_dir): + # big chunk of code creates a pandas dataset containing all the staging / organ volumes for each + # registration directory stored in the .csv file + print(root_dir) + full_staging_data = pd.concat( + [pd.read_csv(spec) for spec in common.get_file_paths(folder=root_dir, extension_tuple=".csv") + if (common.STAGING_INFO_FILENAME in str(spec))], + ignore_index=True) # Replace STAGING_INFO_FILENAME with ORGAN_INFO_FILENAME to get organ vols + + # remove duplicates + print(full_staging_data) + full_staging_data.drop(columns=["value"], inplace=True) + full_staging_data.replace('', np.nan, inplace=True) + + full_staging_data.dropna(subset=["staging"], inplace=True) + print(full_staging_data) + + output = root_dir / "full_staging.csv" + full_staging_data.to_csv(output, index=False) + + full_organ_data = pd.concat( + [pd.read_csv(spec) for spec in common.get_file_paths(folder=root_dir, extension_tuple=".csv") + if (common.ORGAN_VOLUME_CSV_FILE in str(spec))], + ignore_index=True) # Replace STAGING_INFO_FILENAME with ORGAN_INFO_FILENAME to get organ vols + + output = root_dir / "full_organs.csv" + full_organ_data.to_csv(output) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser("Collect organ and staging files") + parser.add_argument('-i', dest='indirs', help='Root registration directory', + required=True) + + args = parser.parse_args() + main(args.indirs) diff --git a/lama/utilities/cropper.py b/lama/utilities/cropper.py new file mode 100644 index 00000000..50b10e7c --- /dev/null +++ b/lama/utilities/cropper.py @@ -0,0 +1,71 @@ +import SimpleITK as sitk +import os +from pathlib import Path +from lama import common +import nrrd +import numpy as np +from scipy import ndimage +from logzero import logger as logging + +def main(): + + + target_dir = Path("E:/try_emap_to_SD/final") + volpaths = common.get_file_paths(target_dir) + + cropped = "cropped" + os.makedirs(str(target_dir/cropped), exist_ok=True) + logging.info("Cropping") + + for path in volpaths: + logging.info(f"Doing {os.path.basename(path)}") + vol, v_head = nrrd.read(path) + + loader = common.LoadImage(path) + img = loader.img + + # get the otsu mask + Otsu = sitk.OtsuThresholdImageFilter() + + inv_mask = Otsu.Execute(img) + mask = sitk.InvertIntensity(inv_mask, 1) + + mask = sitk.ConnectedComponent(mask != mask[0, 0, 0]) + + #sitk.WriteImage(seg, os.path.join(output, name + "_all_connected.nrrd")) + mask = sitk.RelabelComponent(mask) + mask = mask == 1 + #sitk.WriteImage(seg, os.path.join(output, name + "_largest_connected.nrrd")) + + #lets see if dilate with a tight kernal fixes getting stupid dots everywhere. 
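# Axis-order note for the cropping further below: `mask_arr` comes from
# sitk.GetArrayFromImage and is indexed (z, y, x), whereas `vol` from nrrd.read is
# typically stored (x, y, z); that is why the bounding-box slices are applied in
# reverse order (s[2], s[1], s[0]) to the nrrd array. Also note the padded indices
# are not clipped, so a start index within p of zero would index from the end of the
# array. Equivalent crop on the sitk-ordered array (sketch):
#
#     crop = sitk.GetArrayFromImage(img)[s[0].start - p:s[0].stop + p,
#                                        s[1].start - p:s[1].stop + p,
#                                        s[2].start - p:s[2].stop + p]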
+ dilate = sitk.BinaryDilateImageFilter() + dilate.SetKernelRadius([1,1,1]) + dilate.SetKernelType(sitk.sitkBall) + mask = dilate.Execute(mask) + + #sitk.WriteImage(mask, str(target_dir / masked / os.path.basename(path))) + + mask_arr = sitk.GetArrayFromImage(mask) + + # get the bounding box of the mask + + s = ndimage.find_objects(mask_arr)[0] + + # Add some tight padding + + p = 3 + + crop_vol = vol[s[2].start - p: s[2].stop + p, + s[1].start - p: s[1].stop + p, + s[0].start - p: s[0].stop + p] + # + #l_clip, c_head = nrrd.read(target_dir / clip / os.path.basename(path)) + + #crop_vol[l_clip != 0] = np.random.choice([38,39,40]) + + file_path = target_dir / cropped + + nrrd.write(str(file_path/ os.path.basename(path)), crop_vol, header=v_head) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/utilities/extract_label.py b/lama/utilities/extract_label.py new file mode 100644 index 00000000..3588d0c2 --- /dev/null +++ b/lama/utilities/extract_label.py @@ -0,0 +1,82 @@ +from pathlib import Path +import nrrd +import os +from lama import common +import SimpleITK as sitk +from scipy import ndimage +import numpy as np + + +def main(target_dir, labs_of_interest: list = [17]): + label_paths = [spec_path for spec_path in common.get_file_paths(target_dir) if + ('inverted_labels' in str(spec_path))] + + rigid_paths = [rigid_path for rigid_path in common.get_file_paths(Path(target_dir / 'rigid'), extension_tuple=".nrrd")] + + + rigid_paths.sort(key = lambda x: os.path.basename(x)) + label_paths.sort(key = lambda x: os.path.basename(x)) + + #rigids = [nrrd.read(path) for path in rigid_paths] + + + + # Just get the label and write them + for i, path in enumerate(label_paths): + print("rigid path:", rigid_paths[i]) + print("label path:", path) + + label, l_head = nrrd.read(path) + rigid, r_head = nrrd.read(rigid_paths[i]) + # print((~np.isin(label, labs_of_interest))) + + label[~np.isin(label, labs_of_interest)] = 0 + + + # get roi of label and rigid for scaling + s = ndimage.find_objects(label)[-1] + + + if isinstance(labs_of_interest, list): + + t = ndimage.find_objects(label)[int(min(labs_of_interest)-1)] + + + midpoint = [np.mean([t[0].start, s[0].stop]), + np.mean([t[1].start, s[1].stop]), + np.mean([t[2].start, s[2].stop])] + else: + midpoint = [np.mean([s[0].start, s[0].stop]), + np.mean([s[1].start, s[1].stop]), + np.mean([s[2].start, s[2].stop])] + + midpoint = [int(np.round(i)) for i in midpoint] + + p = 80 + + + crop_lab = label[midpoint[0] - p: midpoint[0] + p, + midpoint[1] - p: midpoint[1] + p, + midpoint[2] - p: midpoint[2] + p] + + + + crop_rig = rigid[midpoint[0] - p: midpoint[0] + p, + midpoint[1] - p: midpoint[1] + p, + midpoint[2] - p: midpoint[2] + p] + + # label[label not in labs_of_interest] = 0 + # print(str(os.path.basename(path))) + os.makedirs(target_dir / "uncropped_labels", exist_ok=True) + os.makedirs(target_dir / "cropped_labels", exist_ok=True) + os.makedirs(target_dir / "cropped_rigids", exist_ok=True) + + file_name = target_dir / "uncropped_labels" / str(os.path.basename(path)) + + cr_file_name = target_dir / "cropped_rigids" / str(os.path.basename(path)) + + cl_file_name = target_dir / "cropped_labels" / str(os.path.basename(path)) + + nrrd.write(str(file_name), label) + nrrd.write(str(cr_file_name), crop_rig) + nrrd.write(str(cl_file_name), crop_lab) diff --git a/lama/utilities/extract_registrations.py b/lama/utilities/extract_registrations.py new file mode 100644 index 00000000..c6281e58 --- /dev/null +++ 
b/lama/utilities/extract_registrations.py @@ -0,0 +1,27 @@ + +from pathlib import Path +import nrrd +import os +from lama import common + + + + +def main(target_dir): + + + reg_paths = [spec_path for spec_path in common.get_file_paths(target_dir) if ('registrations' in str(spec_path))] + + rigid_paths = [spec_path for spec_path in reg_paths if ('rigid' in str(spec_path))] + + + + + rigid_paths.sort() + + for path in rigid_paths: + rigid, r_head = nrrd.read(path) + os.makedirs(target_dir / "rigid", exist_ok=True) + + file_name = target_dir / "rigid" / str(os.path.basename(path)) + nrrd.write(str(file_name), rigid) \ No newline at end of file diff --git a/lama/utilities/flipper.py b/lama/utilities/flipper.py new file mode 100644 index 00000000..3d9f9d92 --- /dev/null +++ b/lama/utilities/flipper.py @@ -0,0 +1,36 @@ +import SimpleITK as sitk +import os +from pathlib import Path +from lama import common + +def main(target_dir: Path = os.getcwd()): + target_dir = Path("E:/220204_BQ_dataset/221218_BQ_run/new_inv_labels") + volpaths = common.get_file_paths(target_dir) + print('flipping') + + for path in volpaths: + print(path) + loader = common.LoadImage(path) + vol = loader.img + flipped_vol = sitk.Flip(vol, [False, True, False]) + #pa = sitk.PermuteAxesImageFilter() + #pa.SetOrder([1,0,2]) + #flipped_vol = pa.Execute(vol) + print(flipped_vol.GetDirection()) + + direction = flipped_vol.GetDirection() + #print(type(direction[6])) + + #direction[6] = direction[6] * -1 + + + flipped_vol.SetDirection([-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 1.0]) + + print(flipped_vol.GetDirection()) + + #flipped_vol.SetOrigin([0,0,0]) + sitk.WriteImage(flipped_vol, str(target_dir) + "/flipped/" + str(os.path.basename(path)), True) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/utilities/img_processor.sh b/lama/utilities/img_processor.sh new file mode 100644 index 00000000..d328c894 --- /dev/null +++ b/lama/utilities/img_processor.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Complete image processing script created by Amrit and Kyle + + +# cropping scans +mv cropper.py nrrd_out/ + +ls nrrd_out + +cd nrrd_out + +mkdir cropped masked + +python3 cropper.py + +# 16-bit to 8-bit conversion + +mkdir converted + +lama_convert_16_to_8 -i cropped -o converted + +# flipping scans to matc the population average + +mv ../flipper.py converted/ + +cd converted + +python3 flipper.py + +# padding +cd ../ + +mkdir needs_padding +mkdir padded +cp -r ../target needs_padding/ + +mv converted needs_padding/ + +lama_pad_volumes -i needs_padding -o padded + +mv padded ../ + + + + + + + + + + + + + + + + + + + + + diff --git a/lama/utilities/invert_emap_ref.py b/lama/utilities/invert_emap_ref.py new file mode 100644 index 00000000..3564e055 --- /dev/null +++ b/lama/utilities/invert_emap_ref.py @@ -0,0 +1,45 @@ +from pathlib import Path +from lama import common +import SimpleITK as sitk +import numpy as np + + + + + + +def main(): + #path = Path("E:/220901_emap_e12_atlas/TS20_EMA76_reference.nrrd") + + #loader = common.LoadImage(path) + #img = loader.img + + # get the otsu mask + + #inv = sitk.InvertIntensity(img) + + + #sitk.WriteImage(inv, "E:/220901_emap_e12_atlas/TS20_EMA76_reference_inv.nrrd") + + path = Path("E:/220901_emap_e12_atlas/TS20_EMA76_reference_inv.nrrd") + + loader = common.LoadImage(path) + img = loader.img + + euler_transform = sitk.Euler3DTransform() + euler_transform.SetRotation(angleX=90) + + + + + + + + + + + + + +if __name__ == '__main__': + main() \ No newline at end of file diff 
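# Note on invert_emap_ref.py above: sitk.Euler3DTransform.SetRotation expects angles
# in radians (and typically all three angles), so a 90-degree rotation about X would
# look like this sketch rather than SetRotation(angleX=90):
#
#     euler_transform = sitk.Euler3DTransform()
#     euler_transform.SetRotation(np.deg2rad(90), 0.0, 0.0)
#     rotated = sitk.Resample(img, img, euler_transform, sitk.sitkLinear, 0.0)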
--git a/lama/utilities/label_intensity_info.py b/lama/utilities/label_intensity_info.py new file mode 100644 index 00000000..fc8e12a5 --- /dev/null +++ b/lama/utilities/label_intensity_info.py @@ -0,0 +1,89 @@ +from lama.img_processing import normalise +from logzero import logger as logging +from lama import common +import os +import nrrd +from pathlib import Path +from scipy import ndimage +import numpy as np +import pandas as pd +from lama.utilities.radiomics_normaliser import get_images_from_masks +import statsmodels.formula.api as smf +import matplotlib.pyplot as plt + + +def main(): + _dir = Path( + "E:/220204_BQ_dataset/stage_info/out_dir") + + # get the images and masks + # out_dir = _dir / 'out_dir' + + # scans_imgs, scan_names = get_images_from_masks(_dir) + + # int_norm = normalise.IntensityMaskNormalise() + + # normalise the images + # int_norm.add_reference(scans_imgs) + + # int_norm.normalise(mut_imgs) + + # int_norm.normalise(scans_imgs) + + # logging.info('writing normalised files') + + # logging.info(f" saving masked files to {out_dir}") + # for i, vol in enumerate(scans_imgs): + # file_name = out_dir / scan_names[i] + + # nrrd.write(str(file_name) + ".nrrd", vol) + + vols = common.get_file_paths(_dir) + + all_vals = [] + for i, vol in enumerate(vols): + img, img_h = nrrd.read(vol) + all_vals.append(img[img != 0].flatten()) + + info = pd.DataFrame([os.path.basename(vol) for vol in vols], columns=['name']) + + logging.info("basic stats") + means = pd.DataFrame([np.mean(val) for val in all_vals]) + sds = pd.DataFrame([np.std(val) for val in all_vals]) + sems = pd.DataFrame([np.std(val) / np.sqrt(len(val)) for val in all_vals]) + + bstats = pd.concat([info, means, sds, sems], axis=1) + + bstats.to_csv("E:/220204_BQ_dataset/stage_info/bstats.csv") + d = pd.DataFrame(all_vals, index=info.values) + + df = pd.concat([info, d], axis=1) + + df = pd.melt(df, id_vars='name') + + logging.info("plotting boxplots") + fig1, ax1 = plt.subplots() + ax1.set_title('Basic Plot') + ax1.boxplot(df) + + plt.show() + + logging.info("Stats") + + fit = smf.ols(formula='values ~ name', data=df, missing='drop').fit() + p = fit.pvalues + + print(p) + + pd.DataFrame(p).to_csv("E:/220204_BQ_dataset/stage_info/pvals.csv") + + # all_vals = [vol[vol != 0].flatten() for vol in scans_imgs] + + # pd.DataFrame(all_vals).to_csv("E:/220204_BQ_dataset/stage_info/intens_vals.csv", + # index=True) + + logging.info("Finished!") + + +if __name__ == '__main__': + main() diff --git a/lama/utilities/lama_pad_volumes.py b/lama/utilities/lama_pad_volumes.py index a9d290e3..550def75 100755 --- a/lama/utilities/lama_pad_volumes.py +++ b/lama/utilities/lama_pad_volumes.py @@ -27,6 +27,7 @@ from typing import Iterable, Tuple from pathlib import Path +import numpy as np import SimpleITK as sitk import nrrd from logzero import logger as logging @@ -136,7 +137,7 @@ def pad_volumes(indirs: Iterable[Path], max_dims: Tuple, outdir: Path, clobber: raise common.LamaDataException(msg) # Pad the volume. 
New pixels set to zero - padded_vol = sitk.ConstantPad(vol, upper_extend, lower_extend, 0) + padded_vol = sitk.ConstantPad(vol, upper_extend, lower_extend, np.min(sitk.GetArrayFromImage(vol)).astype(float)) padded_vol.SetOrigin((0, 0, 0)) padded_vol.SetSpacing((1, 1, 1)) diff --git a/lama/utilities/make_otsu.py b/lama/utilities/make_otsu.py new file mode 100644 index 00000000..3752eeb0 --- /dev/null +++ b/lama/utilities/make_otsu.py @@ -0,0 +1,60 @@ +from pathlib import Path +from lama import common +import SimpleITK as sitk +import os +from body_radiomics_normaliser import fill_image +from logzero import logger as logging +import numpy as np + +def main(): + _dir = Path("E:/try_emap_to_SD/test_pad/220909_sd_rev_reg/target/TS20_EMA76_reference.nrrd") + print(_dir) + masked = 'masked' + #for i, path in enumerate(common.get_file_paths(_dir)): + + + loader = common.LoadImage(_dir) + img = loader.img + + # get the otsu mask + Otsu = sitk.OtsuThresholdImageFilter() + + inv_mask = Otsu.Execute(img) + mask = sitk.InvertIntensity(inv_mask, 1) + + mask = sitk.ConnectedComponent(mask != mask[0, 0, 0]) + + # sitk.WriteImage(seg, os.path.join(output, name + "_all_connected.nrrd")) + mask = sitk.RelabelComponent(mask) + mask = mask == 1 + # sitk.WriteImage(seg, os.path.join(output, name + "_largest_connected.nrrd")) + + # lets see if dilate with a tight kernal fixes getting stupid dots everywhere. + dilate = sitk.BinaryDilateImageFilter() + dilate.SetKernelRadius([200, 200, 200]) + dilate.SetKernelType(sitk.sitkBall) + mask = dilate.Execute(mask) + #npa = sitk.GetArrayFromImage(mask) + #logging.info("fill holes in first orientation") + #npa_hole_filled = fill_image(npa) + + #logging.info("fill holes in second orientation") + #npa_hole_filled = fill_image(npa_hole_filled, roll=1) + + #logging.info("fill holes in third orientation") + #npa_hole_filled = fill_image(npa, roll=0) + + #transposed = np.transpose(npa_hole_filled, axes=(0, 2, 1)) + + # Turn np array to image + #filler = sitk.VotingBinaryIterativeHoleFillingImageFilter() + #filler.SetMaximumNumberOfIterations(1000) + #filled = filler.Execute(mask) + #filled.CopyInformation(mask) + + + sitk.WriteImage(mask, str( _dir.parent / masked / os.path.basename(_dir))) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/utilities/make_roi.py b/lama/utilities/make_roi.py new file mode 100644 index 00000000..c3204bc4 --- /dev/null +++ b/lama/utilities/make_roi.py @@ -0,0 +1,31 @@ +from pathlib import Path +import nrrd +import SimpleITK as sitk +from scipy import ndimage + + + +img_path = '/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/BCM/stability_101120/output/baseline/output/baseline/2086980_download/output/registrations/rigid/2086980_download/2086980_download.nrrd' +labels_path = '/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/BCM/stability_101120/output/baseline/output/baseline/2086980_download/output/inverted_labels/similarity/2086980_download/2086980_download.nrrd' +outdir = Path('/home/neil/Desktop/t/cluster') + +label_of_interest = 10 # medial liver lobe + +img, i_head = nrrd.read(img_path) +labels, _ = nrrd.read(labels_path) + +# Get the bounding box of the inverted label +labels[labels != label_of_interest] = 0 + +s = ndimage.find_objects(labels)[9] + +# Add some padding +p = 30 + +# Get a rough ROI based on the lama segmentation +roi = img[s[0].start - p: s[0].stop + p , + s[1].start -p: s[1].stop + p, + s[2].start -p: s[2].stop + p] + +outpath = outdir / 'test_roi.nrrd' +nrrd.write(str(outpath), roi, 
header=i_head) diff --git a/lama/utilities/man_clip_removal.py b/lama/utilities/man_clip_removal.py new file mode 100644 index 00000000..22d691ce --- /dev/null +++ b/lama/utilities/man_clip_removal.py @@ -0,0 +1,30 @@ +from pathlib import Path +import nrrd +import os +import numpy as np +from logzero import logger as logging + + +def main(): + # NOTE DO NOT USE AS YOU WILL CLIP SHIT + img_path = Path( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210713_ark_target/210602_C3H_avg_n18.nrrd") + label_path = Path( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210713_ark_target/labelled_clip.nrrd") + + img, img_h = nrrd.read(img_path) + l_clip, c_head = nrrd.read(label_path) + + img[l_clip != 0] = np.random.choice([38, 39, 40]) + + img[(img < 0) | (img > 255)] = np.random.choice([38, 39, 40]) + + fixed_name = "fixed_avg.nrrd" + logging.info(str(os.path.dirname(img_path) + "/" + fixed_name)) + logging.info(np.amax(img), np.amin(img)) + + nrrd.write(str(os.path.dirname(img_path) + "/" + fixed_name), img) + + +if __name__ == '__main__': + main() diff --git a/lama/utilities/matrix_checker.py b/lama/utilities/matrix_checker.py new file mode 100644 index 00000000..e767c1f8 --- /dev/null +++ b/lama/utilities/matrix_checker.py @@ -0,0 +1,90 @@ + +import numpy as np + + +import pandas as pd +from lama.stats.heatmap import clustermap +import matplotlib.pyplot as plt +import seaborn as sns +from scipy.stats import zscore +def main(): + + #url = "https://raw.githubusercontent.com/dorkylever/LAMA/master/lama/tests/clustermap_data.csv" + X = pd.read_csv(url, index_col=0) + X.dropna(how='all', inplace=True) + X.fillna(1, inplace=True) + # replace missing values with 0 + # replace infinite values with 0 + + print("Missing values:", X.isnull().sum().sum()) + print("Infinite values:", np.isposinf(X).sum().sum()+np.isneginf(X).sum().sum()) + + + #mean_data = X.apply(np.mean, axis=1) + + #std_data = X.apply(np.std, axis=1) + + #constant_rows = X.apply(lambda row: row.nunique() == 1, axis=1) + + #X = X[~constant_rows] + + + + #na_mean = mean_data[mean_data.isna().any()] + + + + + + #na_std = std_data[std_data.iszero().any()] + + cg = sns.clustermap(X, + metric="euclidean", + cmap=sns.diverging_palette(250, 15, l=70, s=400, sep=1, n=512, center="light", + as_cmap=True), + cbar_kws={'label': 'mean volume ratio'}, square=True, + center=1, + figsize=[30, len(X) * 0.3]) + + + + + # Calculate the mean and standard deviation of each variable + #X.columns = [col.rsplit('_', 3)[-1] for col in X.columns] + + + plt.tight_layout() + + plt.savefig("E:/221122_two_way/permutation_stats/rad_perm_output/two_way/easy_fig.png") + plt.close() + + X = X.to_numpy() + + + + print(np.isnan(X).any() | np.isinf(X).any()) + # + print(X) + mu = np.mean(X, axis=0) + sigma = np.std(X, axis=0) + # + # + print(np.all(sigma)) + # + # # Calculate the z-score matrix + Z = (X - mu) / sigma + print(Z) + # + # # Calculate the Euclidean distance matrix + d = np.zeros((Z.shape[0], Z.shape[0])) + for i in range(Z.shape[0]): + for j in range(Z.shape[0]): + d[i,j] = np.sqrt(np.sum((Z[i,:] - Z[j,:])**2)) + # + print(d) + print(np.isnan(d).any() | np.isinf(d).any()) + + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/utilities/nrrd_to_h5.py b/lama/utilities/nrrd_to_h5.py new file mode 100644 index 00000000..f105fd7c --- /dev/null +++ b/lama/utilities/nrrd_to_h5.py @@ -0,0 +1,78 @@ +# assumes CT image nrrds with minimum value of -1024 +# folder with mask nrrd files +# folder with mask nrrd 
files +# assumes CT mask nrrds with 0's and 1's +# assumes the shape of each patient data (iamge and mask) are different - therefore, +# this will pad all images and masks to the size of the largest +# does not preform any interpolation to isotrpic voxels or any normalization +# only saves the image and mask, therefore the metadata and pixel spacing is lost + +import nrrd # pip install pynrrd # probably better performance with sitk +import numpy as np +import h5py +from lama import common +from pathlib import Path +import random +import pandas as pd +import os + +# input +# both folder should have the same number of files in the same order.. obviously.. +# folder with image nrrd files + +_dir = Path('Z:/jcsmr/ROLab/Experimental data/Radiomics/Workflow design and trial results/Kyle Drover analysis/220827_pytorch-contouring/') +image_nrrd_folder = Path('Z:/jcsmr/ROLab/Experimental data/Radiomics/Workflow design and trial results/Kyle Drover analysis/220827_pytorch-contouring/imgs') + +# folder with mask nrrd files +mask_nrrd_folder = Path('Z:/jcsmr/ROLab/Experimental data/Radiomics/Workflow design and trial results/Kyle Drover analysis/220827_pytorch-contouring/tumour_respaced') +# output +output = 'output' +# dataset name +dataset = "someName" + + +n_train = 50 +n_validate = 12 + +# substitute value if larger +def addLargest(value,variable): + if value>variable: + variable=value + return variable + +# replace globs with common.getImages. + +images = common.get_file_paths(image_nrrd_folder) +masks = common.get_file_paths(mask_nrrd_folder) + + +img_dirs = [x for x in images] +mask_dirs = [x for x in masks] + +# sanity +assert(len(images)==len(masks)) + +dirs = pd.DataFrame({'imgs': img_dirs, 'labels': mask_dirs}) + +# shuffle rows in dataframe and reset index + + +dirs = dirs.sample(frac=1).reset_index(drop=True) + + +for i in range(n_train+n_validate): + spec_dir = dirs.iloc[i] + + image_nrrd, img_h = nrrd.read(spec_dir['imgs']) + + mask_nrrd, mask_h = nrrd.read(spec_dir['labels']) + + if i < n_train: + file5 = str(_dir) + '/train/' + f"{os.path.splitext(spec_dir['imgs'].name)[0]}.h5" + elif i < (n_train + n_validate): + file5 = str(_dir) + '/validate/' + f"{os.path.splitext(spec_dir['imgs'].name)[0]}.h5" + else: + break + with h5py.File(file5, 'w') as f5: + f5.create_dataset("raw", dtype=np.float32, data=image_nrrd) + f5.create_dataset("label", dtype=np.uint8, data=mask_nrrd) diff --git a/lama/utilities/plot_radiomic_features.py b/lama/utilities/plot_radiomic_features.py new file mode 100644 index 00000000..7af6013a --- /dev/null +++ b/lama/utilities/plot_radiomic_features.py @@ -0,0 +1,182 @@ +from logzero import logger as logging +from lama import common +import os +import nrrd +from pathlib import Path + +import numpy as np +import SimpleITK as sitk + +import matplotlib.pyplot as plt + +import pandas as pd +import seaborn as sns +from matplotlib.backends.backend_pdf import PdfPages + +#pdf = PdfPages("E:/220617_BQ_norm_body_full/feature_comparision.pdf") +#pdf = PdfPages("E:/Bl6_data/220524_test_radiomics/feature_comparision.pdf") + +def multiple_plot(data): + plt.figure() + plt.clf() + + logging.info(data.columns[4]) + + plt.xlabel('') + #plt.plot(data) + #data.groupby(['original_shape_VoxelVolume']).apply(print) + + #data.groupby(['Norm_Type']).plot.scatter( + # x='original_shape_VoxelVolume', + # y=2, + # hue='Norm_Type', + # #subplots=True, + # by=['Norm_Type'], + # #layout=(1, 4), + # sharex=False, + # rot=90, + # fontsize=5 + #) + + + 
#sns.scatterplot(x='original_shape_VoxelVolume',y=str(data.columns[4]), + # hue='Genotype', + # style='Norm_Type', + # data=data) + + fig, ax = plt.subplots(figsize=(8, 6)) + + + + data.groupby(['Tumour_Model']).boxplot( + by=['Exp', 'Age', 'Tumour_Model'], + #by=['Genotype','Background'], + layout=(1, 4), + sharex=True, + rot=45, + fontsize=5 + ) + + plt.suptitle(str(data.columns[4]),fontsize='medium') + + plt.xlabel('') + + + #plt.show() + #pdf.savefig() + plt.close() + + +def main(): + all_features = pd.read_csv("E:/220719_stage_norm_with_filts/sub_normed_features.csv", index_col=False) + + print(all_features) + + #all_features = pd.read_csv("E:/220508_BQ_norm/all_features.csv") + + #all_features = pd.read_csv("E:/220204_BQ_Dataset/220530_BQ_norm/histo_normed_features.csv") + + #all_features = pd.read_csv("E:/Bl6_data/220524_test_radiomics/lat_vent_features.csv") + + #all_features = all_features.pivot(index="Norm_Type", columns='scanID') + + + #data = all_features.iloc[:, 32:np.shape(all_features)[1]] + + + + data = all_features.iloc[:, 32:np.shape(all_features)[1]] + + print("data", data) + + logging.info("Plotting Results") + + #fig, ax = plt.subplots(len(all_features.columns), 1) + + + + for i, col in enumerate(data): + + plot_data = pd.concat([#all_features['Norm_Type'], + all_features['scanID'], + all_features['Exp'], + all_features['Tumour_Model'], + all_features['Age'], + data.iloc[:, i]], axis=1) + #if data.iloc[:, i].name != 'original_shape_VoxelVolume': + #multiple_plot(plot_data) + + #plot_data = pd.concat([all_features['Norm_Type'], + #all_features['original_shape_VoxelVolume'], + # all_features['Background'], + # all_features['Genotype'], + # data.iloc[:, i]], axis=1) + #if data.iloc[:, i].name != 'original_shape_VoxelVolume': + #multiple_plot(plot_data) + + # all_features.plot(x='Norm_type', + # kind='box', + # subplots=True, + # layout=(len(all_features.columns), 1), + # legend=True, + # figsize=(29.7, 21.1)) + # plt.legend(loc='best') + + + plt.close() + #pdf.close() + + + + data = data[(all_features['Exp'] == 'MPTLVo7') & + (all_features['Age'] == 'D14')] + + print(data) + + + + + + + + #data = data.set_index(all_features[all_features['Norm_Type'] == 'Subtraction']['Embryo']) + data = data.set_index(all_features[(all_features['Exp'] == 'MPTLVo7') & + (all_features['Age'] == 'D14')]['Tumour_Model']) + #data = data.set_index(all_features['Tumour_Model']) + print(data.index) + print(data) + + + + #data.columns = data.columns.str.replace("original_",'') + data = data.transpose() + + data = data.apply(lambda x: (x-x.mean())/x.std(), axis=1) + + data = data.apply(lambda x: (x - x.mean()) / x.std(), axis=0) + + + #Drop na-cols + data.dropna(axis='rows', inplace=True) + + + + fig, ax = plt.subplots(figsize=[56, 60]) + print(data) + sns.clustermap(data, + figsize=[21, 21], + dendrogram_ratio=0.1, + metric="correlation", + #cmap=sns.diverging_palette(250, 15, l=70, s=400, sep=40, n=512, center="light", as_cmap=True), + #cbar_kws={'Genotype': 'Background'}, + square=True, + xticklabels=True, + yticklabels=True) + plt.tight_layout() + + plt.savefig("E:/220719_stage_norm_with_filts/radiomics_clustermap.png") + plt.close() + + +if __name__ == '__main__': + main() diff --git a/lama/utilities/prep_for_man_valid.py b/lama/utilities/prep_for_man_valid.py new file mode 100644 index 00000000..9203a4ab --- /dev/null +++ b/lama/utilities/prep_for_man_valid.py @@ -0,0 +1,108 @@ +"""Just roughly normalises intensities of volumes and straightens each embryo to perform some sort of + manual 
phenotyping""" +from lama.img_processing import normalise +from logzero import logger as logging +from lama import common +import os +import nrrd +from pathlib import Path +from scipy import ndimage +import numpy as np +import SimpleITK as sitk + + +def get_images(dir, s): + img_list = [] + spec_name_list = [] + int_paths = common.get_file_paths(dir) + + # enumerating for speed only + for i, img_path in enumerate(int_paths): + img, img_h = nrrd.read(img_path) + # only get heatmap vals inside of the mask + padding + img = img[s[0].start:s[0].stop, + s[1].start:s[1].stop, + s[2].start:s[2].stop] + spec_name_list.append(os.path.splitext(img_path.name)[0]) + + img_list.append(img) + return img_list, spec_name_list + + +def resample(image, transform): + """ + This function resamples (updates) an image using a specified transform + :param image: The sitk image we are trying to transform + :param transform: An sitk transform (ex. resizing, rotation, etc. + :return: The transformed sitk image + """ + reference_image = image + interpolator = sitk.sitkBSpline + default_value = 0 + return sitk.Resample(image, reference_image, transform, + interpolator, default_value) + + +def rotate(vols, x, y, z): + # TODO: fix so it doesn't clip + logging.info(f"rotating vol with manual rotation {x, y, z}") + rotated = [] + for vol in vols: + print(np.shape(vol)) + fixed = sitk.GetImageFromArray(vol.astype(np.uint8), isVector=False) + rigid_euler = sitk.Euler3DTransform() + rigid_euler.SetRotation(x, y, z) + rigid_euler.SetTranslation((0, 0, 0)) + # set center as midpoints + rigid_euler.SetCenter([coord // 2 for coord in np.shape(vol)[::-1]]) + # rigid_euler.TransformPoint([point for point in grid for grid in img]) + mov = resample(fixed, rigid_euler) + rotated.append(sitk.GetArrayFromImage(mov).astype(np.uint8)) + + return rotated + + +def main(): + wt_dir = Path( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210521_vis_anal/wt") + mut_dir = Path( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210521_vis_anal/non_wt") + + mask, mask_h = nrrd.read( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210423_g_by_e_stand_out/210415_g_by_e_anal/target/stats_mask.nrrd") + + pop_avg, pop_h = nrrd.read( + "Z:/ArkellLab/Lab Members/Kyle/PhD/vmshare/Zic2_Kumba_LAMA/210423_g_by_e_stand_out/210415_g_by_e_anal/target/210224_pop_avg_deformable_8.nrrd") + + s = ndimage.find_objects(mask)[0] + + # get the images + wt_imgs, wt_names = get_images(wt_dir, s) + + mut_imgs, mut_names = get_images(mut_dir, s) + + int_norm = normalise.IntensityMaskNormalise() + + # normalise the images + int_norm.add_reference(wt_imgs) + + int_norm.normalise(mut_imgs) + + int_norm.normalise(wt_imgs) + + # manually orient the embryos + wt_imgs = rotate(wt_imgs, -0.0635, 0.02, 0.01) + + mut_imgs = rotate(mut_imgs, -0.0635, 0.02, 0.01) + + # write files + logging.info('writing files') + for i, vol in enumerate(mut_imgs): + nrrd.write(mut_names[i] + ".nrrd", vol) + + for i, vol in enumerate(wt_imgs): + nrrd.write(wt_names[i] + ".nrrd", vol) + + +if __name__ == '__main__': + main() diff --git a/lama/utilities/quick_n4_norm.py b/lama/utilities/quick_n4_norm.py new file mode 100644 index 00000000..a18d6026 --- /dev/null +++ b/lama/utilities/quick_n4_norm.py @@ -0,0 +1,89 @@ + + +import logging +from pathlib import Path + +import numpy as np +from lama import common +import SimpleITK as sitk +import os + + +def main(): + import argparse + parser = argparse.ArgumentParser("Run various intensity normalisation methods") + 
parser.add_argument('-i', dest='indir', help='directory with vols', + required=True) + #parser.add_argument('-l', dest='levels', type=int, help='number of bins within the histogram') + #parser.add_argument('-m', dest='match_num', type=int, help='number of point to match within the histogram') + + parser.add_argument('-o', dest='out_dir', help='output folder for normalised images') + + #parser.add_argument('-r', dest='ref_vol', help='Path of Reference Volume To Use') + + + + args = parser.parse_args() + + logging.info("Intensity Normalisation by N4") + + print(Path(args.indir)) + + _files = common.get_file_paths(Path(args.indir)) + + _files.sort(key=lambda x: os.path.basename(x)) + + # should be labelling properly now + vols = [common.LoadImage(_path).img for _path in _files] + + names = [os.path.splitext(vol_path.name)[0] for vol_path in _files] + + + Otsu = sitk.OtsuThresholdImageFilter() + dilate = sitk.BinaryDilateImageFilter() + downsampler = sitk.ShrinkImageFilter() + N4 = sitk.N4BiasFieldCorrectionImageFilter() + + #if args.ref_vol: + # ref_vol = common.LoadImage(Path(args.ref_vol)).img + #else: + # ref_vol = vols[0] + + for i, vol in enumerate(vols): + print(i) + logging.info("Getting mask using via the otsu algorithm") + inv_mask = Otsu.Execute(vol) + o_mask = sitk.InvertIntensity(inv_mask, 1) + + o_mask = sitk.ConnectedComponent(o_mask != o_mask[0, 0, 0]) + + # sitk.WriteImage(seg, os.path.join(output, name + "_all_connected.nrrd")) + o_mask = sitk.RelabelComponent(o_mask) + o_mask = o_mask == 1 + # sitk.WriteImage(seg, os.path.join(output, name + "_largest_connected.nrrd")) + + # lets see if dilate with a tight kernal fixes getting stupid dots everywhere. + + dilate.SetKernelRadius([1, 1, 1]) + dilate.SetKernelType(sitk.sitkBall) + o_mask = dilate.Execute(o_mask) + o_mask.CopyInformation(vol) + + logging.info('Using N4 bias correction') + + # downsample images + + down_sampled_img = downsampler.Execute(vol) + + down_sampled_mask = downsampler.Execute(o_mask) + + N4_vol = N4.Execute(down_sampled_img, down_sampled_mask) + log_bias_field = N4.GetLogBiasFieldAsImage(vol) + vols[i] = vol / sitk.Exp(log_bias_field) + logging.info(f"Writing Normalised File for {names[i]}") + + file_name = names[i]+".nrrd" + sitk.WriteImage(vols[i], str(Path(args.out_dir)/ file_name)) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/utilities/quick_normalise.py b/lama/utilities/quick_normalise.py new file mode 100644 index 00000000..6ab19f32 --- /dev/null +++ b/lama/utilities/quick_normalise.py @@ -0,0 +1,57 @@ + + +import logging +from pathlib import Path + +import numpy as np +from lama import common +import SimpleITK as sitk +import os + + +def main(): + import argparse + parser = argparse.ArgumentParser("Run various intensity normalisation methods") + parser.add_argument('-i', dest='indir', help='directory with vols', + required=True) + parser.add_argument('-l', dest='levels', type=int, help='number of bins within the histogram') + parser.add_argument('-m', dest='match_num', type=int, help='number of point to match within the histogram') + + parser.add_argument('-o', dest='out_dir', help='output folder for normalised images') + + parser.add_argument('-r', dest='ref_vol', help='Path of Reference Volume To Use') + + + + args = parser.parse_args() + + logging.info("Intensity Normalisation by Histogram bin Matching") + + _dir = Path(args.indir) + + + + vols = [common.LoadImage(vol).img for vol in common.get_file_paths(_dir)] + names = 
[os.path.splitext(vol_path.name)[0] for vol_path in common.get_file_paths(_dir)] + + matcher = sitk.HistogramMatchingImageFilter() + matcher.SetThresholdAtMeanIntensity(True) + + matcher.SetNumberOfHistogramLevels(args.levels) + matcher.SetNumberOfMatchPoints(args.match_num) + + if args.ref_vol: + ref_vol = common.LoadImage(Path(args.ref_vol)).img + else: + ref_vol = vols[0] + + for i, img in enumerate(vols): + + logging.info(f"Normalising {names[i]}") + vols[i] = matcher.Execute(img, ref_vol) + logging.info(f"Writing Normalised File for {names[i]}") + file_name = names[i]+".nrrd" + sitk.WriteImage(vols[i], str(Path(args.out_dir)/ file_name)) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/utilities/quick_rotate.py b/lama/utilities/quick_rotate.py new file mode 100644 index 00000000..eba00ef0 --- /dev/null +++ b/lama/utilities/quick_rotate.py @@ -0,0 +1,30 @@ +import numpy as np +from lama.utilities.prep_for_man_valid import resample, rotate +from lama import common +import nrrd +import SimpleITK as sitk +from pathlib import Path +import os +from logzero import logger as logging + +def main(): + # NOTE DO NOT USE AS YOU WILL CLIP SHIT + img_path = Path("E:/try_emap_to_SD/test_pad/220909_sd_rev_reg/target/TS20_EMA76_reference.nrrd") + img = common.LoadImage(img_path).img + img_list = [] + + + #padded_img = sitk.ConstantPad(img, pad_amount, pad_amount,np.min(sitk.GetArrayFromImage(img)).astype(float)) + #img_list.append(sitk.GetArrayFromImage(padded_img)) + + img_list.append(sitk.GetArrayFromImage(img)) + + rot_name = "rot2_TS20_EMA76_reference.nrrd" + logging.info(str(os.path.dirname(img_path)+"/"+rot_name)) + rot_avg = rotate(img_list,0, -0.05, 0) + #rot_avg_v2 = rotate(rot_avg, 0, 0, np.deg2rad(-180)) + + nrrd.write(str(os.path.dirname(img_path)+"/"+rot_name), rot_avg[0]) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/utilities/radiomics_normaliser.py b/lama/utilities/radiomics_normaliser.py new file mode 100644 index 00000000..f4e6f88d --- /dev/null +++ b/lama/utilities/radiomics_normaliser.py @@ -0,0 +1,273 @@ +"""Normalises the radiomics scans by the average intensity of a mask""" +from typing import Union + + + +from lama.img_processing import normalise + +from logzero import logger as logging +from lama import common +import os +import nrrd +from pathlib import Path + +import numpy as np +import SimpleITK as sitk + +from radiomics import featureextractor, imageoperations +from scipy import ndimage +import pandas as pd + +import raster_geometry as rg + +# each scan in Ben's dataset will need its own mask +def get_images_from_masks(dir): + img_list = [] + spec_name_list = [] + mask_list = [] + scan_paths = [spec_path for spec_path in common.get_file_paths(dir) if ('imgs' in str(spec_path))] + mask_paths = [mask_path for mask_path in common.get_file_paths(dir) if ('labels' in str(mask_path))] + + scan_paths.sort() + mask_paths.sort() + + # enumerate for indexing masks + for i, img_path in enumerate(scan_paths): + loader = common.LoadImage(img_path) + img = loader.img + # get_arr + # cant use sitk.readimage due to header? 
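+        # common.LoadImage (above) returns a SimpleITK image, which is what
+        # sitk.GetArrayFromImage expects further down; the pynrrd reader kept commented
+        # out below is an alternative for scans whose headers SimpleITK reportedly struggles with.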
+ # img, img_head = nrrd.read(img_path) + # img = sitk.GetImageFromArray(img) + + m_loader = common.LoadImage(mask_paths[i]) + mask = m_loader.img + + # Only get values inside of the mask + logging.info(f"Obtaining values from {img_path}") + + s = ndimage.find_objects(mask)[0] + + #get the arrays + img_a = sitk.GetArrayFromImage(img) + + mask_a = sitk.GetArrayFromImage(mask) + + # + img_a[(mask_a != 1) | (img_a < 0)] = 0 + + # img_pro = sitk.GetImageFromArray(img_a) + # mask_pro = sitk.GetImageFromArray(mask_a) + # img_pro.CopyInformation(img) + # mask_pro.CopyInformation(mask) + + spec_name_list.append(os.path.splitext(img_path.name)[0]) + # print(spec_name_list) + img_list.append(img) + mask_list.append(mask) + return img_list, spec_name_list, mask_list + +def spherify(dir): + + scan_paths = [spec_path for spec_path in common.get_file_paths(dir) if ('imgs' in str(spec_path))] + tumour_paths =[spec_path for spec_path in common.get_file_paths(dir) if ('tumour_respaced' in str(spec_path))] + + + # debugging - Thanks Neil + scan_paths.sort() + tumour_paths.sort() + + img_list = [] + spec_name_list = [] + mask_list = [] + + for i, img_path in enumerate(scan_paths): + + # logging.info(f"Calculating for {os.path.splitext(os.path.basename(img_path))[0]}") + + logging.info(img_path) + logging.info(tumour_paths[i]) + loader = common.LoadImage(img_path) + img = loader.img + + m_loader = common.LoadImage(tumour_paths[i]) + mask = m_loader.img + + m_array = sitk.GetArrayFromImage(mask) + + s = ndimage.find_objects(m_array)[-1] + + + + midpoint = [np.round(np.mean([s[0].start, s[0].stop]))/512, + np.round((np.mean([s[1].start, s[1].stop]))) / 512, + np.round(482-(np.mean([s[2].start, s[2].stop]))) / 512] + #print("Original Midpoint", [i*512 for i in midpoint]) + + #print("Modified midpoint", midpoint) + + arr = rg.sphere(512, 10, midpoint, smoothing=True).astype(np.int_) + + + ball = sitk.GetImageFromArray(arr) + + ball.CopyInformation(mask) + + sphere = "sphere" + sitk.WriteImage(ball, + str(Path(os.path.dirname(img_path)).parent.absolute() / sphere/ os.path.basename(img_path))) + + spec_name_list.append(os.path.splitext(img_path.name)[0]) + + # print(spec_name_list) + #img_list.append(img) + #mask_list.append(ball) + + return img_list, mask_list, spec_name_list + +def pyr_calc_all_features(dir, normed: bool = False, images: list = None, file_names: list = None, spheres: list = None): + # get either the normalised or original images + scan_paths = images if normed else [spec_path for spec_path in common.get_file_paths(dir) if ('imgs' in str(spec_path))] + + tumour_paths = spheres if spheres\ + else [spec_path for spec_path in common.get_file_paths(dir) if ('tumour_respaced' in str(spec_path))] + + # debugging - Thanks Neil + if not normed: + scan_paths.sort() + if not spheres: + tumour_paths.sort() + + # Get the first order measurements + full_orders = [] + + for i, img_path in enumerate(scan_paths): + + #logging.info(f"Calculating for {os.path.splitext(os.path.basename(img_path))[0]}") + if normed: #files exist + img = img_path + + else: + logging.info(img_path) + logging.info(tumour_paths[i]) + loader = common.LoadImage(img_path) + img = loader.img + if spheres: + mask = tumour_paths[i] + else: + m_loader = common.LoadImage(tumour_paths[i]) + mask = m_loader.img + + # apply pyradiomic filters + #print(globals().items) + #img_filt_ops = [x for x, y in globals().items if (x.startswith('pyradiomics.imageoperations.get') and x.endswith('Image'))] + + #print(img_filt_ops) + + # get all features and 
append to list + extractor = featureextractor.RadiomicsFeatureExtractor(normalize=True) + + extractor.enableAllImageTypes() + extractor + extractor.enableAllFeatures() + result = extractor.execute(img, mask) + + if file_names is not None: + first_orders = pd.DataFrame.from_dict(result, orient='index', + columns=[os.path.splitext(os.path.basename(file_names[i]))[0]]) + else: + first_orders= pd.DataFrame.from_dict(result, orient='index', + columns=[os.path.splitext(os.path.basename(img_path))[0]]) + full_orders.append(first_orders) + + # fixing data format + features = pd.concat(full_orders, axis=1).transpose() + + _metadata = features.index.str.split('_', expand=True).to_frame(index=False, + name=['Date', 'Exp', 'Contour_Method', + 'Tumour_Model', 'Position', 'Age', + 'Cage_No.', 'Animal_No.']) + _metadata.reset_index(inplace=True, drop=True) + features.reset_index(inplace=True, drop=True) + features = pd.concat([_metadata, features], axis=1) + + features.index.rename('scanID', inplace=True) + + return features + + +def pyr_normaliser(_dir, _normaliser, scans_imgs, masks, fold: bool = False): + # create a copy so orginal files aren't overwritten + scans_imgs = scans_imgs.copy() + + # Do the normalisation + if isinstance(_normaliser, normalise.NonRegMaskNormalise): + _normaliser.add_reference(scans_imgs[0], masks[0]) + _normaliser.normalise(scans_imgs, masks, fold=fold, temp_dir=_dir) + elif isinstance(_normaliser, normalise.IntensityHistogramMatch): + print(type(scans_imgs[0])) + _normaliser.normalise(scans_imgs, scans_imgs[0]) + + return scans_imgs + + +def main(): + #import argparse + #parser = argparse.ArgumentParser("Run various intensity normalisation methods") + #parser.add_argument('-i', dest='indirs', help='dir with vols, tumour masks and label masks', + # required=True) + _dir = Path("E:/220204_BQ_dataset/220521_BQ_norm") + #args = parser.parse_args() + logging.info("Create Spheres from midpoint of tumour") + + images, spheres, scan_names = spherify(_dir) + + + + logging.info("Calculating Original Features") + #_dir = Path(args.indirs) + + #orig_features = pyr_calc_all_features(_dir, spheres=spheres) + #orig_features.to_csv(str(_dir / "orig_features.csv")) + + # get the images and masks + logging.info("Getting values from inside the stage") + scans_imgs, scan_names, masks = get_images_from_masks(_dir) + + scan_names.sort() + logging.info("Normalising to mean of the stage (subtraction)") + #sub_int_normed = pyr_normaliser(_dir, normalise.NonRegMaskNormalise(), scans_imgs, masks) + + + + #for i, vol in enumerate(sub_int_normed): + # file_name = scan_names[i] + '.nrrd' + # sitk.WriteImage(vol, str(_dir / file_name)) + #logging.info("Recalculating Features") + #sub_normed_features = pyr_calc_all_features(_dir, normed=True, images=sub_int_normed, file_names=scan_names, spheres=spheres) + #sub_normed_features.to_csv(str(_dir / "sub_normed_features.csv")) + + #logging.info("Normalising to mean of the stage (fold)") + #fold_int_normed = pyr_normaliser(_dir, normalise.NonRegMaskNormalise(), scans_imgs, masks, fold=True) + logging.info("Recalculating Features") + #fold_normed_features = pyr_calc_all_features(_dir, normed=True, images=fold_int_normed, file_names=scan_names, spheres=spheres) + #fold_normed_features.to_csv(str(_dir / "fold_normed_features.csv")) + + logging.info("Maskless Histogram Intensity Matching") + histo_normed = pyr_normaliser(_dir, normalise.IntensityHistogramMatch(), scans_imgs, masks) + logging.info("Recalculating Features") + histo_normed_features = 
pyr_calc_all_features(_dir, normed=True, images=histo_normed, file_names=scan_names, spheres=spheres) + histo_normed_features.to_csv(str(_dir / "fold_normed_features.csv")) + + #all_features = pd.concat([orig_features, sub_normed_features, fold_normed_features, histo_normed_features], + # keys=["Raw", "Subtraction", "Fold", "Histogram"]) + + #all_features.index.rename('Norm_Type', inplace=True) + + #all_features.to_csv(str(_dir / "all_features.csv")) + + logging.info("DONE") + + +if __name__ == '__main__': + main() diff --git a/lama/utilities/randomise_for_blinding.py b/lama/utilities/randomise_for_blinding.py new file mode 100644 index 00000000..2bf38674 --- /dev/null +++ b/lama/utilities/randomise_for_blinding.py @@ -0,0 +1,36 @@ +import nrrd +import random +import SimpleITK as sitk +from lama import common +import os +import pandas as pd +from pathlib import Path + + +def randomise_file_list(_dir): + + o_dir = Path(_dir.parent / "output") + os.mkdir(o_dir) + + file_list = [file_name for file_name in common.get_file_paths(_dir)] + print(file_list) + # randomise list + random.shuffle(file_list) + + file_df = pd.DataFrame({"name": file_list, "num": range(len(file_list))}) + + print(file_df) + + file_df.to_csv(o_dir/"results.csv") + for i, file_name in enumerate(file_list): + i_name = o_dir / (str(i) + ".nrrd") + os.rename(file_name,i_name) + + + +def main(): + _dir = Path("//anufiles.anu.edu.au/anu/jcsmr/ArkellLab/Lab Members/Amrit/2022_BL6_cohort/BL6_full_no_genotype") + randomise_file_list(_dir) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/lama/utilities/zoomer.py b/lama/utilities/zoomer.py new file mode 100644 index 00000000..3b508983 --- /dev/null +++ b/lama/utilities/zoomer.py @@ -0,0 +1,26 @@ +import SimpleITK as sitk +import os +from pathlib import Path +from lama import common +import nrrd +import numpy as np +from scipy import ndimage + +target_dir = Path("E:/try_emap_to_SD/zoom_z_axis") + +volpaths = common.get_file_paths(target_dir) + +scaled = "scaled" +print(target_dir) +print('zooming') + +for path in volpaths: + vol, v_head = nrrd.read(path) + print(path) + loader = common.LoadImage(path) + img = loader.img + + zoomed = ndimage.zoom(vol, zoom=[1,1,1.6], mode='nearest', order=0) + # + + nrrd.write(str(target_dir) + "/scaled_" + str(os.path.basename(path)), zoomed, header=v_head) diff --git a/lama/version.py b/lama/version.py new file mode 100644 index 00000000..73884ff0 --- /dev/null +++ b/lama/version.py @@ -0,0 +1,12 @@ +__version__ = '1.0.1' # Dev version +#__version__ = '1.0.0' # start of dorkylever-version +#__version__ = '0.9.100' # Perm stats config key checker +# __version__ = '0.9.94' # Perm stats bug with short_name column where there are empty values +# __version__ = '0.9.93' # Minor chnage on perm stats config. Output dir now has deafult to cfg dir +#__version__ = '0.9.92' # git log problem fixed +# __version__ = '0.9.91' # Fixed organ vol plots so not dependent on how paths are entered in config +# __version__ = '0.9.90' # Permutation stats now create null distributon per label. 
As QC can lead to difference data structure per label +#__version__ = '0.9.83' # Fixed a a couple of minor stats bugs +#__version__ = '0.9.82' +#__version__ = '0.9.81' # Fixed stats InvertHeatemap bug +#__version__ = '0.9.80' diff --git a/meld b/meld new file mode 100644 index 00000000..303cdad4 --- /dev/null +++ b/meld @@ -0,0 +1,33 @@ +diff --git a/lama/elastix/invert_transforms.py b/lama/elastix/invert_transforms.py +index d20785b..b8ee004 100644 +--- a/lama/elastix/invert_transforms.py ++++ b/lama/elastix/invert_transforms.py +@@ -15,7 +15,7 @@ from lama.common import cfg_load + from lama.registration_pipeline.validate_config import LamaConfig + + from lama.elastix import (ELX_TRANSFORM_NAME, ELX_PARAM_PREFIX, LABEL_INVERTED_TRANFORM, +- IMAGE_INVERTED_TRANSFORM, PROPAGATE_CONFIG, RESOLUTION_IMGS_DIR, IMG_PYRAMID_DIR) ++ IMAGE_INVERTED_TRANSFORM, INVERT_CONFIG, RESOLUTION_IMGS_DIR, IMG_PYRAMID_DIR) + + LABEL_REPLACEMENTS = { + 'FinalBSplineInterpolationOrder': '0', +@@ -97,8 +97,8 @@ def batch_invert_transform_parameters(config: Union[str, LamaConfig], + inv_stage_dir.mkdir(exist_ok=True) + + # Add the stage to the inversion order config (in reverse order), if not already. +- if reg_stage_dir.name not in stages_to_invert['label_propagation_order']: +- stages_to_invert['label_propagation_order'].insert(0, reg_stage_dir.name) ++ if reg_stage_dir.name not in stages_to_invert['inversion_order']: ++ stages_to_invert['inversion_order'].insert(0, reg_stage_dir.name) + + if clobber: + common.mkdir_force(specimen_stage_inversion_dir) # Overwrite any inversion file that exist for a single specimen +@@ -142,7 +142,7 @@ def batch_invert_transform_parameters(config: Union[str, LamaConfig], + reg_dir = Path(os.path.relpath(reg_stage_dir, inv_outdir)) + stages_to_invert['registration_directory'] = str(reg_dir) # Doc why we need this + # Create a yaml config file so that inversions can be run seperatley +- invert_config = config['inverted_transforms'] / PROPAGATE_CONFIG ++ invert_config = config['inverted_transforms'] / INVERT_CONFIG + + with open(invert_config, 'w') as yf: + yf.write(yaml.dump(dict(stages_to_invert), default_flow_style=False)) diff --git a/setup.py b/setup.py index c644f674..6188d4ac 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,17 @@ # coding: utf-8 from setuptools import setup, find_packages +from pathlib import Path + +# Get __verison_dunder without importing lama +version_file = Path(__file__).resolve().parent / 'lama' / 'version.py' +exec(open(version_file).read()) + setup( - name='lama_phenotype_detection', - download_url='https://github.com/mpi2/lama/archive/0.9.4.tar.gz', - version='0.9.60', + name='dorkylever_lama_phenotype_detection', + download_url=f'https://github.com/dorkylever/LAMA/archive/refs/tags/1.0.1.tar.gz', + version="1.0.1", packages=find_packages(exclude=("dev")), package_data={'': ['current_commit', 'stats/rscripts/lmFast.R', @@ -13,26 +19,44 @@ include_package_data=True, install_requires=[ 'appdirs', + 'setuptools==59.8.0', 'matplotlib>=2.2.0', - 'numpy>=1.15.0', + 'numpy==1.21.5', 'pandas>=1.1.0', - 'scikit-learn>=0.19.2', + 'scikit-learn==1.0.2', 'scipy>=1.1.0', 'scikit-image==0.17.2', 'seaborn>=0.9.0', 'statsmodels>=0.9.0', 'PyYAML>=3.13', - 'SimpleITK>=1.1.0', + 'catboost==1.1.0', + 'SimpleITK>=2.1.0', + 'pyradiomics>=3.0.1', + 'threadpoolctl==3.1.0', + 'imbalanced-learn==0.9.0', + 'raster-geometry', 'filelock', - 'psutil', - 'logzero', + 'psutil==5.9.3', + 'plotly', + 'logzero==1.7.0', 'addict', 'toml', 'pynrrd', - 'pytest' + 'pytest', + 'tqdm', + 
'gitpython', + 'pacmap', + 'shap', + 'joblib', + 'wheel', + 'torch', + 'numexpr', + 'bottleneck', + 'cuda-python==11.8.1', + 'typing_extensions>=4.0.0' ], extras_require={ - 'dev': ['pyradiomics'], + 'dev': ['h5py'], }, url='https://github.com/mpi2/LAMA', license='Apache2', @@ -55,7 +79,11 @@ 'lama_stats=lama.scripts.lama_stats:main', 'lama_pad_volumes=lama.utilities.lama_pad_volumes:main', 'lama_convert_16_to_8=lama.utilities.lama_convert_16_to_8:main', - 'lama_img_info=lama.utilities.lama_img_info:main' + 'lama_img_info=lama.utilities.lama_img_info:main', + 'lama_ark_imp_pro=lama.scripts.lama_ark_img_pro:main', + 'lama_radiomics_runner=lama.scripts.lama_radiomics_runner:main', + 'lama_two_way_plotter=lama.scripts.two_way_plotter:main', + 'lama_machine_learning=lama.scripts.lama_machine_learning:main' ] }, ) diff --git a/tatus b/tatus new file mode 100644 index 00000000..852951cb --- /dev/null +++ b/tatus @@ -0,0 +1,23 @@ + dec2020_output_structure + dec20_lm + dev + folding + for_permutation + gridfix + mask_stage +* master + name_outputs_by_stage + norm_for_standard + oct20_oris + orientations + perm + permtest + pr12 + qc + rev_reg + stage_targets + staging_effect + staging_effect_April_20 + testaug + testmodel + wtincidence
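For reference, make_otsu.py and quick_n4_norm.py share the same masking idiom: Otsu threshold, invert so the specimen becomes foreground, keep the largest connected component, then dilate. The following is a minimal standalone sketch of that idiom; the file paths and the kernel radius are placeholders, not values taken from the scripts above.

from pathlib import Path

import SimpleITK as sitk

# Placeholder paths - substitute a real volume and output location.
in_path = Path("input_volume.nrrd")
out_path = Path("input_volume_mask.nrrd")

img = sitk.ReadImage(str(in_path))

# Otsu labels the below-threshold (dark background) voxels 1 by default,
# so invert to make the bright specimen the foreground.
inv_mask = sitk.OtsuThreshold(img)
mask = sitk.InvertIntensity(inv_mask, 1)

# Keep only the largest connected component (label 1 after size-ordered relabelling).
cc = sitk.ConnectedComponent(mask)
cc = sitk.RelabelComponent(cc)
mask = cc == 1

# A small dilation closes gaps and absorbs isolated specks; the radius is a placeholder.
dilate = sitk.BinaryDilateImageFilter()
dilate.SetKernelRadius([3, 3, 3])
dilate.SetKernelType(sitk.sitkBall)
mask = dilate.Execute(mask)

sitk.WriteImage(mask, str(out_path))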
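quick_n4_norm.py estimates the N4 bias field on a shrunken copy of each volume and then divides the full-resolution image by the exponentiated log bias field (the script appears to leave ShrinkImageFilter at its default factors, so little or no downsampling actually happens there). Below is a minimal sketch of that pattern with an explicit, placeholder shrink factor and placeholder paths.

import SimpleITK as sitk

# Placeholder input - substitute a real scan. N4 needs a floating-point image.
img = sitk.ReadImage("input_volume.nrrd", sitk.sitkFloat32)

# Rough foreground mask via Otsu: inside (dark background) -> 0, outside (bright specimen) -> 1.
mask = sitk.OtsuThreshold(img, 0, 1)

# Estimate the bias field on a shrunken copy to keep N4 cheap; we only need the fitted field.
shrink = 4  # placeholder shrink factor
small_img = sitk.Shrink(img, [shrink] * img.GetDimension())
small_mask = sitk.Shrink(mask, [shrink] * img.GetDimension())

n4 = sitk.N4BiasFieldCorrectionImageFilter()
n4.Execute(small_img, small_mask)

# Reconstruct the log bias field at full resolution and divide it out.
log_bias = n4.GetLogBiasFieldAsImage(img)
corrected = img / sitk.Cast(sitk.Exp(log_bias), img.GetPixelID())

sitk.WriteImage(corrected, "input_volume_n4.nrrd")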