diff --git a/.github/workflows/run-pipeline.yml b/.github/workflows/run-pipeline.yml
index f6c9322e..a3d4ae43 100644
--- a/.github/workflows/run-pipeline.yml
+++ b/.github/workflows/run-pipeline.yml
@@ -134,7 +134,7 @@ jobs:
       - name: Upload Training Outputs
         id: uploaded_training_outputs
         if: inputs.upload_training_outputs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v4.4.0
         with:
           name: completed_training_outputs
           path: |
@@ -148,7 +148,7 @@ jobs:
       - name: Upload Pretrained Outputs
         id: uploaded_pretrained_outputs
         if: inputs.upload_pretrained_outputs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v4.4.0
        with:
           name: completed_pretrained_outputs
           path: |
@@ -163,7 +163,7 @@ jobs:
       - name: Upload Regenie Outputs
         id: uploaded_regenie_outputs
         if: inputs.upload_regenie_outputs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v4.4.0
         with:
           name: completed_regenie_outputs
           path: |
diff --git a/deeprvat/deeprvat/associate.py b/deeprvat/deeprvat/associate.py
index e1f64fdc..c31b6cee 100644
--- a/deeprvat/deeprvat/associate.py
+++ b/deeprvat/deeprvat/associate.py
@@ -8,7 +8,6 @@
 from pathlib import Path
 from pprint import pprint
 from typing import Dict, List, Optional, Tuple
-
 import click
 import numpy as np
 import pandas as pd
@@ -25,7 +24,6 @@ from tqdm import tqdm, trange
 import zarr
 import re
-
 import deeprvat.deeprvat.models as deeprvat_models
 from deeprvat.data import DenseGTDataset
@@ -63,10 +61,8 @@ def get_burden(
     :type agg_models: Dict[str, List[nn.Module]]
     :param device: Device to perform computations on, defaults to "cpu".
     :type device: torch.device
-    :param skip_burdens: Flag to skip burden computation, defaults to False.
-    :type skip_burdens: bool
     :return: Tuple containing burden scores, target y phenotype values, x phenotypes and sample ids.
-    :rtype: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+    :rtype: Tuple[torch.Tensor, torch.Tensor]

     .. note::
         Checkpoint models all corresponding to the same repeat are averaged for that repeat.
@@ -923,6 +919,14 @@
             if bottleneck and i > 20:
                 break

+        # Calculate the max for this chunk and store it for later
+        max_df = pd.DataFrame(columns=["max"])
+        for r in range(len(agg_models)):
+            chunk_max = np.max(chunk_burden[:, :, r])  # samples x genes x repeats
+            max_df.loc[r, "max"] = chunk_max
+        print("Saving burden max scores")
+        max_df.to_csv(f"{Path(cache_dir)}/chunk{chunk}_max.csv", index=False)
+
         burdens[:] = chunk_burden[:]
         sample_ids[:] = chunk_sampleid[:]
@@ -941,6 +945,7 @@
 @click.option("--n-chunks", type=int)
 @click.option("--chunk", type=int)
 @click.option("--dataset-file", type=click.Path(exists=True))
+@click.option("--center-scale-burdens", is_flag=True)
 @click.argument("data-config-file", type=click.Path(exists=True))
 @click.argument("model-config-file", type=click.Path(exists=True))
 @click.argument("checkpoint-files", type=click.Path(exists=True), nargs=-1)
@@ -952,6 +957,7 @@ def compute_burdens(
     n_chunks: Optional[int],
     chunk: Optional[int],
     dataset_file: Optional[str],
+    center_scale_burdens: bool,
     data_config_file: str,
     model_config_file: str,
     checkpoint_files: Tuple[str],
@@ -972,6 +978,8 @@
     :type chunk: Optional[int]
     :param dataset_file: Path to the dataset file, i.e., association_dataset.pkl.
     :type dataset_file: Optional[str]
+    :param center_scale_burdens: Flag to compute the centering and scaling parameters used to normalize burden scores.
+    :type center_scale_burdens: bool
     :param data_config_file: Path to the data configuration file.
     :type data_config_file: str
     :param model_config_file: Path to the model configuration file.
@@ -1024,6 +1032,34 @@ def compute_burdens(
         bottleneck=bottleneck,
     )

+    if center_scale_burdens:
+        if chunk == 0 or chunk is None:
+            # Calculate the zero-effect score (mode): the burden each model assigns to a sample with no variants
+            anno_len = len(
+                data_config[data_key]["dataset_config"]["rare_embedding"]["config"][
+                    "annotations"
+                ]
+            )
+            empty_batch = {
+                "rare_variant_annotations": torch.zeros(1, 1, anno_len, 1),
+                "y": None,
+                "x_phenotypes": None,
+                "sample": None,
+            }
+            this_mode, _ = get_burden(
+                empty_batch,
+                agg_models,
+                device=device,
+            )
+            this_mode = this_mode.flatten()
+            center_scale_df = pd.DataFrame(columns=["mode"])
+            for r in range(len(agg_models)):
+                center_scale_df.loc[r, "mode"] = this_mode[r]
+            pprint(f"Calculated zero-effect burden score:\n {this_mode}")
+            center_scale_df.to_csv(
+                f"{Path(out_dir)}/computed_burdens_stats.csv", index=False
+            )
+
     logger.info("Saving computed burdens, corresponding genes, and targets")

     np.save(Path(out_dir) / "genes.npy", genes)
@@ -1145,7 +1181,6 @@ def regress_on_gene_scoretest(
     :rtype: Tuple[List[str], List[float], List[float]]
     """
     burdens = burdens.reshape(burdens.shape[0], -1)
-    assert np.all(burdens != 0)  # because DeepRVAT burdens are corrently all non-zero
     logger.info(f"Burdens shape: {burdens.shape}")

     if np.all(np.abs(burdens) < 1e-6):
@@ -1507,6 +1542,7 @@ def combine_regression_results(


 @cli.command()
+@click.option("--center-scale-burdens", is_flag=True)
 @click.option("--n-chunks", type=int)
 @click.option("--chunk", type=int)
 @click.option("-r", "--repeats", multiple=True, type=int)
@@ -1514,6 +1550,7 @@ def combine_regression_results(
 @click.argument("burden-file", type=click.Path(exists=True))
 @click.argument("burden-out-file", type=click.Path())
 def average_burdens(
+    center_scale_burdens: bool,
     repeats: Tuple,
     burden_file: str,
     burden_out_file: str,
@@ -1526,6 +1563,20 @@
     logger.info(f"Reading burdens to aggregate from {burden_file}")
     burdens = zarr.open(burden_file)
     n_total_samples = burdens.shape[0]
+
+    if center_scale_burdens:
+        center_scale_params_file = (
+            Path(os.path.split(burden_out_file)[0]) / "computed_burdens_stats.csv"
+        )
+        center_scale_df = pd.read_csv(center_scale_params_file)
+
+        max_dfs = pd.DataFrame()
+        max_files_path = Path(os.path.split(burden_out_file)[0]).glob("chunk*_max.csv")
+        for i, filename in enumerate(max_files_path):
+            max_dfs[f"Max_Chunk{i}"] = pd.read_csv(filename)["max"]
+        # Compute the max across all chunks
+        max_dfs["max"] = max_dfs.max(axis=1)
+
     if chunk is not None:
         if n_chunks is None:
             raise ValueError("n_chunks must be specified if chunk is not None")
@@ -1544,7 +1595,7 @@
         f"Computing result for chunk {chunk} out of {n_chunks} in range {chunk_start}, {chunk_end}"
     )

-    batch_size = 100
+    batch_size = 1000
     logger.info(f"Batch size: {batch_size}")
     n_batches = n_samples // batch_size + (n_samples % batch_size != 0)
@@ -1573,6 +1624,21 @@
             end_idx = min(start_idx + batch_size, chunk_end)
             print(start_idx, end_idx)
             this_burdens = np.take(burdens[start_idx:end_idx, :, :], repeats, axis=2)
+
+            # Double-check zarr creation - no computed burdens should equal zero
+            assert np.all(this_burdens != 0)
+
+            if center_scale_burdens:
+                print("Centering and scaling burdens before aggregating")
+                for r in range(len(repeats)):
+                    zero_effect_val = center_scale_df.loc[r, "mode"]
+                    repeat_max = max_dfs.loc[r, "max"]
+                    adjusted_max = repeat_max - zero_effect_val
+                    # Subtract off the zero-effect burden value (mode) and scale
+                    this_burdens[:, :, r] = (
+                        this_burdens[:, :, r] - zero_effect_val
+                    ) / adjusted_max
+
             this_burdens = AGG_FCT[agg_fct](this_burdens, axis=2)

             burdens_new[start_idx:end_idx, :, 0] = this_burdens
diff --git a/deeprvat/deeprvat/config.py b/deeprvat/deeprvat/config.py
index a2abfb9f..3586bf70 100644
--- a/deeprvat/deeprvat/config.py
+++ b/deeprvat/deeprvat/config.py
@@ -382,6 +382,10 @@ def create_main_config(
         "correction_method": input_config["evaluation"]["correction_method"],
         "alpha": input_config["evaluation"]["alpha"],
     }
+    if "center_scale_burdens" in input_config["evaluation"]:
+        full_config["center_scale_burdens"] = input_config["evaluation"][
+            "center_scale_burdens"
+        ]

     if pretrained_setup:
         full_config.update(
diff --git a/docs/index.rst b/docs/index.rst
index c9d1700f..bc92c6c5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -8,8 +8,6 @@ Welcome to DeepRVAT's documentation!

 Rare variant association testing using deep learning and data-driven gene impairment scores.

-_Coming soon:_ Overview of the DeepRVAT methodaster
-
 How to use this documentation
 ===================================

@@ -39,6 +37,25 @@ If you use this package, please cite:

 Clarke, Holtkamp et al., “Integration of Variant Annotations Using Deep Set Networks Boosts Rare Variant Association Testing.” Nature Genetics. https://www.nature.com/articles/s41588-024-01919-z

+Release notes
+====================================
+
+v1.1.0
+------------------------------------
+
+Adjusts the calibration of DeepRVAT scores relative to previous versions. Each model in the ensemble has its scores linearly adjusted to lie between -1 and 1, with the "no variant" score set to 0. (Previously, scores lay between 0 and 1, with the "no variant" score close to, but not exactly, 0.5.)
+
+This can change the results of association testing and phenotype prediction, giving (in our testing) a slight boost in yield and replication of significant gene-trait associations.
+
+
+v1.0.0
+------------------------------------
+
+First release version.
+
+The results of the DeepRVAT publication can be reproduced using this version of the package.
+
+
 Contact
 ====================================
diff --git a/example/config/deeprvat_input_config.yaml b/example/config/deeprvat_input_config.yaml
index 397a7afe..36f92618 100644
--- a/example/config/deeprvat_input_config.yaml
+++ b/example/config/deeprvat_input_config.yaml
@@ -129,6 +129,7 @@ y_transformation: quantile_transform
 evaluation:
   correction_method: Bonferroni
   alpha: 0.05
+  center_scale_burdens: True

 # Subsetting samples for training or association testing
 #sample_files:
diff --git a/example/config/deeprvat_input_config_regenie.yaml b/example/config/deeprvat_input_config_regenie.yaml
index f74814fb..131c0765 100644
--- a/example/config/deeprvat_input_config_regenie.yaml
+++ b/example/config/deeprvat_input_config_regenie.yaml
@@ -129,6 +129,7 @@ y_transformation: quantile_transform
 evaluation:
   correction_method: Bonferroni
   alpha: 0.05
+  center_scale_burdens: True

 # Subsetting samples for training or association testing
 #sample_files:
diff --git a/example/config/deeprvat_input_pretrained_models_config.yaml b/example/config/deeprvat_input_pretrained_models_config.yaml
index 66d4c8c6..939f5df0 100644
--- a/example/config/deeprvat_input_pretrained_models_config.yaml
+++ b/example/config/deeprvat_input_pretrained_models_config.yaml
@@ -52,6 +52,7 @@ y_transformation: quantile_transform
 evaluation:
   correction_method: Bonferroni
   alpha: 0.05
+  center_scale_burdens: True

 # Subsetting samples for association testing
 #sample_files:
diff --git a/lsf/lsf.yaml b/lsf/lsf.yaml
deleted file mode 100644
index b230b39b..00000000
--- a/lsf/lsf.yaml
+++ /dev/null
@@ -1,131 +0,0 @@
-__default__:
-  - "-q medium"
-  - "-R \"select[(hname != 'odcf-cn11u15' && hname != 'odcf-cn11u17' && hname != 'odcf-cn33u24s03' && hname != 'odcf-cn23u25' && hname != 'odcf-cn11u13' && hname != 'odcf-cn31u13' && hname != 'odcf-cn31u21' && hname != 'odcf-cn23u23')]\""
-
-
-# For association testing pipelines
-
-config:
-  - "-q short"
-
-training_dataset:
-  - "-q long"
-
-delete_burden_cache:
-  - "-q short"
-
-choose_training_genes:
-  - "-q short"
-
-best_cv_run:
-  - "-q short"
-link_avg_burdens:
-  - "-q short"
-best_bagging_run:
-  - "-q short"
-
-train:
-  - "-q gpu"
-  - "-gpu num=1:j_exclusive=yes:gmem=10.7G"
-  - "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e230-dgxa100-4' && hname != 'e071-gpu06')]\""
-  # - "-R tensorcore"
-  # - "-L /bin/bash"
-
-compute_burdens:
-  - "-q gpu"
-  - "-gpu num=1:j_exclusive=yes:gmem=15.7G"
-  - "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e230-dgxa100-4' && hname != 'e071-gpu06')]\""
-  - "-W 180"
-  # - "-R tensorcore"
-  # - "-L /bin/bash"
-
-link_burdens:
-  - "-q medium"
-
-compute_plof_burdens:
-  - "-q medium"
-
-regress:
-  - "-q long"
-
-combine_regression_chunks:
-  - "-q short"
-
-regenie_step1_splitl0:
-  - "-q short"
-
-regenie_step1_runl0:
-  - "-q medium"
-
-regenie_step1_runl1:
-  - "-q medium"
-
-regenie_step1:
-  - "-q verylong"
-
-regenie_step2:
-  - "-q verylong"
-
-
-# For CV (phenotype prediction) pipeline
-
-deeprvat_config:
-  - "-q short"
-
-deeprvat_plof_config:
-  - "-q short"
-
-deeprvat_training_dataset:
-  - "-q long"
-
-deeprvat_delete_burden_cache:
-  - "-q short"
-
-deeprvat_best_cv_run:
-  - "-q short"
-
-deeprvat_train:
-  - "-q gpu"
-  - "-gpu num=1:j_exclusive=yes:gmem=10.7G"
-  - "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e230-dgxa100-4' && hname != 'e071-gpu06')]\""
-  # - "-R tensorcore"
-  # - "-L /bin/bash"
-
-
-deeprvat_compute_burdens:
-  - "-q gpu-lowprio"
-  - "-gpu num=1:j_exclusive=yes:gmem=10.7G"
-  - "-R \"select[(hname != 'e230-dgx2-1' && hname != 'e230-dgx2-2' && hname != 'e230-dgxa100-1' && hname != 'e230-dgxa100-2' && hname != 'e230-dgxa100-3' && hname != 'e230-dgxa100-4' && hname != 'e071-gpu06')]\""
-  - "-W 180"
-  # - "-R tensorcore"
-  # - "-L /bin/bash"
-
-prepare_genotypes_per_gene:
-  - "-q long"
-deeprvat_regress:
-  - "-q long"
-
-average_burdens:
-  - "-q long"
-
-regress_missense:
-  - "-q long"
-regress_plof:
-  - "-q long"
-
-seed_gene_regress_missense:
-  - "-q long"
-
-seed_gene_association_dataset:
-  - "-q long"
-association_dataset:
-  - "-q long"
-
-seed_gene_regress_plof:
-  - "-q medium"
-
-deeprvat_compute_plof_burdens:
-  - "-q medium"
-
-deeprvat_regress:
-  - "-q long"
diff --git a/pipelines/association_testing/burdens.snakefile b/pipelines/association_testing/burdens.snakefile
index f3bdd304..a274a382 100644
--- a/pipelines/association_testing/burdens.snakefile
+++ b/pipelines/association_testing/burdens.snakefile
@@ -77,16 +77,16 @@ rule compute_burdens:
         ' '.join([
             'deeprvat_associate compute-burdens '
             + debug +
-            ' --n-chunks ' + str(n_burden_chunks) + ' '
+            ' --n-chunks '+ str(n_burden_chunks) + ' '
             '--chunk {wildcards.chunk} '
             '--dataset-file {input.dataset} '
+            + center_scale_burdens +
             '{input.data_config} '
             '{input.model_config} '
             '{input.checkpoints} '
             '{params.prefix}/burdens'],
         )
-
 rule reverse_models:
     input:
         checkpoints = expand(model_path / 'repeat_{repeat}/best/bag_{bag}.ckpt',
diff --git a/pipelines/association_testing/regress_eval.snakefile b/pipelines/association_testing/regress_eval.snakefile
index 4c219d1e..995afbae 100644
--- a/pipelines/association_testing/regress_eval.snakefile
+++ b/pipelines/association_testing/regress_eval.snakefile
@@ -103,6 +103,7 @@ rule average_burdens:
     shell:
         ' && '.join([
             ('deeprvat_associate average-burdens '
+             + center_scale_burdens +
             '--n-chunks ' + str(n_avg_chunks) + ' '
             '--chunk {wildcards.chunk} '
             '{params.repeats} '
diff --git a/pipelines/association_testing/regress_eval_regenie.snakefile b/pipelines/association_testing/regress_eval_regenie.snakefile
index 03b80e3f..51f93e55 100644
--- a/pipelines/association_testing/regress_eval_regenie.snakefile
+++ b/pipelines/association_testing/regress_eval_regenie.snakefile
@@ -293,6 +293,7 @@ rule average_burdens:
     shell:
         ' && '.join([
             ('deeprvat_associate average-burdens '
+             + center_scale_burdens +
             '--n-chunks ' + str(n_avg_chunks) + ' '
             '--chunk {wildcards.chunk} '
             '{params.repeats} '
diff --git a/pipelines/association_testing_pretrained.snakefile b/pipelines/association_testing_pretrained.snakefile
index 2bd12b15..96bd08ac 100644
--- a/pipelines/association_testing_pretrained.snakefile
+++ b/pipelines/association_testing_pretrained.snakefile
@@ -13,6 +13,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/association_testing_pretrained_regenie.snakefile b/pipelines/association_testing_pretrained_regenie.snakefile
index c5c7076f..2ec5f923 100644
--- a/pipelines/association_testing_pretrained_regenie.snakefile
+++ b/pipelines/association_testing_pretrained_regenie.snakefile
@@ -13,6 +13,7 @@ training_phenotypes = []
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
 debug = '--debug ' if debug_flag else ''
diff --git a/pipelines/cv_training/cv_training_association_testing.snakefile b/pipelines/cv_training/cv_training_association_testing.snakefile
index 9f507b5b..1b7d275e 100644
--- a/pipelines/cv_training/cv_training_association_testing.snakefile
+++ b/pipelines/cv_training/cv_training_association_testing.snakefile
@@ -18,6 +18,7 @@ burden_phenotype = phenotypes[0]
 n_burden_chunks = config.get("n_burden_chunks", 1) if not debug_flag else 2
 n_regression_chunks = config.get("n_regression_chunks", 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config["hyperparameter_optimization"]["n_trials"]
 n_bags = config["training"]["n_bags"] if not debug_flag else 3
 n_repeats = config["n_repeats"]
diff --git a/pipelines/training_association_testing.snakefile b/pipelines/training_association_testing.snakefile
index c90e9fc4..cf6a20fa 100644
--- a/pipelines/training_association_testing.snakefile
+++ b/pipelines/training_association_testing.snakefile
@@ -17,6 +17,7 @@ training_phenotypes = list(training_phenotypes.keys()) if type(training_phenotyp
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
diff --git a/pipelines/training_association_testing_regenie.snakefile b/pipelines/training_association_testing_regenie.snakefile
index 3c5d4bdb..66c8b179 100644
--- a/pipelines/training_association_testing_regenie.snakefile
+++ b/pipelines/training_association_testing_regenie.snakefile
@@ -16,6 +16,7 @@ training_phenotypes = list(training_phenotypes.keys()) if type(training_phenotyp
 n_burden_chunks = config.get('n_burden_chunks', 1) if not debug_flag else 2
 n_regression_chunks = config.get('n_regression_chunks', 40) if not debug_flag else 2
 n_avg_chunks = config.get('n_avg_chunks', 1)
+center_scale_burdens = '--center-scale-burdens ' if config.get('center_scale_burdens', True) else ''
 n_trials = config['hyperparameter_optimization']['n_trials']
 n_bags = config['training']['n_bags'] if not debug_flag else 3
 n_repeats = config['n_repeats']
@@ -32,7 +33,7 @@ regenie_config_step2 = config["regenie_options"]["step_2"]
 regenie_step1_bsize = regenie_config_step1["bsize"]
 regenie_step2_bsize = regenie_config_step2["bsize"]

-cv_exp = False
+cv_exp = config.get('cv_exp', False)

 wildcard_constraints:
     repeat="\d+",
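
Note on the new centering/scaling (an illustrative sketch, not part of the patch): per repeat, `average_burdens` shifts each model's scores by the persisted zero-effect value (the "mode" written to `computed_burdens_stats.csv` by `compute-burdens --center-scale-burdens`) and divides by `max - mode`, where `max` is the maximum over the per-chunk `chunk*_max.csv` files. The toy arrays below stand in for the zarr-backed burdens and the persisted statistics:

```python
# Sketch only: toy stand-ins for the inputs that average_burdens reads from disk.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_samples, n_genes, n_repeats = 4, 3, 2

# Strictly non-zero burdens (samples x genes x repeats), as asserted upstream.
burdens = rng.uniform(0.1, 1.0, size=(n_samples, n_genes, n_repeats))

# Stand-ins for computed_burdens_stats.csv (zero-effect score per repeat)
# and the max taken across all chunk*_max.csv files (per-repeat maximum).
center_scale_df = pd.DataFrame({"mode": [0.45, 0.55]})
max_dfs = pd.DataFrame({"max": burdens.max(axis=(0, 1))})

for r in range(n_repeats):
    zero_effect_val = center_scale_df.loc[r, "mode"]
    adjusted_max = max_dfs.loc[r, "max"] - zero_effect_val
    # Shift the zero-effect score to 0, then scale so the per-repeat max maps
    # to 1; scores below the mode land in (-1, 0), as described in the release notes.
    burdens[:, :, r] = (burdens[:, :, r] - zero_effect_val) / adjusted_max

# Aggregate across repeats (AGG_FCT[agg_fct], e.g. the mean).
averaged = burdens.mean(axis=2)
print(averaged.shape)  # (4, 3)
```

This also motivates moving the blanket `assert np.all(burdens != 0)` out of `regress_on_gene_scoretest`: after centering, legitimate scores can be negative or near zero, so the non-zero check now runs in `average_burdens` on the raw per-batch burdens before the transform is applied.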