Add FHIR tutorial and simplify code (#626)
* Removed unused dependencies

Signed-off-by: zethson <[email protected]>

* Tutorials update

Signed-off-by: zethson <[email protected]>

* Readd fhiry

Signed-off-by: zethson <[email protected]>

* Skip jupyter_core

Signed-off-by: zethson <[email protected]>

* Fix import

Signed-off-by: zethson <[email protected]>

* Don't fail session-info

Signed-off-by: zethson <[email protected]>

* Revert session info change

Signed-off-by: zethson <[email protected]>

---------

Signed-off-by: zethson <[email protected]>
Zethson authored Dec 18, 2023
1 parent 94f257c commit 9c2de8c
Showing 15 changed files with 40 additions and 36 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/run_notebooks.yml
@@ -13,6 +13,8 @@ jobs:
"docs/tutorials/notebooks/ehrapy_introduction.ipynb",
"docs/tutorials/notebooks/mimic_2_introduction.ipynb",
"docs/tutorials/notebooks/mimic_2_survival_analysis.ipynb",
"docs/tutorials/notebooks/mimic_2_fate.ipynb",
"docs/tutorials/notebooks/mimic_2_causal_inference.ipynb",
"docs/tutorials/notebooks/mimic_3_demo.ipynb",
# "docs/tutorials/notebooks/medcat.ipynb",
]
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
@@ -10,12 +10,6 @@ repos:
rev: v3.1.0
hooks:
- id: prettier
# Newer versions of node don't work on systems that have an older version of GLIBC
# (in particular Ubuntu 18.04 and Centos 7)
# EOL of Centos 7 is in 2024-06, we can probably get rid of this then.
# See https://github.com/scverse/cookiecutter-scverse/issues/143 and
# https://github.com/jupyterlab/jupyterlab/issues/12675
language_version: "17.9.1"
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.6
hooks:
Binary file added docs/_static/tutorials/fhir.jpg
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -11,7 +11,7 @@

# General information about the project.
project = "ehrapy"
copyright = "2021, Lukas Heumos, Theislab"
copyright = "2021-2024, Lukas Heumos, Theislab"
author = "Lukas Heumos"
github_repo = "ehrapy"

@@ -181,4 +181,5 @@
"tutorials/notebooks/medcat": "_static/tutorials/nlp.png",
"tutorials/notebooks/ml_usecases": "_static/tutorials/machine_learning.png",
"tutorials/notebooks/ontology_mapping": "_static/tutorials/ontology.png",
"tutorials/notebooks/fhir": "_static/tutorials/fhir.png",
}
1 change: 1 addition & 0 deletions docs/tutorials/index.md
@@ -25,6 +25,7 @@ For questions about the usage of ehrapy use [Github Discussions].
notebooks/medcat
notebooks/ml_usecases
notebooks/ontology_mapping
notebooks/fhir
```

2 changes: 1 addition & 1 deletion docs/tutorials/notebooks
Submodule notebooks updated 4 files
+4 −0 .gitignore
+5 −6 .pre-commit-config.yaml
+1,219 −0 fhir.ipynb
+395 −359 medcat.ipynb
File renamed without changes.
4 changes: 1 addition & 3 deletions ehrapy/core/meta_information.py
@@ -1,10 +1,7 @@
from __future__ import annotations

import sys
from contextlib import closing
from datetime import datetime
from io import StringIO
from typing import IO, Optional

import session_info
from rich import print
@@ -27,6 +24,7 @@ def print_versions():  # pragma: no cover
"builtins",
"stdlib_list",
"importlib_metadata",
"jupyter_core"
# Special module present if test coverage being calculated
# https://gitlab.com/joelostblom/session_info/-/issues/10
"$coverage",
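The commit messages above ("Skip jupyter_core", "Don't fail session-info") suggest that session_info choked on jupyter_core in notebook environments, so the module is now excluded. A minimal sketch of the effect, assuming the list is forwarded to session_info.show(excludes=...) as the surrounding code suggests and that the function is re-exported at the package level:

```python
import ehrapy as ep

# Print the versions of ehrapy and its dependencies. With "jupyter_core" in the
# exclusion list, session_info no longer inspects it, so this call should no
# longer fail inside a Jupyter notebook.
# (Assumes print_versions is re-exported as ep.print_versions.)
ep.print_versions()
```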
15 changes: 14 additions & 1 deletion ehrapy/io/_read.py
@@ -362,11 +362,18 @@ def read_fhir(
Uses https://github.com/dermatologist/fhiry to read the FHIR file into a Pandas DataFrame
which is subsequently transformed into an AnnData object.
Be aware that FHIR data can be nested and return lists or dictionaries as values.
In such cases, one can do one of the following:
1. Transform the data into an awkward array and flatten it when needed.
2. Extract values from all lists and dictionaries to store single values in the fields.
3. Remove all lists and dictionaries. Only do this if the information is not relevant to you.
Args:
dataset_path: Path to one or multiple FHIR files.
format: The file format of the FHIR data. One of 'json' or 'ndjson'. Defaults to 'json'.
columns_obs_only: These columns will be added to obs only and not X.
columns_x_only: These columns will be added to X only and all remaining columns to obs. Note that datetime columns will always be added to .obs though.
columns_x_only: These columns will be added to X only and all remaining columns to obs.
Note that datetime columns will always be added to .obs though.
return_df: Whether to return one or several Pandas DataFrames.
cache: Whether to write to cache when reading or not. Defaults to False.
download_dataset_name: Name of the file or directory in case the dataset is downloaded
@@ -379,6 +386,12 @@
Examples:
>>> import ehrapy as ep
>>> adata = ep.io.read_fhir("/path/to/fhir/resources")
Be aware that most FHIR datasets have nested data that might need to be removed.
In such cases, consider working with DataFrames.
>>> df = ep.io.read_fhir("/path/to/fhir/resources", return_df=True)
>>> df.drop(columns=[col for col in df.columns if any(isinstance(x, (list, dict)) for x in df[col].dropna())], inplace=True)
>>> df.drop(columns=df.columns[df.isna().all()], inplace=True)
"""
_check_columns_only_params(columns_obs_only, columns_x_only)
file_path: Path = Path(dataset_path)
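The docstring above lists three ways of handling nested FHIR values; the sketch below spells out the third option (dropping nested columns) end to end, followed by a conversion to AnnData. The resource path is hypothetical, and the final step assumes ehrapy's df_to_anndata helper is exposed as ep.ad.df_to_anndata in this version:

```python
import ehrapy as ep

# Read the FHIR resources into a plain DataFrame first (hypothetical path).
df = ep.io.read_fhir("/path/to/fhir/resources", format="json", return_df=True)

# Docstring option 3: drop every column that still holds lists or dicts,
# then drop columns that are entirely missing.
nested_cols = [
    col
    for col in df.columns
    if df[col].dropna().map(lambda v: isinstance(v, (list, dict))).any()
]
df = df.drop(columns=nested_cols)
df = df.drop(columns=df.columns[df.isna().all()])

# Convert the flattened DataFrame into an AnnData object for downstream use
# (assumes the df_to_anndata helper is available as ep.ad.df_to_anndata).
adata = ep.ad.df_to_anndata(df)
```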
4 changes: 1 addition & 3 deletions ehrapy/plot/_scanpy_pl_api.py
@@ -8,9 +8,8 @@

import scanpy as sc
from scanpy.plotting import DotPlot, MatrixPlot, StackedViolin
from scanpy.plotting._tools.scatterplots import _wraps_plot_scatter

from ehrapy.util._doc_util import (
from ehrapy._doc_util import (
_doc_params,
doc_adata_color_etc,
doc_common_groupby_plot_args,
@@ -19,7 +18,6 @@
doc_panels,
doc_scatter_basic,
doc_scatter_embedding,
doc_scatter_spatial,
doc_show_save_ax,
doc_vbound_percentile,
doc_vboundnorm,
26 changes: 13 additions & 13 deletions ehrapy/preprocessing/_imputation.py
@@ -24,7 +24,7 @@ def explicit_impute(
adata: AnnData,
replacement: (str | int) | (dict[str, str | int]),
impute_empty_strings: bool = True,
warning_threshold: int = 30,
warning_threshold: int = 70,
copy: bool = False,
) -> AnnData:
"""Replaces all missing values in all columns or a subset of columns specified by the user with the passed replacement value.
@@ -124,7 +124,7 @@ def simple_impute(
var_names: Iterable[str] | None = None,
strategy: Literal["mean", "median", "most_frequent"] = "mean",
copy: bool = False,
warning_threshold: int = 30,
warning_threshold: int = 70,
) -> AnnData:
"""Impute missing values in numerical data using mean/median/most frequent imputation.
@@ -198,7 +198,7 @@ def knn_impute(
var_names: Iterable[str] | None = None,
n_neighbours: int = 5,
copy: bool = False,
warning_threshold: int = 30,
warning_threshold: int = 70,
) -> AnnData:
"""Imputes missing values in the input AnnData object using K-nearest neighbor imputation.
@@ -265,7 +265,7 @@ def knn_impute(
adata.X[::, column_indices] = enc.inverse_transform(adata.X[::, column_indices])
except ValueError as e:
if "Data matrix has wrong shape" in str(e):
print("[bold red]Check that your matrix does not contain any NaN values!")
print("[bold red]Check that your matrix does not contain any NaN only columns!")
raise

if _check_module_importable("sklearnex"): # pragma: no cover
@@ -306,7 +306,7 @@ def miss_forest_impute(
max_iter: int = 10,
n_estimators=100,
random_state: int = 0,
warning_threshold: int = 30,
warning_threshold: int = 70,
copy: bool = False,
) -> AnnData:
"""Impute data using the MissForest strategy.
@@ -421,7 +421,7 @@ def miss_forest_impute(
adata.X[::, non_num_indices] = enc.inverse_transform(adata.X[::, non_num_indices])
except ValueError as e:
if "Data matrix has wrong shape" in str(e):
print("[bold red]Check that your matrix does not contain any NaN values!")
print("[bold red]Check that your matrix does not contain any NaN only columns!")
raise

if _check_module_importable("sklearnex"): # pragma: no cover
@@ -442,7 +442,7 @@ def soft_impute(
adata: AnnData,
var_names: Iterable[str] | None = None,
copy: bool = False,
warning_threshold: int = 30,
warning_threshold: int = 70,
shrinkage_value: float | None = None,
convergence_threshold: float = 0.001,
max_iters: int = 100,
@@ -587,7 +587,7 @@ def iterative_svd_impute(
adata: AnnData,
var_names: Iterable[str] | None = None,
copy: bool = False,
warning_threshold: int = 30,
warning_threshold: int = 70,
rank: int = 10,
convergence_threshold: float = 0.00001,
max_iters: int = 200,
@@ -735,7 +735,7 @@ def _iterative_svd_impute(
def matrix_factorization_impute(
adata: AnnData,
var_names: Iterable[str] | None = None,
warning_threshold: int = 30,
warning_threshold: int = 70,
rank: int = 40,
learning_rate: float = 0.01,
max_iters: int = 50,
@@ -872,7 +872,7 @@ def _matrix_factorization_impute(
def nuclear_norm_minimization_impute(
adata: AnnData,
var_names: Iterable[str] | None = None,
warning_threshold: int = 30,
warning_threshold: int = 70,
require_symmetric_solution: bool = False,
min_value: float | None = None,
max_value: float | None = None,
@@ -992,7 +992,7 @@ def _nuclear_norm_minimization_impute(
def mice_forest_impute(
adata: AnnData,
var_names: Iterable[str] | None = None,
warning_threshold: int = 30,
warning_threshold: int = 70,
save_all_iterations: bool = True,
random_state: int | None = None,
inplace: bool = False,
@@ -1075,7 +1075,7 @@ def mice_forest_impute(
adata.X[::, column_indices] = enc.inverse_transform(adata.X[::, column_indices])
except ValueError as e:
if "Data matrix has wrong shape" in str(e):
print("[bold red]Check that your matrix does not contain any NaN values!")
print("[bold red]Check that your matrix does not contain any NaN only columns!")
raise

if var_names:
@@ -1119,7 +1119,7 @@ def _miceforest_impute(
adata.X = kernel.complete_data(dataset=0, inplace=inplace)


def _warn_imputation_threshold(adata: AnnData, var_names: Iterable[str] | None, threshold: int = 30) -> dict[str, int]:
def _warn_imputation_threshold(adata: AnnData, var_names: Iterable[str] | None, threshold: int = 75) -> dict[str, int]:
"""Warns the user if the more than $threshold percent had to be imputed.
Args:
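For context on the changed default: warning_threshold controls when the imputation functions warn about heavily missing columns, and the diff suggests the default moves from 30 to 70, so a warning is only emitted once more than 70% of a column's values are missing. A minimal usage sketch, assuming the bundled ep.dt.mimic_2 loader:

```python
import ehrapy as ep

# Load and encode an example dataset (assumes the mimic_2 loader is available).
adata = ep.dt.mimic_2(encoded=True)

# KNN imputation with the new default: only columns with more than 70% missing
# values trigger a warning. Pass a lower threshold for stricter reporting,
# e.g. warning_threshold=30 to restore the old behaviour.
ep.pp.knn_impute(adata, n_neighbours=5, warning_threshold=70)
```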
4 changes: 1 addition & 3 deletions ehrapy/preprocessing/_quality_control.py
@@ -205,10 +205,8 @@ def _var_qc_metrics(adata: AnnData, layer: str = None) -> pd.DataFrame:
var_metrics.loc[non_categorical_indices, "max"] = np.nanmax(
np.array(mtx[:, non_categorical_indices], dtype=np.float64), axis=0
)
except TypeError:
except (TypeError, ValueError):
print("[bold yellow]TypeError! Setting quality control metrics to nan. Did you encode your data?")
except ValueError:
print("[bold yellow]ValueError! Setting quality control metrics to nan. Did you encode your data?")

return var_metrics

3 changes: 1 addition & 2 deletions ehrapy/tools/_scanpy_tl_api.py
@@ -1,5 +1,4 @@
from collections.abc import Iterable, Mapping, Sequence
from types import MappingProxyType
from collections.abc import Iterable, Sequence
from typing import Any, Literal, Optional, Union

import numpy as np
Empty file removed ehrapy/util/__init__.py
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -33,12 +33,12 @@ classifiers = [
"Intended Audience :: Science/Research",
"Natural Language :: English",
"Operating System :: MacOS :: MacOS X",
"Operating System :: Microsoft :: Windows",
"Operating System :: POSIX :: Linux",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Topic :: Scientific/Engineering :: Visualization",
]
@@ -60,8 +60,8 @@ dependencies = [
"lifelines",
"missingno",
"thefuzz[speedup]",
"fhiry",
"dowhy"
"dowhy",
"fhiry"
]

[project.optional-dependencies]
