diff --git a/docs/src/methods_list.rst b/docs/src/methods_list.rst
deleted file mode 100644
index 18485f40a..000000000
--- a/docs/src/methods_list.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-.. _methods_list:
-
-
-===============
-List of methods
-===============
\ No newline at end of file
diff --git a/docs/src/model_agnostic_methods.rst b/docs/src/model_agnostic_methods.rst
new file mode 100644
index 000000000..b4383ce52
--- /dev/null
+++ b/docs/src/model_agnostic_methods.rst
@@ -0,0 +1,11 @@
+.. _model_agnostic_methods:
+
+
+======================
+Model-agnostic methods
+======================
+
+.. toctree::
+   :maxdepth: 2
+
+   model_agnostic_methods/pdp_importance.rst
\ No newline at end of file
diff --git a/docs/src/model_agnostic_methods/pdp_importance.rst b/docs/src/model_agnostic_methods/pdp_importance.rst
new file mode 100644
index 000000000..db1f7a1bc
--- /dev/null
+++ b/docs/src/model_agnostic_methods/pdp_importance.rst
@@ -0,0 +1,51 @@
+
+.. _pdp_importance:
+
+============================
+PDP-based feature importance
+============================
+
+A Partial Dependence Plot (PDP) :footcite:`friedman2001greedy` is a tool used to visualize
+the dependence between the target variable and a feature (or set of features) of interest.
+To see how PDPs are used to understand the relationship between features and the target,
+see :ref:`partial_dependence_plots`. Here we show how a marginal measure of feature
+importance can be derived from PDPs.
+
+A PDP is a plot of the partial dependence function, which is defined as
+
+.. math::
+    f_S(x_S) &= \mathbb{E}_{X_{-S}}\left[ f(x_S, X_{-S}) \right]\\
+    &= \int f(x_S, x_{-S}) d\mathbb{P}(X_{-S}),
+
+and estimated with
+
+.. math::
+    \bar{f}_S(x_S) \approx \frac{1}{n} \sum_{i=1}^n f(x_S, x_{-S,i}).
+
+The plot shows how much the predicted target changes when one (or several) features vary.
+Intuitively, the flatter the PDP, the less important the feature should be, as it appears
+to have little impact on the target. Conversely, the more a PDP varies, the more signal
+about the target the feature is likely to carry.
+Greenwell, Boehmke, and McCarthy :footcite:`greenwell2018simple` propose the following measure
+of feature importance for regression:
+
+.. math::
+    \Psi^{PDP}_S = \sqrt{ \frac{1}{K-1} \sum_{k=1}^K \left( \bar{f}_S(x_S^k) - \frac{1}{K} \sum_{j=1}^K \bar{f}_S(x_S^j) \right)^2 }.
+
+It is the sample standard deviation of the PDP curve evaluated at the :math:`K` unique
+grid values of the feature.
+
+For classification they suggest:
+
+.. math::
+    \Psi^{PDP}_S = \frac{ \max_k(\bar{f}_S(x_S^k)) - \min_k(\bar{f}_S(x_S^k)) }{4}.
+
+This estimates the standard deviation from the range of values: for a normal distribution,
+roughly 95% of the data lies within two standard deviations of the mean, so the range
+divided by four is a crude estimate of the standard deviation.
+
+
+References
+----------
+.. footbibliography::
+
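Both estimators above reduce to a few lines of NumPy. A minimal standalone sketch, not part of the diff; ``pdp_curve`` is an assumed array holding the averaged predictions over the grid::

    import numpy as np

    def pdp_importance(pdp_curve, categorical=False):
        # Greenwell et al. (2018) importance from an averaged PDP curve
        pdp_curve = np.asarray(pdp_curve, dtype=float)
        if categorical:
            # range rule: ~95% of a normal distribution lies within +/- 2 sd,
            # so the range spans roughly four standard deviations
            return (pdp_curve.max() - pdp_curve.min()) / 4
        # sample standard deviation of the curve around its mean
        return np.sqrt(((pdp_curve - pdp_curve.mean()) ** 2).sum() / (pdp_curve.size - 1))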
diff --git a/docs/src/visualization.rst b/docs/src/visualization.rst
index 6562ae28d..2874417b3 100644
--- a/docs/src/visualization.rst
+++ b/docs/src/visualization.rst
@@ -3,4 +3,9 @@
 =======================
 Tools for visualization
-=======================
\ No newline at end of file
+=======================
+
+.. toctree::
+   :maxdepth: 2
+
+   visualization/partial_dependence
\ No newline at end of file
diff --git a/docs/src/visualization/partial_dependence.rst b/docs/src/visualization/partial_dependence.rst
new file mode 100644
index 000000000..646074478
--- /dev/null
+++ b/docs/src/visualization/partial_dependence.rst
@@ -0,0 +1,53 @@
+
+.. _partial_dependence_plots:
+
+========================
+Partial Dependence plots
+========================
+
+Definition
+==========
+
+A Partial Dependence Plot (PDP) :footcite:`friedman2001greedy` is a tool used to visualize
+the dependence between the target variable and a feature (or set of features) of interest.
+For important features, it can be used to understand their relationship with the target,
+e.g. whether it is linear, monotonic or more complex.
+
+.. note::
+    PDPs are limited to a small subset of features (typically one or two) as it becomes
+    tricky to display more than three or four variables simultaneously.
+
+The partial dependence function that is plotted is defined as
+
+.. math::
+    f_S(x_S) &= \mathbb{E}_{X_{-S}}\left[ f(x_S, X_{-S}) \right]\\
+    &= \int f(x_S, x_{-S}) d\mathbb{P}(X_{-S}),
+
+where :math:`X_S` is the set of input features of interest, :math:`X_{-S}` is its complement
+and :math:`f(x_S, x_{-S})` is the learned decision function of the model of interest, evaluated
+on a sample :math:`x` whose values for the features in :math:`S` are :math:`x_S` and for the
+features in :math:`-S` are :math:`x_{-S}`. The expectation is taken marginally over the values
+of :math:`X_{-S}`.
+
+The partial dependence function :math:`f_S` is estimated by Monte-Carlo with
+:math:`\bar{f}_S` defined as
+
+.. math::
+    \bar{f}_S(x_S) \approx \frac{1}{n} \sum_{i=1}^n f(x_S, x_{-S,i}),
+
+where :math:`\{x_{-S,i}\}_{i=1}^n` are the values taken on the training set by the features
+in :math:`-S`.
+This approximation is equivalent to averaging all the Individual Conditional Expectation (ICE)
+curves. An ICE curve is the per-instance version of a PDP: it shows how the prediction for one
+sample of the dataset evolves when the features of interest change.
+Most plots will overlay all ICE curves on the same figure with the PDP highlighted.
+
+It is possible to derive a measure of feature importance from PDPs, which is explained in this
+section of the user guide: :ref:`pdp_importance`.
+
+Example(s)
+==========
+
+
+References
+----------
+.. footbibliography::
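The Monte-Carlo estimator above maps almost line for line onto code. A minimal standalone sketch, not part of the diff; it assumes a fitted ``model`` exposing ``predict`` and a NumPy design matrix ``X``::

    import numpy as np

    def partial_dependence_curve(model, X, feature, grid):
        # row k of `ice` holds the predictions with x_S fixed to grid[k] for
        # every sample, so each column of `ice` is an ICE curve
        ice = np.empty((len(grid), X.shape[0]))
        for k, value in enumerate(grid):
            X_mod = X.copy()
            X_mod[:, feature] = value  # impose the grid value on every sample
            ice[k] = model.predict(X_mod)
        return ice.mean(axis=1), ice  # the PDP curve and the ICE curves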
diff --git a/docs/tools/references.bib b/docs/tools/references.bib
index fe04dc12f..eadea946e 100644
--- a/docs/tools/references.bib
+++ b/docs/tools/references.bib
@@ -155,6 +155,15 @@ @article{fan2012variance
 year = {2012}
 }
 
+@article{friedman2001greedy,
+title = {Greedy function approximation: a gradient boosting machine},
+author = {Friedman, Jerome H},
+journal = {Annals of Statistics},
+volume = {29},
+number = {5},
+pages = {1189--1232},
+year = {2001},
+publisher = {JSTOR}
+}
+
 @article{gaonkar_deriving_2012,
 author = {Gaonkar, Bilwaj and Davatzikos, Christos},
 journal = {International Conference on Medical Image Computing and Computer-Assisted Intervention},
@@ -169,6 +178,24 @@ @article{gaonkar_deriving_2012
 year = {2012}
 }
 
+@article{greenwell2018simple,
+title = {A simple and effective model-based variable importance measure},
+author = {Greenwell, Brandon M and Boehmke, Bradley C and McCarthy, Andrew J},
+journal = {arXiv preprint arXiv:1805.04755},
+year = {2018}
+}
+
+@article{goldstein2015peeking,
+title = {Peeking inside the black box: Visualizing statistical learning with plots of individual conditional expectation},
+author = {Goldstein, Alex and Kapelner, Adam and Bleich, Justin and Pitkin, Emil},
+journal = {Journal of Computational and Graphical Statistics},
+volume = {24},
+number = {1},
+pages = {44--65},
+year = {2015},
+publisher = {Taylor \& Francis}
+}
+
 @article{hirschhorn2005genome,
 author = {Hirschhorn, Joel N and Daly, Mark J},
 journal = {Nature reviews genetics},
@@ -202,6 +229,17 @@ @article{liu2022fast
 year = {2022}
 }
 
+@book{molnar2025,
+title = {Interpretable Machine Learning},
+subtitle = {A Guide for Making Black Box Models Explainable},
+author = {Christoph Molnar},
+year = {2025},
+edition = {3},
+isbn = {978-3-911578-03-5},
+url = {https://christophm.github.io/interpretable-ml-book}
+}
+
 @article{meinshausen2009pvalues,
 author = {Nicolai Meinshausen, Lukas Meier and Peter Buhlmann},
 doi = {10.1198/jasa.2009.tm08647},
diff --git a/examples/plot_partial_dependence.py b/examples/plot_partial_dependence.py
new file mode 100644
index 000000000..aa396acad
--- /dev/null
+++ b/examples/plot_partial_dependence.py
@@ -0,0 +1,471 @@
+"""
+Partial Dependence and Individual Conditional Expectation Plots
+===============================================================
+This example is a modified version of the example proposed in scikit-learn:
+https://scikit-learn.org/1.7/auto_examples/inspection/plot_partial_dependence.html
+Partial dependence plots show the dependence between the target function [1]_
+and a set of features of interest, marginalizing over the values of all other
+features (the complement features). We restrict the importance computation to a
+single feature of interest, because extending this measure to several features
+jointly is not trivial.
+Similarly, an individual conditional expectation (ICE) plot :footcite:t:`goldstein2015peeking`
+shows the dependence between the target function and a feature of interest.
+However, unlike partial dependence plots, which show the average effect of the
+features of interest, ICE plots visualize the dependence of the prediction on a
+feature for each sample separately, with one line per sample.
+Only one feature of interest is supported for ICE plots.
+This example shows how to obtain partial dependence and ICE plots from a +:class:`~sklearn.neural_network.MLPRegressor` and a +:class:`~sklearn.ensemble.HistGradientBoostingRegressor` trained on the +bike sharing dataset. The example is inspired by :footcite:t:`molnar2025`. + +Notes +----- +.. [1] For classification you can think of it as the regression score before + the link function. + +References +---------- +.. footbibliography:: + +""" + +# %% +# Bike sharing dataset preprocessing +# ---------------------------------- +# +# We will use the bike sharing dataset. The goal is to predict the number of bike +# rentals using weather and season data as well as the datetime information. +from sklearn.datasets import fetch_openml + +bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True) +# Make an explicit copy to avoid "SettingWithCopyWarning" from pandas +X, y = bikes.data.copy(), bikes.target + +# We use only a subset of the data to speed up the example. +X = X.iloc[::5, :] +y = y[::5] + +# %% +# The feature `"weather"` has a particularity: the category `"heavy_rain"` is a rare +# category. +X["weather"].value_counts() + +# %% +# Because of this rare category, we collapse it into `"rain"`. +X["weather"] = ( + X["weather"] + .astype(object) + .replace(to_replace="heavy_rain", value="rain") + .astype("category") +) + +# %% +# We now have a closer look at the `"year"` feature: +X["year"].value_counts() + +# %% +# We see that we have data from two years. We use the first year to train the +# model and the second year to test the model. +mask_training = X["year"] == 0.0 +X = X.drop(columns=["year"]) +X_train, y_train = X[mask_training], y[mask_training] +X_test, y_test = X[~mask_training], y[~mask_training] + +# %% +# We can check the dataset information to see that we have heterogeneous data types. We +# have to preprocess the different columns accordingly. +X_train.info() + +# %% +# From the previous information, we will consider the `category` columns as nominal +# categorical features. In addition, we will consider the date and time information as +# categorical features as well. +# +# We manually define the columns containing numerical and categorical +# features. +numerical_features = [ + "temp", + "feel_temp", + "humidity", + "windspeed", +] +categorical_features = X_train.columns.drop(numerical_features) + +# %% +# Before we go into the details regarding the different machine +# learning pipelines, we will try to get some additional intuition regarding the dataset +# that will be helpful to understand the model's statistical performance and results of +# the partial dependence analysis. +# +# We plot the average number of bike rentals by grouping the data by season and +# by year. 
+from itertools import product + +import matplotlib.pyplot as plt +import numpy as np + +days = ("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat") +hours = tuple(range(24)) +xticklabels = [f"{day}\n{hour}:00" for day, hour in product(days, hours)] +xtick_start, xtick_period = 6, 12 + +fig, axs = plt.subplots(nrows=2, figsize=(8, 6), sharey=True, sharex=True) +average_bike_rentals = bikes.frame.groupby( + ["year", "season", "weekday", "hour"], observed=True +).mean(numeric_only=True)["count"] +for ax, (idx, df) in zip(axs, average_bike_rentals.groupby("year")): + df.groupby("season", observed=True).plot(ax=ax, legend=True) + + # decorate the plot + ax.set_xticks( + np.linspace( + start=xtick_start, + stop=len(xticklabels), + num=len(xticklabels) // xtick_period, + ) + ) + ax.set_xticklabels(xticklabels[xtick_start::xtick_period]) + ax.set_xlabel("") + ax.set_ylabel("Average number of bike rentals") + ax.set_title( + f"Bike rental for {'2010 (train set)' if idx == 0.0 else '2011 (test set)'}" + ) + ax.set_ylim(0, 1_000) + ax.set_xlim(0, len(xticklabels)) + ax.legend(loc=2) + +# %% +# The first striking difference between the train and test set is that the number of +# bike rentals is higher in the test set. For this reason, it will not be surprising to +# get a machine learning model that underestimates the number of bike rentals. We +# also observe that the number of bike rentals is lower during the spring season. In +# addition, we see that during working days, there is a specific pattern around 6-7 +# am and 5-6 pm with some peaks of bike rentals. We can keep in mind these different +# insights and use them to understand the partial dependence plot. +# +# Preprocessor for machine-learning models +# ---------------------------------------- +# +# Since we later use two different models, a +# :class:`~sklearn.neural_network.MLPRegressor` and a +# :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, we create two different +# preprocessors, specific for each model. +# +# Preprocessor for the neural network model +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We will use a :class:`~sklearn.preprocessing.QuantileTransformer` to scale the +# numerical features and encode the categorical features with a +# :class:`~sklearn.preprocessing.OneHotEncoder`. +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder, QuantileTransformer + +mlp_preprocessor = ColumnTransformer( + transformers=[ + ("num", QuantileTransformer(n_quantiles=100), numerical_features), + ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features), + ] +) +mlp_preprocessor + +# %% +# Preprocessor for the gradient boosting model +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# For the gradient boosting model, we leave the numerical features as-is and only +# encode the categorical features using a +# :class:`~sklearn.preprocessing.OrdinalEncoder`. +from sklearn.preprocessing import OrdinalEncoder + +hgbdt_preprocessor = ColumnTransformer( + transformers=[ + ("cat", OrdinalEncoder(), categorical_features), + ("num", "passthrough", numerical_features), + ], + sparse_threshold=1, + verbose_feature_names_out=False, +).set_output(transform="pandas") +hgbdt_preprocessor + +# %% +# 1-way partial dependence with different models +# ---------------------------------------------- +# +# In this section, we will compute 1-way partial dependence with two different +# machine-learning models: (i) a multi-layer perceptron and (ii) a +# gradient-boosting model. 
With these two models, we illustrate how to compute and +# interpret partial dependence plots (PDP) for both numerical and categorical +# features and individual conditional expectation (ICE). +# +# Multi-layer perceptron +# ~~~~~~~~~~~~~~~~~~~~~~ +# +# Let's fit a :class:`~sklearn.neural_network.MLPRegressor` and compute +# single-variable partial dependence plots. +from time import time + +from sklearn.neural_network import MLPRegressor +from sklearn.pipeline import make_pipeline + +print("Training MLPRegressor...") +tic = time() +mlp_model = make_pipeline( + mlp_preprocessor, + MLPRegressor( + hidden_layer_sizes=(30, 15), + learning_rate_init=0.01, + early_stopping=True, + random_state=0, + ), +) +mlp_model.fit(X_train, y_train) +print(f"done in {time() - tic:.3f}s") +print(f"Test R2 score: {mlp_model.score(X_test, y_test):.2f}") + +# %% +# We configured a pipeline using the preprocessor that we created specifically for the +# neural network and tuned the neural network size and learning rate to get a reasonable +# compromise between training time and predictive performance on a test set. +# +# Importantly, this tabular dataset has very different dynamic ranges for its +# features. Neural networks tend to be very sensitive to features with varying +# scales and forgetting to preprocess the numeric feature would lead to a very +# poor model. +# +# It would be possible to get even higher predictive performance with a larger +# neural network but the training would also be significantly more expensive. +# +# Note that it is important to check that the model is accurate enough on a +# test set before plotting the partial dependence since there would be little +# use in explaining the impact of a given feature on the prediction function of +# a model with poor predictive performance. In this regard, our MLP model works +# reasonably well. +# +# We will plot the averaged partial dependence. +import matplotlib.pyplot as plt + +from hidimstat.marginal.partial_dependence_plot import PartialDependencePlot + +common_params = { + "n_jobs": 2, + "grid_resolution": 20, +} + +print("Computing partial dependence plots...") +features_info = { + # features of interest + "features": ["temp", "humidity", "windspeed", "season", "weather", "hour"], + # information regarding categorical features + "categorical_features": categorical_features, +} +tic = time() +pdp = PartialDependencePlot( + mlp_model, + **features_info, + **common_params, +) +pdp.fit_importance(X=X_train) +# plot the figure +fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(9, 8), constrained_layout=True) +for i, ax in enumerate(axs.ravel()): + _ = pdp.plot(feature_id=i, ax=ax, percentage_ice=0.1) + +print(f"done in {time() - tic:.3f}s") +_ = fig.suptitle( + ( + "Partial dependence of the number of bike rentals\n" + "for the bike rental dataset with an MLPRegressor" + ), + fontsize=16, +) + +# %% +# Gradient boosting +# ~~~~~~~~~~~~~~~~~ +# +# Let's now fit a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` and +# compute the partial dependence on the same features. We also use the +# specific preprocessor we created for this model. 
+from sklearn.ensemble import HistGradientBoostingRegressor
+
+print("Training HistGradientBoostingRegressor...")
+tic = time()
+hgbdt_model = make_pipeline(
+    hgbdt_preprocessor,
+    HistGradientBoostingRegressor(
+        categorical_features=categorical_features,
+        random_state=0,
+        max_iter=50,
+    ),
+)
+hgbdt_model.fit(X_train, y_train)
+print(f"done in {time() - tic:.3f}s")
+print(f"Test R2 score: {hgbdt_model.score(X_test, y_test):.2f}")
+
+# %%
+# Here, we used the default hyperparameters for the gradient boosting model
+# without any preprocessing as tree-based models are naturally robust to
+# monotonic transformations of numerical features.
+#
+# Note that on this tabular dataset, Gradient Boosting Machines are both
+# significantly faster to train and more accurate than neural networks. It is
+# also significantly cheaper to tune their hyperparameters (the defaults tend
+# to work well while this is not often the case for neural networks).
+#
+# We will plot the partial dependence for some of the numerical and categorical
+# features.
+print("Computing partial dependence plots...")
+tic = time()
+fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(9, 8), constrained_layout=True)
+pdp = PartialDependencePlot(
+    hgbdt_model,
+    **features_info,
+    **common_params,
+)
+pdp.fit_importance(X=X_train)
+for i, ax in enumerate(axs.ravel()):
+    _ = pdp.plot(feature_id=i, ax=ax, X=X_train)
+print(f"done in {time() - tic:.3f}s")
+_ = fig.suptitle(
+    (
+        "Partial dependence of the number of bike rentals\n"
+        "for the bike rental dataset with a gradient boosting"
+    ),
+    fontsize=16,
+)
+
+# %%
+# Analysis of the plots
+# ~~~~~~~~~~~~~~~~~~~~~
+#
+# We will first look at the PDPs for the numerical features. For both models, the
+# general trend of the PDP of the temperature is that the number of bike rentals is
+# increasing with temperature. We can make a similar analysis but with the opposite
+# trend for the humidity feature: the number of bike rentals decreases when the
+# humidity increases. Finally, we see the same trend for the wind speed feature. The
+# number of bike rentals is decreasing when the wind speed is increasing for both
+# models. We also observe that :class:`~sklearn.neural_network.MLPRegressor` has much
+# smoother predictions than :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.
+#
+# Now, we will look at the partial dependence plots for the categorical features.
+#
+# We observe that the spring season is the lowest bar for the season feature. With the
+# weather feature, the rain category is the lowest bar. Regarding the hour feature,
+# we see two peaks around 7 am and 6 pm. These findings are in line with the
+# observations we made earlier on the dataset.
+#
+# However, it is worth noting that we are creating potentially meaningless
+# synthetic samples if features are correlated.
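+
+# %%
+# As a quick sanity check (a minimal sketch; the exact values depend on the
+# fitted model), we can also inspect the Greenwell-style importance scores that
+# ``fit_importance`` computed alongside the curves:
+for name, score in zip(features_info["features"], pdp.importances_):
+    print(f"{name}: {score:.3f}")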
+print("Computing partial dependence plots and individual conditional expectation...") +tic = time() +# sphinx_gallery_thumbnail_number = 4 +fig, axs = plt.subplots(ncols=2, figsize=(6, 4), sharey=True, constrained_layout=True) + +features_info = { + "features": ["temp", "humidity"], + # "centered": True, +} + +pdp = PartialDependencePlot( + hgbdt_model, + **features_info, + **common_params, +) +# plot the figure +pdp.fit_importance(X=X_train) +for i, ax in enumerate(axs.ravel()): + _ = pdp.plot(feature_id=i, ax=ax, X=X_train) +print(f"done in {time() - tic:.3f}s") +_ = fig.suptitle("ICE and PDP representations", fontsize=16) + +# %% +# We see that the ICE for the temperature feature gives us some additional information: +# Some of the ICE lines are flat while some others show a decrease of the dependence +# for temperature above 35 degrees Celsius. We observe a similar pattern for the +# humidity feature: some of the ICEs lines show a sharp decrease when the humidity is +# above 80%. +# +# Not all ICE lines are parallel, this indicates that the model finds +# interactions between features. We can repeat the experiment by constraining the +# gradient boosting model to not use any interactions between features using the +# parameter `interaction_cst`: +from sklearn.base import clone + +interaction_cst = [[i] for i in range(X_train.shape[1])] +hgbdt_model_without_interactions = ( + clone(hgbdt_model) + .set_params(histgradientboostingregressor__interaction_cst=interaction_cst) + .fit(X_train, y_train) +) +print(f"Test R2 score: {hgbdt_model_without_interactions.score(X_test, y_test):.2f}") + +# %% +fig, axs = plt.subplots(ncols=2, figsize=(6, 4), sharey=True, constrained_layout=True) + +# features_info["centered"] = False +pdp = PartialDependencePlot( + hgbdt_model_without_interactions, + **features_info, + **common_params, +) +pdp.fit_importance(X=X_train) +for i, ax in enumerate(axs.ravel()): + _ = pdp.plot(feature_id=i, ax=ax, X=X_train) +_ = fig.suptitle("ICE and PDP representations", fontsize=16) + +# %% +# .. _plt_partial_dependence_custom_values: +# +# Custom Inspection Points +# ~~~~~~~~~~~~~~~~~~~~~~~~ +# +# None of the examples so far specify _which_ points are evaluated to create the +# partial dependence plots. By default we use percentiles defined by the input dataset. +# In some cases it can be helpful to specify the exact points where one would like the +# model evaluated. For instance, if a user wants to test the model behavior on +# out-of-distribution data or compare two models that were fit on slightly different +# data. The `custom_values` parameter allows the user to pass in the values that they +# want the model to be evaluated on. This overrides the `grid_resolution` and +# `percentiles` parameters. 
+# Let's return to our gradient boosting example above,
+# but with custom values.
+
+print("Computing partial dependence plots with custom evaluation values...")
+tic = time()
+fig, axs = plt.subplots(ncols=2, figsize=(6, 4), sharey=True, constrained_layout=True)
+
+features_info = {
+    "features": ["temp", "humidity"],
+}
+
+pdp = PartialDependencePlot(
+    hgbdt_model,
+    **features_info,
+    **common_params,
+    # we set custom values for the temp feature only -
+    # all other features are evaluated based on the data
+    custom_values={"temp": np.linspace(0, 40, 10)},
+)
+pdp.fit_importance(X=X_train)
+for i, ax in enumerate(axs.ravel()):
+    _ = pdp.plot(feature_id=i, ax=ax, X=X_train)
+print(f"done in {time() - tic:.3f}s")
+_ = fig.suptitle(
+    (
+        "Partial dependence of the number of bike rentals\n"
+        "for the bike rental dataset with a gradient boosting"
+    ),
+    fontsize=16,
+)
+plt.show()
diff --git a/src/hidimstat/__init__.py b/src/hidimstat/__init__.py
index 3bb1ef1bf..3ff85084c 100644
--- a/src/hidimstat/__init__.py
+++ b/src/hidimstat/__init__.py
@@ -25,6 +25,11 @@
 from .statistical_tools.aggregation import quantile_aggregation
 
+# marginal methods
+from .marginal import PartialDependencePlot  # re-exported so it appears in the documentation
+from .marginal import PartialDependencePlot as PDP
+
+
 try:
     from ._version import __version__
 except ImportError:
@@ -49,4 +54,6 @@
     "CFI",
     "LOCO",
     "PFI",
+    # marginal methods
+    "PDP",
 ]
diff --git a/src/hidimstat/marginal/__init__.py b/src/hidimstat/marginal/__init__.py
new file mode 100644
index 000000000..2483ff3a1
--- /dev/null
+++ b/src/hidimstat/marginal/__init__.py
@@ -0,0 +1,3 @@
+from .partial_dependence_plot import PartialDependencePlot
+
+__all__ = ["PartialDependencePlot"]
diff --git a/src/hidimstat/marginal/partial_dependence_plot.py b/src/hidimstat/marginal/partial_dependence_plot.py
new file mode 100644
index 000000000..e9e2310d1
--- /dev/null
+++ b/src/hidimstat/marginal/partial_dependence_plot.py
@@ -0,0 +1,838 @@
+from collections.abc import Iterable
+from copy import deepcopy
+from tqdm import tqdm
+import warnings
+
+from joblib import Parallel, delayed
+import numpy as np
+from scipy import sparse
+from scipy.stats.mstats import mquantiles
+from sklearn.base import is_classifier, is_regressor
+from sklearn.utils import _safe_indexing, check_array, check_random_state
+from sklearn.utils._indexing import (
+    _determine_key_type,
+    _get_column_indices,
+    _safe_assign,
+)
+from sklearn.utils._response import _get_response_values
+from sklearn.utils.validation import _check_sample_weight, check_is_fitted
+from sklearn.inspection._pd_utils import _check_feature_names, _get_feature_index
+
+from hidimstat.base_variable_importance import (
+    BaseVariableImportance,
+)
+
+
+def _grid_from_X(
+    X,
+    percentiles,
+    is_categorical,
+    grid_resolution,
+    custom_values,
+    statistical_resolution=False,
+):
+    """
+    Generate a grid of points based on the percentiles of X.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_target_features)
+        The data from which to generate the grid.
+    percentiles : a pair of float (p1, p2)
+        The lower and upper percentiles to use for the grid boundaries.
+        Must satisfy 0 <= p1 < p2 <= 1.
+    is_categorical : list of bool
+        For each feature, indicates whether it is categorical or not.
+        For categorical features, unique values are used instead of percentiles.
+    grid_resolution : int
+        Number of equally spaced points for the grid.
+        Must be greater than 1.
+ custom_values : dict + Mapping from column index of X to array-like of custom values + to use for that feature instead of the generated grid. + statistical_resolution : bool, default=False + If True, uses quantiles for grid points instead of equally spaced points + between percentile boundaries. + + Returns + ------- + values : list of 1d ndarrays + List containing the unique grid values for each feature. + Each array has length <= grid_resolution. + indexes : list of 1d ndarrays + For each feature, contains the indices mapping each sample in X + to its position in the grid. + + Notes + ----- + Based on scikit-learn's _grid_from_X implementation: + https://github.com/scikit-learn/scikit-learn/blob/c5497b7f7eacfaff061cf68e09bcd48aa93d4d6b/sklearn/inspection/_partial_dependence.py#L40 + """ + assert ( + isinstance(percentiles, Iterable) and len(percentiles) == 2 + ), "'percentiles' must be a sequence of 2 elements." + assert all( + 0 <= x <= 1 for x in percentiles + ), "'percentiles' values must be in [0, 1]." + assert ( + percentiles[0] < percentiles[1] + ), "percentiles[0] must be strictly less than percentiles[1]." + assert grid_resolution > 1, "'grid_resolution' must be strictly greater than 1." + + def _convert_custom_values(values): + # Convert custom types such that object types are always used for string arrays + dtype = object if any(isinstance(v, str) for v in values) else None + return np.asarray(values, dtype=dtype) + + custom_values = {k: _convert_custom_values(v) for k, v in custom_values.items()} + if any(v.ndim != 1 for v in custom_values.values()): + error_string = ", ".join( + f"Feature {k}: {v.ndim} dimensions" + for k, v in custom_values.items() + if v.ndim != 1 + ) + + raise ValueError( + "The custom grid for some features is not a one-dimensional array. " + f"{error_string}" + ) + + values = [] + indexes = [] + # TODO: we should handle missing values (i.e. `np.nan`) specifically and store them + # in a different Bunch attribute. + for feature_idx, is_cat in enumerate(is_categorical): + data = _safe_indexing(X, feature_idx, axis=1) + if feature_idx in custom_values: + # Use values in the custom range + axis = custom_values[feature_idx] + else: + try: + uniques = np.unique(data) + except TypeError as exc: + # `np.unique` will fail in the presence of `np.nan` and `str` categories + # due to sorting. Temporary, we reraise an error explaining the problem. + raise ValueError( + f"The column #{feature_idx} contains mixed data types. Finding unique " + "categories fail due to sorting. It usually means that the column " + "contains `np.nan` values together with `str` categories. Such use " + "case is not yet supported in scikit-learn." + ) from exc + + if is_cat or uniques.shape[0] < grid_resolution: + # Use the unique values either because: + # - feature has low resolution use unique values + # - feature is categorical + axis = uniques + else: + # create axis based on percentiles and grid resolution + if statistical_resolution: + axis = np.unique( + mquantiles( + data, + prob=np.linspace(0.0, 1.0, grid_resolution + 1), + axis=0, + ) + ) + else: + emp_percentiles = mquantiles(data, prob=percentiles, axis=0) + if np.allclose(emp_percentiles[0], emp_percentiles[1]): + raise ValueError( + "percentiles are too close to each other, " + "unable to build the grid. Please choose percentiles " + "that are further apart." 
+                    )
+                axis = np.linspace(
+                    emp_percentiles[0],
+                    emp_percentiles[1],
+                    num=grid_resolution,
+                    endpoint=True,
+                )
+        values.append(axis)
+        if not is_cat:
+            # -1 shifts np.digitize's bin indices so that they start at 0
+            digitize = np.digitize(data, axis, right=True) - 1
+        else:
+            # map string categories to integers so np.digitize can handle them
+            string_to_num = {s: i for i, s in enumerate(sorted(set(data)))}
+            digitize = np.digitize(
+                np.array([string_to_num[s] for s in data]),
+                [string_to_num[s] for s in axis],
+                right=True,
+            )
+        indexes.append(np.clip(digitize, 0, None))
+    return values, indexes
+
+
+def _partial_dependence_brute(est, grid, features, X, method, n_jobs):
+    """
+    Calculate partial dependence using the brute force method.
+    For each value in `grid`, replaces the target features with that value for all samples
+    in X, makes predictions, and averages them. This computes the mean model response
+    across the data distribution for each grid point.
+
+    Parameters
+    ----------
+    est : BaseEstimator
+        A fitted estimator implementing predict, predict_proba, or decision_function.
+        Multioutput-multiclass classifiers are not supported.
+    grid : list of 1d arrays
+        List containing grid values for each target feature. Each array contains the
+        values where partial dependence will be evaluated.
+    features : array-like of int
+        Indices of the features for which to compute partial dependence.
+    X : array-like of shape (n_samples, n_features)
+        Training data used to compute predictions.
+    method : {'auto', 'predict_proba', 'decision_function'}, default='auto'
+        Method to get model predictions:
+        - 'auto': uses predict_proba if available, otherwise decision_function
+        - 'predict_proba': uses predict_proba
+        - 'decision_function': uses decision_function
+        For regressors, always uses predict.
+    n_jobs : int
+        Number of parallel jobs to run.
+
+    Returns
+    -------
+    list of arrays
+        List containing arrays of predictions for each feature, averaged over X.
+        For each feature:
+        - Shape (n_grid_points, n_samples) for regression/binary classification
+        - Shape (n_grid_points, n_samples, n_classes) for multiclass
+    """
+    if method == "auto":
+        method = (
+            "predict" if is_regressor(est) else ["predict_proba", "decision_function"]
+        )
+    predictions = Parallel(n_jobs=n_jobs)(
+        delayed(_joblib_get_predictions)(variable, values, X, est, method)
+        for variable, values in tqdm(zip(features, grid))
+    )
+    return predictions
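+
+
+# A minimal sketch of how the two helpers above combine (illustrative only;
+# `model` is assumed to be a fitted regressor and `X` a float 2d array):
+#
+#     values, _ = _grid_from_X(X[:, [0]], (0.05, 0.95), [False], 20, {})
+#     preds = _partial_dependence_brute(model, values, [0], X.copy(), "auto", 1)
+#     # preds[0] stacks one prediction vector per grid value; averaging over
+#     # the samples axis yields the PDP curve, the columns are the ICE curves.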
+
+
+def _joblib_get_predictions(variable, values, X, est, method):
+    """
+    Helper function to get predictions for a single feature, for parallel processing.
+
+    Parameters
+    ----------
+    variable : int
+        Index of the feature for which to get predictions.
+    values : array-like
+        Grid values for the target feature.
+    X : array-like of shape (n_samples, n_features)
+        Training data used to compute predictions.
+    est : BaseEstimator
+        A fitted estimator implementing predict, predict_proba, or decision_function.
+    method : str or list of str
+        Method to get model predictions: 'predict', 'predict_proba', or 'decision_function'.
+
+    Returns
+    -------
+    list of array
+        List of predictions for each grid point, each with shape:
+        - (n_samples,) for regression
+        - (n_samples, 1) for binary classification
+        - (n_samples, n_classes) for multiclass
+    """
+    predictions = []
+    for new_values in values:
+        _safe_assign(X, new_values, column_indexer=variable)
+
+        # Note: predictions is of shape
+        # (n_points,) for non-multioutput regressors
+        # (n_points, n_tasks) for multioutput regressors
+        # (n_points, 1) for the regressors in cross_decomposition
+        # (n_points, 1) for binary classification (positive class already selected)
+        # (n_points, n_classes) for multiclass classification
+        pred, _ = _get_response_values(est, X, response_method=method)
+        if len(pred.shape) > 1 and pred.shape[1] > 1:
+            # for multiclass outputs, keep the largest class response so that a
+            # single value is returned per sample and grid point
+            pred = np.max(pred, axis=1)
+        predictions.append(np.squeeze(pred))
+    return predictions
+
+
+class PartialDependencePlot(BaseVariableImportance):
+    """
+    Partial Dependence Plot (PDP) :footcite:t:`friedman2001greedy` for analyzing
+    feature effects on model predictions. It is built on individual conditional
+    expectation (ICE) curves :footcite:t:`goldstein2015peeking`.
+    PDP shows the average model prediction across different values of target features,
+    while marginalizing over the values of all other features. It helps understand
+    how features affect predictions on average. ICE curves show predictions for
+    individual samples as the feature value changes.
+    Feature importance scores are computed following :footcite:t:`greenwell2018simple`:
+
+    - For continuous features: standard deviation of the PDP curve
+    - For categorical features: range of the PDP values divided by 4
+
+    Parameters
+    ----------
+    estimator : BaseEstimator
+        A fitted estimator object implementing predict, predict_proba, or
+        decision_function. Must be a regressor or binary/multiclass classifier.
+        Multioutput-multiclass classifiers are not supported.
+    features : array-like of {int, str}, default=None
+        Features for which to compute partial dependence. Can be:
+        - Single feature: int or str
+        - Multiple features: list of int or str
+        Feature indices or names must match training data.
+    method : {'auto', 'predict_proba', 'decision_function'}, default='auto'
+        Method for getting predictions:
+        - 'auto': tries predict_proba first, falls back to decision_function
+        - 'predict_proba': uses predicted probabilities
+        - 'decision_function': uses decision function scores
+        Ignored for regressors, which always use predict.
+    sample_weight : array-like of shape (n_samples,), default=None
+        Sample weights for computing weighted averages of predictions.
+        If None, samples are weighted equally.
+    categorical_features : array-like, default=None
+        Specifies categorical features. Can be:
+        - None: no categorical features
+        - boolean array: mask indicating categorical features
+        - array of int/str: indices/names of categorical features
+    feature_names : array-like of str, default=None
+        Names of features in the training data.
+        If None, uses array indices for NumPy arrays or column names for pandas.
+    percentiles : tuple of float, default=(0.05, 0.95)
+        Lower and upper percentiles for grid boundaries.
+        Used for continuous features if custom_values is not provided.
+        Must be in [0, 1].
+    grid_resolution : int, default=100
+        Number of grid points for continuous features.
+        Higher values give more granular curves but increase computation time.
+        Ignored if custom_values is provided.
+    custom_values : dict, default=None
+        Custom grid values for features.
Dictionary mapping feature index/name + to array of values to evaluate. + Overrides percentiles and grid_resolution for specified features. + statistical_resolution : bool, default=False + If True, uses quantile-based grid points instead of evenly spaced points. + Can better capture feature distribution. + n_jobs : int, default=1 + Number of CPU cores to use for parallel processing. + -1 means using all processors. + + Attributes + ---------- + importances_ : ndarray of shape (n_features,) + Computed feature importance scores based on PDP variance + ices_ : list of arrays + Individual Conditional Expectation curves for each feature + values_ : list of arrays + Grid values used for each feature + + See Also + -------- + sklearn.inspection.partial_dependence : Similar functionality in scikit-learn + + References + ---------- + .. footbibliography:: + """ + + def __init__( + self, + estimator, + features=None, + method: str = "auto", + sample_weight=None, + categorical_features=None, + feature_names=None, + percentiles=(0.05, 0.95), + grid_resolution=100, + custom_values=None, + statistical_resolution=False, + n_jobs: int = 1, + ): + super().__init__() + check_is_fitted(estimator) + self.estimator = estimator + if not (is_classifier(self.estimator) or is_regressor(self.estimator)): + raise ValueError("'estimator' must be a fitted regressor or classifier.") + + if is_classifier(self.estimator) and isinstance( + self.estimator.classes_[0], np.ndarray + ): + raise ValueError("Multiclass-multioutput estimators are not supported") + + if is_regressor(self.estimator) and (method != "auto" and method != "predict"): + raise ValueError( + "The method parameter is ignored for regressors and " + "must be 'auto' or 'predict'." + ) + self.features = features + self.method = method + self.sample_weight = sample_weight + self.categorical_features = categorical_features + self.feature_names = feature_names + self.percentiles = percentiles + self.grid_resolution = grid_resolution + self.custom_values = custom_values + self.statistical_resolution = statistical_resolution + self.n_jobs = n_jobs + + def fit(self, X=None, y=None): + """ + Fits the PartialDependencePlot model. This method has no effect as PDP + only needs a fitted estimator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + (Not used) Training data. Not used, present here for API consistency. + y : array-like of shape (n_samples,) + (Not used) Target values. Not used, present here for API consistency. + + Returns + ------- + self : object + Returns the instance itself. + """ + if X is not None: + warnings.warn("X won't be used") + if y is not None: + warnings.warn("y won't be used") + return self + + def _set_environment(self, X_): + """ + Set up environment variables needed for importance calculation. + + Parameters + ---------- + X_ : array-like of shape (n_samples, n_features) + Input data on which to compute feature importance. + + Returns + ------- + tuple + Contains: + - X_subset : array-like, subset of X_ with only target features + - custom_values_for_X_subset : dict, custom values for target features + + Raises + ------ + ValueError + If features indices are negative. + ValueError + If categorical_features is empty list. + ValueError + If categorical_features boolean mask has wrong length. + ValueError + If categorical_features has invalid dtype. + ValueError + If features contain integer data. 
+ """ + if self.sample_weight is not None: + self.sample_weight = _check_sample_weight(self.sample_weight, X_) + + if _determine_key_type(self.features, accept_slice=False) == "int": + # _get_column_indices() supports negative indexing. Here, we limit + # the indexing to be positive. The upper bound will be checked + # by _get_column_indices() + if np.any(np.less(self.features, 0)): + raise ValueError( + "all features must be in [0, {}]".format(X_.shape[1] - 1) + ) + + if self.features is None: + self.features = list(range(X_.shape[1])) + self.features_indices = np.asarray( + _get_column_indices(X_, self.features), dtype=np.intp, order="C" + ).ravel() + + self.feature_names = _check_feature_names(X_, self.feature_names) + + n_features = X_.shape[1] + if self.categorical_features is None: + self.is_categorical = [False] * len(self.features_indices) + else: + categorical_features = np.asarray(self.categorical_features) + if categorical_features.size == 0: + raise ValueError( + "Passing an empty list (`[]`) to `categorical_features` is not " + "supported. Use `None` instead to indicate that there are no " + "categorical features." + ) + if categorical_features.dtype.kind == "b": + # categorical features provided as a list of boolean + if categorical_features.size != n_features: + raise ValueError( + "When `categorical_features` is a boolean array-like, " + "the array should be of shape (n_features,). Got " + f"{categorical_features.size} elements while `X` contains " + f"{n_features} features." + ) + self.is_categorical = [ + categorical_features[idx] for idx in self.features_indices + ] + elif categorical_features.dtype.kind in ("i", "O", "U"): + # categorical features provided as a list of indices or feature names + categorical_features_idx = [ + _get_feature_index(cat, feature_names=self.feature_names) + for cat in categorical_features + ] + self.is_categorical = [ + idx in categorical_features_idx for idx in self.features_indices + ] + else: + raise ValueError( + "Expected `categorical_features` to be an array-like of boolean," + f" integer, or string. Got {categorical_features.dtype} instead." + ) + + custom_values = self.custom_values or {} + if isinstance(self.features, (str, int)): + self.features = [self.features] + + for feature_idx, feature, is_cat in zip( + self.features_indices, self.features, self.is_categorical + ): + if is_cat: + continue + + if _safe_indexing(X_, feature_idx, axis=1).dtype.kind in "iu": + # TODO(1.9): raise a ValueError instead. + raise ValueError( + f"The column {feature!r} contains integer data. Partial " + "dependence plots are not supported for integer data: this " + "can lead to implicit roun[val.shape[0] for val in self.values_]ding with NumPy arrays or even errors " + "with newer pandas versions. Please convert numerical features" + "to floating point dtypes ahead of time to avoid problems. " + ) + + X_subset = _safe_indexing(X_, self.features_indices, axis=1) + + custom_values_for_X_subset = { + index: custom_values.get(feature) + for index, feature in enumerate(self.features) + if feature in custom_values + } + return X_subset, custom_values_for_X_subset + + def importance(self, X, y=None): + """ + Calculate partial dependence importance scores for each feature. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to compute partial dependence on. Must have same features + as training data. + y : array-like of shape (n_samples,) + (not used) Target values. Kept for API compatibility. 
+
+        Returns
+        -------
+        ndarray of shape (n_features,)
+            Importance scores for each feature based on partial dependence.
+            Higher values indicate greater importance.
+
+        Raises
+        ------
+        ValueError
+            If features indices are negative.
+        ValueError
+            If categorical_features is an empty list.
+        ValueError
+            If the categorical_features boolean mask has the wrong length.
+        ValueError
+            If categorical_features has an invalid dtype.
+        ValueError
+            If features contain integer data.
+        """
+        assert self.importances_ is None, "Partial Dependence Plot already fitted"
+
+        if y is not None:
+            warnings.warn("y won't be used")
+        # Use check_array only on lists and other non-array-likes / sparse. Do not
+        # convert DataFrame into a NumPy array.
+        if not (hasattr(X, "__array__") or sparse.issparse(X)):
+            # TODO: pass ensure_all_finite="allow-nan" once scikit-learn >= 1.6 is
+            # required; the option is omitted here for backward compatibility.
+            X_ = deepcopy(check_array(X, dtype=object))
+        else:
+            X_ = deepcopy(X)
+        X_subset, custom_values_for_X_subset = self._set_environment(X_)
+
+        self.values_, _ = _grid_from_X(
+            X_subset,
+            self.percentiles,
+            self.is_categorical,
+            self.grid_resolution,
+            custom_values_for_X_subset,
+            self.statistical_resolution,
+        )
+
+        self.ices_ = _partial_dependence_brute(
+            self.estimator,
+            self.values_,
+            self.features_indices,
+            X_,
+            self.method,
+            self.n_jobs,
+        )
+
+        averaged_predictions_continuous = []
+        averaged_predictions_categorical = []
+        # average over samples
+        for index, ice in enumerate(self.ices_):
+            if not self.is_categorical[index]:
+                averaged_predictions_continuous.append(
+                    np.average(ice, axis=1, weights=self.sample_weight)
+                )
+            else:
+                averaged_predictions_categorical.append(
+                    np.average(ice, axis=1, weights=self.sample_weight)
+                )
+
+        # compute importance from equation 4 of greenwell2018simple
+        self.importances_ = np.zeros_like(self.is_categorical, dtype=float)
+        # importance for continuous features: standard deviation of the PDP curve
+        if len(averaged_predictions_continuous) > 0:
+            importance_continuous = []
+            for averaged_prediction in averaged_predictions_continuous:
+                importance_continuous.append(
+                    np.sqrt(
+                        np.sum(
+                            (averaged_prediction - np.mean(averaged_prediction)) ** 2
+                        )
+                        / (len(averaged_prediction) - 1)
+                    )
+                )
+            self.importances_[np.logical_not(self.is_categorical)] = (
+                importance_continuous
+            )
+        # importance for categorical features: range of the PDP curve divided by 4
+        if len(averaged_predictions_categorical) > 0:
+            importance_categories = []
+            for averaged_prediction in averaged_predictions_categorical:
+                importance_categories.append(
+                    (np.max(averaged_prediction) - np.min(averaged_prediction)) / 4
+                )
+            self.importances_[self.is_categorical] = importance_categories
+        self.pvalues_ = None
+
+        return self.importances_
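+
+    # A minimal usage sketch (illustrative only; it assumes `model` is a fitted
+    # estimator and `X` the corresponding design matrix):
+    #
+    #     pdp = PartialDependencePlot(model, features=[0, 1])
+    #     scores = pdp.importance(X)
+    #
+    # For continuous features `scores` holds the standard deviation of the PDP
+    # curve; for categorical features it holds (max - min) / 4 of the averaged
+    # curve, following Greenwell et al. (2018).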
+ """ + if y is not None: + warnings.warn("y won't be used") + if cv is not None: + warnings.warn("cv won't be used") + self.fit() + return self.importance(X) + + def plot( + self, + feature_id, + ax=None, + X=None, + n_bins=5, + percentage_ice=1.0, + random_state=None, + **kwargs, + ): + """ + Plot partial dependence and ICE curves for a given feature. + + Parameters + ---------- + feature_id : int + Index of the feature to plot from the features used to compute PDP. + ax : matplotlib.axes.Axes, default=None + Pre-existing axes for the plot. If None, generates new figure and axes. + X : array-like, default=None + Training data used to compute the quantiles and feature distribution. + If provided, adds rugplot and quantile indicators to visualization. + nbins : int, default=5 + Number of bins/quantiles to show in the plot when X is provided. + percentage_ice : float, default=1.0 + Proportion of ICE curves to plot, between 0 and 1. + Lower values reduce visual clutter. + random_state : int or RandomState, default=None + Controls random sampling of ICE curves when percentage_ice < 1. + **kwargs : dict + Additional keyword arguments passed to plot function. + + Returns + ------- + matplotlib.axes.Axes + The axes containing the plot. + + Raises + ------ + Exception + If seaborn is not installed. + + Notes + ----- + For continuous features: + - Blue lines show individual ICE curves + - Black line shows averaged PDP curve + - Rug plot shows data distribution + - Top axis shows percentile values + For categorical features: + - Box plot shows distribution of predictions per category + """ + try: + import seaborn as sns + import matplotlib.pyplot as plt + except ImportError: + raise Exception("You need to install seabor for using this functionnality") + assert ( + percentage_ice <= 1.0 and percentage_ice >= 0.0 + ), "percentage of ice need to be 0 and 1" + self._check_importance() + + rng = check_random_state(random_state) + # subsample ice + ice_lines_idx = rng.choice( + len(self.ices_[feature_id][0]), + int(len(self.ices_[feature_id][0]) * percentage_ice), + replace=False, + ) + # centers = np.array(self.values_[1:] + self.values_[:-1]) / 2 + if ax is None: + fig, ax = plt.subplots() + # add the percentage of the quantiles distributions + if not self.is_categorical[feature_id]: + # plot ice + ax.plot( + np.array(self.values_[feature_id]), + np.array(self.ices_[feature_id])[:, ice_lines_idx], + color="lightblue", + alpha=0.5, + linewidth=0.5, + ) + # plot pdp + ax.plot( + self.values_[feature_id], + np.average( + self.ices_[feature_id], + axis=1, + weights=self.sample_weight, + ), + color="black", + **kwargs, + ) + if X is not None: + data = (_safe_indexing(X, self.features_indices[feature_id], axis=1),) + _ax_quantiles( + ax, + np.unique( + np.quantile(data, np.linspace(0, 1, n_bins), method="lower") + ), + ) + # add distribution of value + sns.rugplot(data, ax=ax, alpha=0.2, legend=False) + ax.set_xlabel(self.feature_names[feature_id]) + else: + ax.boxplot( + self.ices_[feature_id], + positions=np.arange(len(self.values_[feature_id])), + ) + ax.set_xticks(np.arange(len(self.values_[feature_id]))) + ax.set_xticklabels( + self.values_[feature_id], rotation=90, fontdict={"fontsize": 8} + ) + if X is not None: + # add an histogram of the number of element by categoris + # in bottom of the figure + ax_histx = ax.inset_axes([0, -0.45, 1, 0.2], sharex=ax) + ax_histx.hist( + _safe_indexing(X, self.features_indices[feature_id], axis=1), + histtype="bar", + bins=np.arange(len(self.values_[feature_id]) + 
+
+
+def _ax_quantiles(ax, quantiles, twin="x"):
+    """
+    Add quantile percentage labels on a twin axis.
+
+    Parameters
+    ----------
+    ax : matplotlib.axes.Axes
+        The matplotlib axis to add quantile labels to.
+    quantiles : array-like
+        The quantile values to label.
+    twin : {'x', 'y'}, default='x'
+        Which axis to add the twin axis and labels to:
+        - 'x': Add labels on the top x-axis
+        - 'y': Add labels on the right y-axis
+
+    Raises
+    ------
+    ValueError
+        If twin is not 'x' or 'y'.
+
+    Notes
+    -----
+    Creates a twin axis with percentage labels (0-100%) corresponding to the
+    quantile values. Useful for showing the distribution of data alongside
+    the actual values.
+    """
+    assert twin in ("x", "y"), "'twin' should be one of 'x' or 'y'."
+
+    # Duplicate the 'opposite' axis so we can define a distinct set of ticks for the
+    # desired axis (`twin`).
+    ax_mod = ax.twiny() if twin == "x" else ax.twinx()
+
+    # Set the new axis' ticks for the desired axis.
+    getattr(ax_mod, "set_{twin}ticks".format(twin=twin))(quantiles)
+
+    # Calculate tick label percentage values for each quantile (bin edge).
+    percentages = (
+        100 * np.arange(len(quantiles), dtype=np.float64) / (len(quantiles) - 1)
+    )
+
+    # If there is a fractional part, add a decimal place to show (part of) it.
+    fractional = (~np.isclose(percentages % 1, 0)).astype("int8")
+
+    # Set the corresponding tick labels.
+    getattr(ax_mod, "set_{twin}ticklabels".format(twin=twin))(
+        [
+            "{0:0.{1}f}%".format(percent, format_fraction)
+            for percent, format_fraction in zip(percentages, fractional)
+        ],
+        color="#545454",
+        fontsize=7,
+    )
+    getattr(ax_mod, "set_{twin}lim".format(twin=twin))(
+        getattr(ax, "get_{twin}lim".format(twin=twin))()
+    )
diff --git a/test/baseline/test_plot_pdp.png b/test/baseline/test_plot_pdp.png
new file mode 100644
index 000000000..7f70ef61b
Binary files /dev/null and b/test/baseline/test_plot_pdp.png differ
diff --git a/test/baseline/test_plot_pdp_not_fig.png b/test/baseline/test_plot_pdp_not_fig.png
new file mode 100644
index 000000000..defe47329
Binary files /dev/null and b/test/baseline/test_plot_pdp_not_fig.png differ
diff --git a/test/marginal/test_partial_dependence_plot.py b/test/marginal/test_partial_dependence_plot.py
new file mode 100644
index 000000000..c04980822
--- /dev/null
+++ b/test/marginal/test_partial_dependence_plot.py
@@ -0,0 +1,652 @@
+from copy import deepcopy
+import sys
+
+import numpy as np
+import pytest
+from sklearn.exceptions import NotFittedError
+from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier
+from sklearn.cluster import FeatureAgglomeration
+from sklearn.multioutput import MultiOutputClassifier
+
+from hidimstat import PDP
+
+
+def seaborn_installed():
+    try:
+        import seaborn
+
+        return True
+    except ImportError:
+        return False
+
+
+def configure_linear_categorial_pdp(X, y):
+    """
+    Configure a Partial Dependence Plot (PDP) model with linear regression
+    for feature importance analysis.
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Input data matrix where each column represents a feature
+        and each row a sample.
+    y : array-like of shape (n_samples,)
+        Target variable array.
+ Returns + ------- + importance : array-like + Array containing importance scores for each feature. + Higher values indicate greater feature importance in predicting + the target variable. + Notes + ----- + The function performs the following steps: + 1. Fits a linear regression model on training data + 2. Configures PDP with linear regression + 3. Calculates feature importance using the test set + The PDP method is a marginal methods scoring with linear + regression as the base model. + """ + # create and fit a linear regression model on the training set + regression_model = LinearRegression() + regression_model.fit(X, y) + + # instantiate CPI model with linear regression imputer + pdp = PDP( + estimator=regression_model, + n_jobs=1, + ) + # calculate feature importance using the test set + importance = pdp.importance(X) + return np.array(importance) + + +parameter_exact = [ + ("HiDim", 150, 200, 1, 0.0, 42, 1.0, np.inf, 0.0), + ("HiDim with noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.0), + ("HiDim with correlated noise", 150, 200, 1, 0.0, 42, 1.0, 10.0, 0.5), + ("HiDim with correlated features", 150, 200, 1, 0.8, 42, 1.0, np.inf, 0.0), + ("HiDim with high level noise", 150, 200, 10, 0.2, 42, 1.0, 0.5, 0.0), +] + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_exact))[1:])), + ids=list(zip(*parameter_exact))[0], +) +def test_pdp_linear_data_exact(data_generator): + """Tests the method on linear cases with noise and correlation""" + X, y, important_features, _ = data_generator + + importance = configure_linear_categorial_pdp(X, y) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.all([int(i) in important_features for i in np.argsort(importance)[-1:]]) + + +parameter_bad_detection = [ + ("HiDim with high correlated features", 150, 200, 1, 1.0, 42, 1.0, 5.0, 0.0), +] + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_bad_detection))[1:])), + ids=list(zip(*parameter_bad_detection))[0], +) +def test_pdp_linear_data_fail(data_generator): + """Tests the method on linear cases with noise and correlation""" + X, y, important_features, _ = data_generator + + importance = configure_linear_categorial_pdp(X, y) + # check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # check that important features have the highest importance scores + assert np.any( + [int(i) not in important_features for i in np.argsort(importance)[-1:]] + ) + + +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + zip(*(list(zip(*parameter_exact))[1:])), + ids=list(zip(*parameter_exact))[0], +) +def test_pdp_classication(data_generator): + """Test PDP for a classification problem""" + X, y, important_features, not_important_features = data_generator + # Create categories + y_clf = deepcopy(y) + y_clf[np.where(y > 2)] = 0 + y_clf[np.where(np.logical_and(y <= 2, y > 0))] = 1 + y_clf[np.where(np.logical_and(y <= 0, y > -2))] = 2 + y_clf[np.where(y <= -2)] = 3 + y_clf = np.array(y_clf, dtype=int) + + # Create and fit a logistic regression model on the training set + logistic_model = LogisticRegression() + logistic_model.fit(X, y_clf) + + pdp = PDP( + estimator=logistic_model, + n_jobs=1, + 
method="predict_proba", + ) + importance = pdp.fit_importance(X) + + # Check that importance scores are defined for each feature + assert importance.shape == (X.shape[1],) + # Check that important features have higher mean importance scores + assert ( + importance[important_features].mean() + > importance[not_important_features].mean() + ) + + +# test plot of the pdp +@pytest.mark.skipif(not seaborn_installed(), reason="seaborn is not installed") +@pytest.mark.skipif( + sys.version_info < (3, 12), + reason="Don't test retrocompatibility for the generation of figures.", +) +@pytest.mark.mpl_image_compare +def test_plot_pdp(): + """Test simple plot of pdp""" + import matplotlib.pyplot as plt + + seed = 42 + n_samples = 150 + rng = np.random.default_rng(seed) + X_cont = rng.random((n_samples, 2)) + X_cat = np.concatenate( + [ + rng.integers(low=0, high=3, size=(n_samples, 1)), + rng.integers(low=0, high=5, size=(n_samples, 1)), + ], + axis=1, + ) + X = np.hstack([X_cont, X_cat]) + categories_features = np.ones(X.shape[1], dtype=bool) + categories_features[: X_cont.shape[1]] = False + y = rng.random((n_samples, 1)) + fitted_model = LinearRegression().fit(X, y) + + pdp = PDP(estimator=fitted_model, categorical_features=categories_features) + pdp.fit_importance(X) + + fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(9, 8), constrained_layout=True) + for i in range(4): + pdp.plot(feature_id=i, X=X, ax=axs.ravel()[i]) + return fig + + +@pytest.mark.skipif(not seaborn_installed(), reason="seaborn is not installed") +@pytest.mark.skipif( + sys.version_info < (3, 12), + reason="Don't test retrocompatibility for the generation of figures.", +) +@pytest.mark.mpl_image_compare +def test_plot_pdp_not_fig(): + """Test plot when the figure is not define before""" + seed = 42 + n_samples = 150 + rng = np.random.default_rng(seed) + X_cont = rng.random((n_samples, 2)) + X_cat = np.concatenate( + [ + rng.integers(low=0, high=3, size=(n_samples, 1)), + rng.integers(low=0, high=5, size=(n_samples, 1)), + ], + axis=1, + ) + X = np.hstack([X_cont, X_cat]) + categories_features = np.ones(X.shape[1], dtype=bool) + categories_features[: X_cont.shape[1]] = False + y = rng.random((n_samples, 1)) + fitted_model = LinearRegression().fit(X, y) + + pdp = PDP(estimator=fitted_model, categorical_features=categories_features) + pdp.fit_importance(X) + + ax = pdp.plot(feature_id=0, X=X) + return ax.get_figure() + + +############################################################################## +@pytest.mark.parametrize( + "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial", + [(150, 200, 1, 0.0, 42, 1.0, 0.0, 0.0)], + ids=["default data"], +) +class TestPDPClass: + """Test the element of the class""" + + def test_pdp_init(self, data_generator): + """Test PDP initialization""" + X, y, _, _ = data_generator + fitted_model = LinearRegression().fit(X, y) + pdp = PDP( + estimator=fitted_model, + method="predict", + ) + assert pdp.n_jobs == 1 + assert pdp.method == "predict" + + def test_pdp_importance(self, data_generator): + """Test importance function PDP""" + X, y, _, _ = data_generator + fitted_model = LinearRegression().fit(X, y) + pdp = PDP( + estimator=fitted_model, + ) + # Test importance + pdp.importance(X.tolist()) + assert len(pdp.importances_) > 0 + assert len(pdp.ices_) > 0 + assert pdp.pvalues_ is None + + def test_pdp_resolution_stat(self, data_generator): + """Test resolution statistic PDP""" + X, y, _, _ = data_generator + fitted_model = LinearRegression().fit(X, y) + pdp = 
+        # Test importance
+        pdp.importance(X)
+        assert len(pdp.importances_) > 0
+        assert len(pdp.ices_) > 0
+        assert pdp.pvalues_ is None
+
+    def test_pdp_custom_values(self, data_generator):
+        """Test custom values"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, custom_values={1: [0.1, 0.2, 0.3]})
+        # Test importance
+        pdp.importance(X)
+        assert len(pdp.importances_) > 0
+        assert len(pdp.ices_) > 0
+        assert pdp.pvalues_ is None
+
+    def test_pdp_sample_weight(self, data_generator):
+        """Test sample weight"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, sample_weight=np.random.rand(150))
+        # Test importance
+        pdp.importance(X)
+        assert len(pdp.importances_) > 0
+        assert len(pdp.ices_) > 0
+        assert pdp.pvalues_ is None
+
+    def test_pdp_categories(self, data_generator):
+        """Test categorical features given as a boolean mask"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(
+            estimator=fitted_model, categorical_features=np.ones(200, dtype=bool)
+        )
+        # Test importance
+        pdp.importance(X)
+        assert len(pdp.importances_) > 0
+        assert len(pdp.ices_) > 0
+        assert pdp.pvalues_ is None
+
+    def test_pdp_categories_2(self, data_generator):
+        """Test categorical features given as integer indices"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, categorical_features=range(200))
+        # Test importance
+        pdp.importance(X)
+        assert len(pdp.importances_) > 0
+        assert len(pdp.ices_) > 0
+        assert pdp.pvalues_ is None
+
+    def test_pdp_features(self, data_generator):
+        """Test restricting the analysis to a single feature"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, features=0)
+        # Test importance
+        pdp.importance(X)
+        assert len(pdp.importances_) > 0
+        assert len(pdp.ices_) > 0
+        assert pdp.pvalues_ is None
+
+    def test_pdp_categorical(
+        self,
+        n_samples,
+        n_features,
+        support_size,
+        rho,
+        seed,
+        value,
+        signal_noise_ratio,
+        rho_serial,
+    ):
+        """Test PDP with categorical variables"""
+        rng = np.random.default_rng(seed)
+        X_cont = rng.random((n_samples, 2))
+        X_cat = np.concatenate(
+            [
+                rng.integers(low=0, high=3, size=(n_samples, 1)),
+                rng.integers(low=0, high=5, size=(n_samples, 1)),
+            ],
+            axis=1,
+        )
+        X = np.hstack([X_cont, X_cat])
+        categories_features = np.ones(X.shape[1], dtype=bool)
+        categories_features[: X_cont.shape[1]] = False
+        y = rng.random((n_samples, 1))
+        fitted_model = LinearRegression().fit(X, y)
+
+        pdp = PDP(estimator=fitted_model, categorical_features=categories_features)
+
+        importances = pdp.fit_importance(X)
+        assert len(importances) == 4
+        assert np.all(importances < 0.1)  # not informative: worse than a dummy model
+
+
+##############################################################################
+@pytest.mark.parametrize(
+    "n_samples, n_features, support_size, rho, seed, value, signal_noise_ratio, rho_serial",
+    [(150, 200, 10, 0.0, 42, 1.0, 0.0, 0.0)],
+    ids=["default data"],
+)
+class TestPDPExceptions:
+    """Test class for PDP exceptions"""
+
+    def test_unfitted_estimator(self, data_generator):
+        """Test when using an unfitted estimator"""
+        with pytest.raises(NotFittedError):
+            PDP(
+                estimator=LinearRegression(),
+                method="predict",
+            )
+
+    def test_unknown_wrong_method_regression(self, data_generator):
+        """Test when a wrong prediction method is provided"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
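+        # regressors only accept method="auto" or "predict";
+        # asking for "predict_proba" must raise a ValueError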
+
+        with pytest.raises(
+            ValueError,
+            match="The method parameter is ignored for regressors and "
+            "must be 'auto' or 'predict'.",
+        ):
+            PDP(
+                estimator=fitted_model,
+                method="predict_proba",
+            )
+
+    def test_unknown_predict_method_regression(self, data_generator):
+        """Test when the prediction method is not valid for a regressor"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+
+        with pytest.raises(
+            ValueError,
+            match="The method parameter is ignored for regressors and must be 'auto' or 'predict'.",
+        ):
+            PDP(
+                estimator=fitted_model,
+                method="unknown method",
+            )
+
+    def test_unknown_predict_method_classification(self, data_generator):
+        """Test when an unknown prediction method is provided"""
+        X, y, _, _ = data_generator
+        y_clf = deepcopy(y)
+        y_clf[np.where(y > 2)] = 0
+        y_clf[np.where(np.logical_and(y <= 2, y > 0))] = 1
+        y_clf[np.where(np.logical_and(y <= 0, y > -2))] = 2
+        y_clf[np.where(y <= -2)] = 3
+        y_clf = np.array(y_clf, dtype=int)
+        fitted_model = RidgeClassifier().fit(X, y_clf)
+        pdp = PDP(
+            estimator=fitted_model,
+            method="unknown method",
+        )
+
+        with pytest.raises(
+            AttributeError,
+            match="RidgeClassifier has none of the following attributes: unknown method",
+        ):
+            pdp.importance(X)
+
+    def test_unknown_predict_method_multioutput(self, data_generator):
+        """Test that multiclass-multioutput estimators are rejected"""
+        X, y, _, _ = data_generator
+        y_clf = deepcopy(y)
+        y_clf[np.where(y > 2)] = 0
+        y_clf[np.where(np.logical_and(y <= 2, y > 0))] = 1
+        y_clf[np.where(np.logical_and(y <= 0, y > -2))] = 2
+        y_clf[np.where(y <= -2)] = 3
+        y_clf = np.array(y_clf, dtype=int)
+        fitted_model = MultiOutputClassifier(RidgeClassifier()).fit(
+            X, np.array([y_clf, y_clf]).T
+        )
+
+        with pytest.raises(
+            ValueError,
+            match="Multiclass-multioutput estimators are not supported",
+        ):
+            PDP(
+                estimator=fitted_model,
+            )
+
+    def test_wrong_estimator(self, data_generator):
+        """Test with a wrong type of estimator"""
+        X, y, _, _ = data_generator
+        fitted_model = FeatureAgglomeration().fit(X, y)
+
+        with pytest.raises(
+            ValueError, match="'estimator' must be a fitted regressor or classifier."
+        ):
+            PDP(estimator=fitted_model)
+
+    def test_no_importance_double(self, data_generator):
+        """Test that importance cannot be computed twice"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model)
+        pdp.fit_importance(X=X)
+
+        with pytest.raises(
+            AssertionError, match="Partial Dependance Plot already fitted"
+        ):
+            pdp.importance(X=X)
+
+    def test_fit_warning(self, data_generator):
+        """Test the warnings raised by the fit method"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model)
+
+        with pytest.warns(match="y won't be used"):
+            pdp.fit(y=y)
+
+        with pytest.warns(match="X won't be used"):
+            pdp.fit(X=X)
+
+        with pytest.warns(match="X won't be used"):
+            with pytest.warns(match="y won't be used"):
+                pdp.fit(X=X, y=y)
+
+    def test_fit_importance_warning(self, data_generator):
+        """Test the warnings raised by the importance methods"""
+        X, y, _, _ = data_generator
+        pdp = PDP(estimator=LinearRegression().fit(X, y))
+        with pytest.warns(match="y won't be used"):
+            pdp.importance(X=X, y=y)
+        pdp = PDP(estimator=LinearRegression().fit(X, y))
+        with pytest.warns(match="y won't be used"):
+            pdp.fit_importance(X=X, y=y)
+        pdp = PDP(estimator=LinearRegression().fit(X, y))
+        with pytest.warns(match="y won't be used"):
+            with pytest.warns(match="cv won't be used"):
+                pdp.fit_importance(X=X, y=y, cv=[])
+
+    def test_percentage_sequence(self, data_generator):
+        """Test the importance method when 'percentiles' is a sequence of more
+        than 2 elements"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, percentiles=[2, 3, 4])
+
+        with pytest.raises(
+            AssertionError, match="'percentiles' must be a sequence of 2 elements."
+        ):
+            pdp.importance(X=X)
+
+    def test_percentage_no_0_1(self, data_generator):
+        """Test the importance method with percentile values outside [0, 1]"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, percentiles=[2, 3])
+
+        with pytest.raises(
+            AssertionError, match=r"'percentiles' values must be in \[0, 1\]."
+        ):
+            pdp.importance(X=X)
+
+    def test_percentage_no_right_order(self, data_generator):
+        """Test the importance method with percentiles in the wrong order"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, percentiles=[0.7, 0.3])
+
+        with pytest.raises(
+            AssertionError,
+            match=r"percentiles\[0\] must be strictly less than percentiles\[1\].",
+        ):
+            pdp.importance(X=X)
+
+    def test_feature_wrong(self, data_generator):
+        """Test the importance method with out-of-range feature indices"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, features=[0, 1, 3, -10])
+
+        with pytest.raises(ValueError, match=r"all features must be in \[0, "):
+            pdp.importance(X=X)
+
+    def test_empty_categorical_features(self, data_generator):
+        """Test an empty list for categorical features"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, categorical_features=[])
+
+        with pytest.raises(
+            ValueError,
+            match="supported. Use `None` instead to indicate that there are no ",
+        ):
+            pdp.importance(X=X)
+
+    def test_empty_categorical_features_binary(self, data_generator):
+        """Test a boolean categorical mask that does not cover every feature"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(
+            estimator=fitted_model, categorical_features=np.ones(50, dtype=bool)
+        )
+
+        with pytest.raises(
+            ValueError,
+            match="When `categorical_features` is a boolean array-like, ",
+        ):
+            pdp.importance(X=X)
+
+    def test_empty_categorical_features_wrong_type(self, data_generator):
+        """Test categorical features of a wrong type"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(
+            estimator=fitted_model, categorical_features=np.ones(200, dtype=float)
+        )
+
+        with pytest.raises(
+            ValueError,
+            match="Expected `categorical_features` to be an array-like of boolean, ",
+        ):
+            pdp.importance(X=X)
+
+    def test_grid(self, data_generator):
+        """Test the importance method with a wrong grid resolution"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, grid_resolution=-10)
+
+        with pytest.raises(
+            AssertionError, match="'grid_resolution' must be strictly greater than 1."
+        ):
+            pdp.importance(X=X)
+
+    def test_percentiles_close(self, data_generator):
+        """Test the importance method with percentiles that are too close"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, percentiles=[0.1, 0.100001])
+
+        with pytest.raises(
+            ValueError, match="percentiles are too close to each other,"
+        ):
+            pdp.importance(X=X)
+
+    def test_wrong_custom_value(self, data_generator):
+        """Test custom values that are not one-dimensional"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(estimator=fitted_model, custom_values={1: [[1, 2, 3], [4, 5, 6]]})
+
+        with pytest.raises(
+            ValueError,
+            match="The custom grid for some features is not a one-dimensional array.",
+        ):
+            pdp.importance(X=X)
+
+    def test_wrong_type(self, data_generator):
+        """Test object data that cannot be sorted"""
+        X = np.array(["A", "B", "C", np.nan], dtype=object).reshape(-1, 1)
+        y = np.array([0, 1, 0, 1])
+
+        from sklearn.preprocessing import OrdinalEncoder
+        from sklearn.pipeline import make_pipeline
+
+        fitted_model = make_pipeline(
+            OrdinalEncoder(encoded_missing_value=-1),
+            LogisticRegression(),
+        ).fit(X, y)
+
+        pdp = PDP(
+            estimator=fitted_model,
+        )
+
+        with pytest.raises(
+            ValueError, match="Finding unique categories fail due to sorting."
+        ):
+            pdp.importance(X=X)
+
+    def test_reject_integer_data(self, data_generator):
+        """Integer features should be declared as categorical"""
+        X = np.arange(8).reshape(4, 2)
+        y = np.array([0, 1, 0, 1])
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(
+            estimator=fitted_model,
+        )
+        with pytest.raises(
+            ValueError,
+            match="Partial dependence plots are not supported for integer data: this ",
+        ):
+            pdp.importance(X=X)
+
+    @pytest.mark.skipif(not seaborn_installed(), reason="seaborn is not installed")
+    def test_unfitted_importance(self, data_generator):
+        """Test calling plot before computing the importance"""
+        X, y, _, _ = data_generator
+        fitted_model = LinearRegression().fit(X, y)
+        pdp = PDP(
+            estimator=fitted_model,
+        )
+
+        with pytest.raises(
+            ValueError, match="The importances need to be called before."
+        ):
+            pdp.plot(feature_id=0, X=X)
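+
+
+# ----------------------------------------------------------------------------
+# Usage sketch (illustrative comments only, not collected by pytest): the
+# lines below restate the public PDP workflow exercised by the tests above,
+# assuming `X` and `y` are plain arrays and nothing about the internals beyond
+# what the tests themselves check.
+#
+#     model = LinearRegression().fit(X, y)  # PDP needs an already-fitted model
+#     pdp = PDP(estimator=model)            # optionally: categorical_features=...
+#     importances = pdp.fit_importance(X)   # one marginal score per feature
+#     ax = pdp.plot(feature_id=0, X=X)      # valid only after the importances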