cyrraz · 0ctagon · Jul 19, 2023 · Jul 18, 2023 · Jul 18, 2023 · Jul 19, 2023
diff --git a/docs/img/1d_comparison.svg b/docs/img/1d_comparison.svg
diff --git a/docs/img/1d_elt1.svg b/docs/img/1d_elt1.svg
diff --git a/docs/img/1d_elt2.svg b/docs/img/1d_elt2.svg
diff --git a/docs/img/1d_elt3.svg b/docs/img/1d_elt3.svg
diff --git a/docs/img/1d_hist_simple.svg b/docs/img/1d_hist_simple.svg
diff --git a/docs/img/2d_hist_correlations.svg b/docs/img/2d_hist_correlations.svg
diff --git a/docs/img/2d_hist_simple.svg b/docs/img/2d_hist_simple.svg
diff --git a/docs/img/2d_hist_uneven.svg b/docs/img/2d_hist_uneven.svg
diff --git a/docs/img/hep_examples_dataMC_flatten2D.svg b/docs/img/hep_examples_dataMC_flatten2D.svg
diff --git a/docs/img/hep_examples_dataMC_pull.svg b/docs/img/hep_examples_dataMC_pull.svg
diff --git a/docs/img/hep_examples_dataMC_stacked.svg b/docs/img/hep_examples_dataMC_stacked.svg
diff --git a/docs/img/hep_examples_dataMC_unstacked.svg b/docs/img/hep_examples_dataMC_unstacked.svg
diff --git a/plothist/default_style.mplstyle b/plothist/default_style.mplstyle
@@ -54,7 +54,7 @@ ytick.major.size    : 6
 xtick.minor.size    : 3
 ytick.minor.size    : 3
 
-xtick.major.pad     : 5.
+xtick.major.pad     : 7.
 ytick.major.pad     : 5.
 axes.axisbelow      : False
 image.cmap          : viridis

diff --git a/plothist/hep_plotters.py b/plothist/hep_plotters.py
@@ -2,6 +2,7 @@
 Collection of functions to plot histograms in the context of High Energy Physics
 """
 import numpy as np
+import scipy.stats as stats
 import matplotlib.pyplot as plt
 import matplotlib as mpl
 from plothist.plotters import (
@@ -11,6 +12,7 @@
     compare_two_hist,
     plot_comparison,
     create_comparison_figure,
+    _hist_ratio_variances,
 )
 
 
@@ -31,7 +33,7 @@ def compare_data_mc(
     **comparison_kwargs,
 ):
     """
-    Compare data to MC simulations.
+    Compare data to MC simulations. The data uncertainties are computed using the Poisson confidence interval.
 
     Parameters
     ----------
@@ -78,6 +80,10 @@ def compare_data_mc(
     plot_comparison : Plot the comparison between data and MC simulations.
 
     """
+    comparison_kwargs.setdefault("h1_label", "Data")
+    comparison_kwargs.setdefault("h2_label", "Pred.")
+    comparison_kwargs.setdefault("comparison", "ratio")
+    comparison_kwargs.setdefault("ratio_uncertainty", "split")
 
     if fig is None and ax_main is None and ax_comparison is None:
         fig, (ax_main, ax_comparison) = create_comparison_figure()
@@ -92,6 +98,8 @@ def compare_data_mc(
         if signal_hist:
             signal_hist = _flatten_2d_hist(signal_hist)
 
+    mc_hist_total = sum(mc_hist_list)
+
     plot_mc(
         mc_hist_list,
         signal_hist=signal_hist,
@@ -104,12 +112,56 @@ def compare_data_mc(
         flatten_2d_hist=False,  # Already done
     )
 
-    plot_error_hist(data_hist, ax=ax_main, color="black", label="Data")
+    uncertainties_low, uncertainties_high = _get_poisson_uncertainties(data_hist)
+
+    if comparison_kwargs["comparison"] == "pull":
+        data_variances = np.where(
+            data_hist.values() >= mc_hist_total.values(),
+            uncertainties_low**2,
+            uncertainties_high**2,
+        )
+        data_hist[:] = np.stack([data_hist.values(), data_variances], axis=-1)
+    elif comparison_kwargs["comparison"] == "ratio":
+        if comparison_kwargs["ratio_uncertainty"] == "split":
+            np.seterr(divide="ignore", invalid="ignore")
+            # Compute asymmetrical uncertainties to plot_comparison()
+            comparison_kwargs.setdefault(
+                "yerr",
+                [
+                    uncertainties_low / mc_hist_total.values(),
+                    uncertainties_high / mc_hist_total.values(),
+                ],
+            )
+            np.seterr(divide="warn", invalid="warn")
+        elif comparison_kwargs["ratio_uncertainty"] == "uncorrelated":
+            data_hist_high = data_hist.copy()
+            data_hist_high[:] = np.stack(
+                [data_hist_high.values(), uncertainties_high**2], axis=-1
+            )
+            data_hist_low = data_hist.copy()
+            data_hist_low[:] = np.stack(
+                [data_hist_low.values(), uncertainties_low**2], axis=-1
+            )
+            # Compute asymmetrical uncertainties to plot_comparison()
+            comparison_kwargs.setdefault(
+                "yerr",
+                [
+                    np.sqrt(_hist_ratio_variances(data_hist_low, mc_hist_total)),
+                    np.sqrt(_hist_ratio_variances(data_hist_high, mc_hist_total)),
+                ],
+            )
+
+    plot_error_hist(
+        data_hist,
+        ax=ax_main,
+        yerr=[uncertainties_low, uncertainties_high],
+        color="black",
+        label="Data",
+    )
 
     _ = ax_main.xaxis.set_ticklabels([])
 
     # Plot MC statistical uncertainty
-    mc_hist_total = sum(mc_hist_list)
     mc_uncertainty = np.sqrt(mc_hist_total.variances())
     ax_main.bar(
         x=mc_hist_total.axes[0].centers,
@@ -125,21 +177,12 @@ def compare_data_mc(
 
     ax_main.legend()
 
-    # Update the default comparison settings
-    default_comparison_kwargs = {
-        "h1_label": "Data",
-        "h2_label": "Pred.",
-        "comparison": "ratio",
-        "ratio_uncertainty": "split",
-    }
-    default_comparison_kwargs.update(comparison_kwargs)
-
     plot_comparison(
         data_hist,
         mc_hist_total,
         ax=ax_comparison,
         xlabel=xlabel,
-        **default_comparison_kwargs,
+        **comparison_kwargs,
     )
 
     if save_as is not None:
@@ -148,6 +191,18 @@ def compare_data_mc(
     return fig, ax_main, ax_comparison
 
 
+def _get_poisson_uncertainties(data_hist):
+    """
+    Get Poisson asymmetrical uncertainties for a histogram.
+    """
+    conf_level = 0.682689492
+    alpha = 1.0 - conf_level
+    n = data_hist.values()
+    uncertainties_low = n - stats.gamma.ppf(alpha / 2, n, scale=1)
+    uncertainties_high = stats.gamma.ppf(1 - alpha / 2, n + 1, scale=1) - n
+    return uncertainties_low, uncertainties_high
+
+
 def plot_mc(
     mc_hist_list,
     signal_hist=None,

diff --git a/plothist/plotters.py b/plothist/plotters.py
@@ -265,15 +265,15 @@ def plot_hist(hist, ax, **kwargs):
         ax.hist(
             x=hist.axes[0].centers,
             bins=hist.axes[0].edges,
-            weights=hist.values(),
+            weights=np.nan_to_num(hist.values(), 0),
             **kwargs,
         )
     else:
         # Multiple histograms
         ax.hist(
             x=[h.axes[0].centers for h in hist],
             bins=hist[0].axes[0].edges,
-            weights=[h.values() for h in hist],
+            weights=[np.nan_to_num(h.values(), 0) for h in hist],
             **kwargs,
         )
 
@@ -303,7 +303,6 @@ def plot_2d_hist(hist, ax, pcolormesh_kwargs={}, colorbar_kwargs={}):
 
 
 def plot_error_hist(hist, ax, **kwargs):
-    # TODO: Allow the user to provide xerr, yerr, fmt themselves.
     """
     Create an errorbar plot from a boost histogram.
 
@@ -316,12 +315,12 @@ def plot_error_hist(hist, ax, **kwargs):
     **kwargs
         Additional keyword arguments forwarded to ax.errorbar().
     """
+    kwargs.setdefault("yerr", np.sqrt(hist.variances()))
+    kwargs.setdefault("fmt", ".")
+
     ax.errorbar(
         x=hist.axes[0].centers,
-        xerr=None,
         y=hist.values(),
-        yerr=np.sqrt(hist.variances()),
-        fmt=".",
         **kwargs,
     )
 
@@ -417,6 +416,42 @@ def compare_two_hist(
     return fig, ax_main, ax_comparison
 
 
+def _hist_ratio_variances(hist_1, hist_2):
+    """
+    Calculate the variances of the ratio of two histograms (hist_1/hist_2).
+
+    Parameters
+    ----------
+    hist_1 : boost_histogram.Histogram
+        The first histogram.
+    hist_2 : boost_histogram.Histogram
+        The second histogram.
+
+    Returns
+    -------
+    variances : np.ndarray
+        The variances of the ratio of the two histograms.
+
+    Raises
+    ------
+    ValueError
+        If the bins of the histograms are not equal.
+    """
+    if not np.all(hist_1.axes[0].edges == hist_2.axes[0].edges):
+        raise ValueError("The bins of the histograms must be equal.")
+
+    np.seterr(divide="ignore", invalid="ignore")
+    ratio_variances = np.where(
+        hist_2.values() != 0,
+        hist_1.variances() / hist_2.values() ** 2
+        + hist_2.variances() * hist_1.values() ** 2 / hist_2.values() ** 4,
+        np.nan,
+    )
+    np.seterr(divide="warn", invalid="warn")
+
+    return ratio_variances
+
+
 def plot_comparison(
     hist_1,
     hist_2,
@@ -428,6 +463,7 @@ def plot_comparison(
     comparison_ylabel=None,
     comparison_ylim=None,
     ratio_uncertainty="uncorrelated",
+    **plot_hist_kwargs,
 ):
     """
     Plot the comparison between two histograms.
@@ -453,7 +489,9 @@ def plot_comparison(
     comparison_ylim : tuple or None, optional
         The y-axis limits for the comparison plot. Default is None.
     ratio_uncertainty : str, optional
-        How to treat the uncertainties of the histograms when comparison = "ratio" ("uncorrelated" for simple comparison, "split" for scaling and split hist_1 and hist_2 uncertainties). This argument has no effect if comparison != "ration". Default is "uncorrelated".
+        How to treat the uncertainties of the histograms when comparison = "ratio" ("uncorrelated" for simple comparison, "split" for scaling and split hist_1 and hist_2 uncertainties). This argument has no effect if comparison != "ratio". Default is "uncorrelated".
+    **plot_hist_kwargs : optional
+        Arguments to be passed to plot_hist() or plot_error_hist(), called in case the comparison is "pull" or "ratio", respectively. In case of pull, the default arguments are histtype="stepfilled" and color="darkgrey". In case of ratio, the default argument is color="black".
 
     Returns
     -------
@@ -474,52 +512,45 @@ def plot_comparison(
             hist_2.values() != 0, hist_1.values() / hist_2.values(), np.nan
         )
         if ratio_uncertainty == "split":
-            h1_scaled_uncertainty = np.where(
+            h1_scaled_uncertainties = np.where(
                 hist_2.values() != 0,
                 np.sqrt(hist_1.variances()) / hist_2.values(),
                 np.nan,
             )
-        elif ratio_uncertainty == "uncorrelated":
-            ratio_variance = np.where(
+            h2_scaled_uncertainties = np.where(
                 hist_2.values() != 0,
-                hist_1.variances() / hist_2.values() ** 2
-                + hist_2.variances() * hist_1.values() ** 2 / hist_2.values() ** 4,
+                np.sqrt(hist_2.variances()) / hist_2.values(),
                 np.nan,
             )
+            comparison_variances = h1_scaled_uncertainties**2
+        elif ratio_uncertainty == "uncorrelated":
+            comparison_variances = _hist_ratio_variances(hist_1, hist_2)
         else:
             raise ValueError("ratio_uncertainty not in ['uncorrelated', 'split'].")
-        if ratio_uncertainty == "split":
-            h2_scaled_uncertainty = np.sqrt(hist_2.variances()) / hist_2.values()
-
     elif comparison == "pull":
         comparison_values = np.where(
-            hist_2.values() != 0,
+            hist_1.variances() + hist_2.variances() != 0,
             (hist_1.values() - hist_2.values())
             / np.sqrt(hist_1.variances() + hist_2.variances()),
             np.nan,
         )
-        ratio_variance = np.where(
-            hist_1.values() != 0,
-            1,
-            np.nan,
-        )
+        comparison_variances = np.ones_like(comparison_values)
     else:
         raise ValueError(
             f"{comparison} not available as a comparison (use ratio or pull)."
         )
-
     np.seterr(divide="warn", invalid="warn")
 
-    ax.errorbar(
-        x=hist_2.axes[0].centers,
-        xerr=None,
-        y=comparison_values,
-        yerr=np.sqrt(ratio_variance)
-        if (ratio_uncertainty == "uncorrelated" or comparison == "pull")
-        else h1_scaled_uncertainty,
-        fmt=".",
-        color="black",
-    )
+    hist_comparison = bh.Histogram(hist_2.axes[0], storage=bh.storage.Weight())
+    hist_comparison[:] = np.stack([comparison_values, comparison_variances], axis=-1)
+
+    if comparison == "pull":
+        plot_hist_kwargs.setdefault("histtype", "stepfilled")
+        plot_hist_kwargs.setdefault("color", "darkgrey")
+        plot_hist(hist_comparison, ax=ax, **plot_hist_kwargs)
+    else:
+        plot_hist_kwargs.setdefault("color", "black")
+        plot_error_hist(hist_comparison, ax=ax, **plot_hist_kwargs)
 
     if comparison == "ratio":
         if comparison_ylim is None:
@@ -528,9 +559,11 @@ def plot_comparison(
         if ratio_uncertainty == "split":
             ax.bar(
                 x=hist_2.axes[0].centers,
-                bottom=np.nan_to_num(1 - h2_scaled_uncertainty, nan=comparison_ylim[0]),
+                bottom=np.nan_to_num(
+                    1 - h2_scaled_uncertainties, nan=comparison_ylim[0]
+                ),
                 height=np.nan_to_num(
-                    2 * h2_scaled_uncertainty, nan=2 * comparison_ylim[-1]
+                    2 * h2_scaled_uncertainties, nan=2 * comparison_ylim[-1]
                 ),
                 width=hist_2.axes[0].widths,
                 edgecolor="dimgrey",
@@ -612,7 +645,7 @@ def f(x0, x1):
         def color(lambda_):
             # emphasise either low intensity values (gamma < 1),
             # or high intensity values (γ > 1)
-            lambda_gamma = lambda_ ** gamma
+            lambda_gamma = lambda_**gamma
 
             # Angle and amplitude for the deviation
             # from the black to white diagonal

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "pyyaml>=5.3.1",
     "pandas>=1.3.5",
     "scikit-learn>=1.0.2",
+    "scipy>=1.6.0",
 ]
 
 [project.urls]