Merge pull request #345 from DoubleML/p-edits-cs-did

SvenKlaassen · web-flow · commit efdb8a6b5d94 · 2025-07-07T08:03:03.000+02:00
Edits on CS DID PR
diff --git a/doubleml/did/datasets/dgp_did_cs_CS2021.py b/doubleml/did/datasets/dgp_did_cs_CS2021.py
@@ -97,8 +97,8 @@ def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambd
 
            P(G_i = g) = \\frac{1}{G} \\text{ for all } g
 
-    7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated indivials is increased
-    to `n_obs/lambda_t`, where `lambda_t` denotes the pobability to observe a unit at each time period (time constant).
+    7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated individuals is increased
+    to `n_obs/lambda_t`, where `lambda_t` denotes the probability of observing a unit at each time period (time constant).
     for each
 
 
@@ -133,7 +133,8 @@ def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambd
         Whether to include units that are never treated.
 
     lambda_t : float, default=0.5
-        Probability of observing a unit at each time period.
+        Probability of observing a unit at each time period. Note that internally `n_obs/lambda_t` individuals are
+        generated, of which only a fraction `lambda_t` is observed at each time period (see Step 7 in the DGP description).
 
     time_type : str, default="datetime"
         Type of time variable. Either "datetime" or "float".
diff --git a/doubleml/did/did.py b/doubleml/did/did.py
@@ -37,7 +37,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str
@@ -47,7 +47,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML):
         Default is ``'observational'``.
 
     in_sample_normalization : bool
-        Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020).
+        Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020).
         Default is ``True``.
 
     trimming_rule : str
diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py
@@ -70,7 +70,7 @@ class DoubleMLDIDBinary(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str
@@ -80,7 +80,7 @@ class DoubleMLDIDBinary(LinearScoreMixin, DoubleML):
         Default is ``'observational'``.
 
     in_sample_normalization : bool
-        Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020).
+        Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020).
         Default is ``True``.
 
     trimming_rule : str
diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py
@@ -37,7 +37,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str
@@ -47,7 +47,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML):
         Default is ``'observational'``.
 
     in_sample_normalization : bool
-        Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020).
+        Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020).
         Default is ``True``.
 
     trimming_rule : str
diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py
@@ -28,6 +28,78 @@
 
 
 class DoubleMLDIDCSBinary(LinearScoreMixin, DoubleML):
+    """Double machine learning for difference-in-differences models with repeated cross sections (binary setting in terms of group and time
+        combinations).
+
+        Parameters
+        ----------
+        obj_dml_data : :class:`DoubleMLPanelData` object
+            The :class:`DoubleMLPanelData` object providing the data and specifying the variables for the causal model.
+
+        g_value : int
+            The value indicating the treatment group (first period with treatment).
+            Default is ``None``. This implements the case for the smallest, non-zero value of G.
+
+        t_value_pre : int
+            The value indicating the baseline pre-treatment period.
+
+        t_value_eval : int
+            The value indicating the period for evaluation.
+
+        ml_g : estimator implementing ``fit()`` and ``predict()``
+            A machine learner implementing ``fit()`` and ``predict()`` methods (e.g.
+            :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`g_0(d,X) = E[Y_1-Y_0|D=d, X]`.
+            For a binary outcome variable :math:`Y` (with values 0 and 1), a classifier implementing ``fit()`` and
+            ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``,
+            ``predict_proba()`` is used, otherwise ``predict()``.
+
+        ml_m : classifier implementing ``fit()`` and ``predict_proba()``
+            A machine learner implementing ``fit()`` and ``predict_proba()`` methods (e.g.
+            :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance function :math:`m_0(X) = P(D=1|X)`.
+            Only relevant for ``score='observational'``.
+
+        control_group : str
+            Specifies the control group. Either ``'never_treated'`` or ``'not_yet_treated'``.
+            Default is ``'never_treated'``.
+
+        anticipation_periods : int
+            Number of anticipation periods. Default is ``0``.
+
+        n_folds : int
+            Number of folds.
+            Default is ``5``.
+
+        n_rep : int
+            Number of repetitions for the sample splitting.
+            Default is ``1``.
+
+        score : str
+            A str (``'observational'`` or ``'experimental'``) specifying the score function.
+            The ``'experimental'`` score refers to an A/B setting, where the treatment is independent
+            from the pretreatment covariates.
+            Default is ``'observational'``.
+
+        in_sample_normalization : bool
+            Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020).
+            Default is ``True``.
+
+        trimming_rule : str
+            A str (``'truncate'`` is the only choice) specifying the trimming approach.
+            Default is ``'truncate'``.
+
+        trimming_threshold : float
+            The threshold used for trimming.
+            Default is ``1e-2``.
+
+        draw_sample_splitting : bool
+            Indicates whether the sample splitting should be drawn during initialization of the object.
+            Default is ``True``.
+
+        print_periods : bool
+            Indicates whether to print information about the evaluated periods.
+            Default is ``False``.
+
+        """
 
     def __init__(
         self,
diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py
@@ -46,7 +46,7 @@ class DoubleMLAPO(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str or callable
diff --git a/doubleml/irm/cvar.py b/doubleml/irm/cvar.py
@@ -54,7 +54,7 @@ class DoubleMLCVAR(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str
diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py
@@ -45,7 +45,7 @@ class DoubleMLIIVM(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str or callable
diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py
@@ -47,7 +47,7 @@ class DoubleMLIRM(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str or callable
diff --git a/doubleml/irm/lpq.py b/doubleml/irm/lpq.py
@@ -49,7 +49,7 @@ class DoubleMLLPQ(NonLinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str
diff --git a/doubleml/irm/pq.py b/doubleml/irm/pq.py
@@ -56,7 +56,7 @@ class DoubleMLPQ(NonLinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str
diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py
@@ -39,7 +39,7 @@ class DoubleMLQTE:
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str
diff --git a/doubleml/irm/ssm.py b/doubleml/irm/ssm.py
@@ -39,7 +39,7 @@ class DoubleMLSSM(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str or callable
diff --git a/doubleml/plm/pliv.py b/doubleml/plm/pliv.py
@@ -45,7 +45,7 @@ class DoubleMLPLIV(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str or callable
diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py
@@ -44,7 +44,7 @@ class DoubleMLPLR(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str or callable
diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py
@@ -50,7 +50,7 @@ class RDFlex:
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     cutoff : float or int