Add NA validation for survey strata/PSU/cluster IDs and fix results label consistency from PR #218 review (round 15)

igerber · claude · igerber · commit d12bab26d943 · 2026-03-21T09:26:51.000-04:00
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/results.py b/diff_diff/results.py
@@ -34,9 +34,9 @@ class DiDResults:
     n_obs : int
         Number of observations used in estimation.
     n_treated : int
-        Number of treated observations.
+        Number of treated units/observations.
     n_control : int
-        Number of control observations.
+        Number of control units/observations.
     """
 
     att: float
@@ -93,8 +93,8 @@ def summary(self, alpha: Optional[float] = None) -> str:
             "=" * 70,
             "",
             f"{'Observations:':<25} {self.n_obs:>10}",
-            f"{'Treated obs:':<25} {self.n_treated:>10}",
-            f"{'Control obs:':<25} {self.n_control:>10}",
+            f"{'Treated:':<25} {self.n_treated:>10}",
+            f"{'Control:':<25} {self.n_control:>10}",
         ]
 
         if self.r_squared is not None:
@@ -312,9 +312,9 @@ class MultiPeriodDiDResults:
     n_obs : int
         Number of observations used in estimation.
     n_treated : int
-        Number of treated observations.
+        Number of treated units/observations.
     n_control : int
-        Number of control observations.
+        Number of control units/observations.
     pre_periods : list
         List of pre-treatment period identifiers.
     post_periods : list
@@ -645,9 +645,9 @@ class SyntheticDiDResults:
     n_obs : int
         Number of observations used in estimation.
     n_treated : int
-        Number of treated observations.
+        Number of treated units/observations.
     n_control : int
-        Number of control observations.
+        Number of control units/observations.
     unit_weights : dict
         Dictionary mapping control unit IDs to their synthetic weights.
     time_weights : dict
@@ -714,8 +714,8 @@ def summary(self, alpha: Optional[float] = None) -> str:
             "=" * 75,
             "",
             f"{'Observations:':<25} {self.n_obs:>10}",
-            f"{'Treated obs:':<25} {self.n_treated:>10}",
-            f"{'Control obs:':<25} {self.n_control:>10}",
+            f"{'Treated:':<25} {self.n_treated:>10}",
+            f"{'Control:':<25} {self.n_control:>10}",
             f"{'Pre-treatment periods:':<25} {len(self.pre_periods):>10}",
             f"{'Post-treatment periods:':<25} {len(self.post_periods):>10}",
         ]
diff --git a/diff_diff/survey.py b/diff_diff/survey.py
@@ -128,7 +128,13 @@ def resolve(self, data: pd.DataFrame) -> "ResolvedSurveyDesign":
         if self.strata is not None:
             if self.strata not in data.columns:
                 raise ValueError(f"Strata column '{self.strata}' not found in data")
-            strata_arr = _factorize_cluster_ids(data[self.strata].values)
+            strata_vals = data[self.strata].values
+            if pd.isna(strata_vals).any():
+                raise ValueError(
+                    f"Strata column '{self.strata}' contains missing values. "
+                    "All observations must have valid strata identifiers."
+                )
+            strata_arr = _factorize_cluster_ids(strata_vals)
             n_strata = len(np.unique(strata_arr))
 
         # --- PSU ---
@@ -138,6 +144,11 @@ def resolve(self, data: pd.DataFrame) -> "ResolvedSurveyDesign":
             if self.psu not in data.columns:
                 raise ValueError(f"PSU column '{self.psu}' not found in data")
             psu_raw = data[self.psu].values
+            if pd.isna(psu_raw).any():
+                raise ValueError(
+                    f"PSU column '{self.psu}' contains missing values. "
+                    "All observations must have valid PSU identifiers."
+                )
 
             if self.nest and strata_arr is not None:
                 # Make PSU IDs unique within strata by combining
@@ -440,6 +451,14 @@ def _inject_cluster_as_psu(resolved, cluster_ids):
     if resolved.psu is not None:
         return resolved  # PSU already present; _resolve_effective_cluster handles this
 
+    # Validate no missing cluster IDs before factorization
+    if pd.isna(cluster_ids).any():
+        raise ValueError(
+            "Cluster IDs contain missing values. "
+            "All observations must have valid cluster identifiers "
+            "when used as effective PSUs for survey variance estimation."
+        )
+
     # Factorize cluster_ids for consistent integer encoding
     codes, uniques = pd.factorize(cluster_ids)
     n_clusters = len(uniques)
diff --git a/tests/test_survey.py b/tests/test_survey.py
@@ -2824,3 +2824,51 @@ def test_multiperiod_bootstrap_survey_fallback(self):
                 survey_design=sd,
             )
         assert np.isfinite(result.avg_att)
+
+
+class TestRound15Fixes:
+    """Tests for PR #218 review round 15: NA validation for survey identifiers."""
+
+    def test_strata_with_na_rejected(self):
+        """SurveyDesign.resolve() rejects NA values in strata column."""
+        df = pd.DataFrame(
+            {
+                "y": [1.0, 2.0, 3.0, 4.0],
+                "w": [1.0, 1.0, 1.0, 1.0],
+                "strat": [0, 1, None, 0],  # NA in strata
+            }
+        )
+        sd = SurveyDesign(weights="w", weight_type="pweight", strata="strat")
+        with pytest.raises(ValueError, match="Strata column.*missing values"):
+            sd.resolve(df)
+
+    def test_psu_with_na_rejected(self):
+        """SurveyDesign.resolve() rejects NA values in PSU column."""
+        df = pd.DataFrame(
+            {
+                "y": [1.0, 2.0, 3.0, 4.0],
+                "w": [1.0, 1.0, 1.0, 1.0],
+                "cluster": [0, 1, np.nan, 0],  # NA in PSU
+            }
+        )
+        sd = SurveyDesign(weights="w", weight_type="pweight", psu="cluster")
+        with pytest.raises(ValueError, match="PSU column.*missing values"):
+            sd.resolve(df)
+
+    def test_cluster_as_psu_with_na_rejected(self):
+        """_inject_cluster_as_psu rejects NA values in cluster IDs."""
+        from diff_diff.survey import _inject_cluster_as_psu
+
+        resolved = ResolvedSurveyDesign(
+            weights=np.ones(4),
+            weight_type="pweight",
+            strata=None,
+            psu=None,
+            fpc=None,
+            n_strata=0,
+            n_psu=0,
+            lonely_psu="remove",
+        )
+        cluster_ids = np.array([0, 1, np.nan, 0])
+        with pytest.raises(ValueError, match="Cluster IDs contain missing"):
+            _inject_cluster_as_psu(resolved, cluster_ids)