|
84 | 84 | _VALID_VCE = ("nn", "hc0", "hc1", "hc2", "hc3") |
85 | 85 |
|
86 | 86 |
|
def _cluster_has_missing(cluster: np.ndarray) -> bool:
    """Detect missing cluster IDs across float / object / string dtypes.

    nprobust::lpbwselect complete-case-filters (x, y, cluster) before
    dispatch. This port deliberately rejects missingness instead so
    callers see it rather than silently losing rows. Used by
    ``lpbwselect_mse_dpi`` and ``lprobust`` (and the public
    ``bias_corrected_local_linear`` wrapper) so all three surfaces
    honor the same contract.

    Parameters
    ----------
    cluster : np.ndarray
        Array of cluster identifiers; any dtype is accepted.

    Returns
    -------
    bool
        ``True`` if any entry is ``None`` or a non-finite number
        (NaN / +-Inf), ``False`` otherwise. Arrays whose entries cannot
        be interpreted as numbers (e.g. plain string labels) and that
        contain no ``None`` / non-finite float are reported as having
        no missing values.
    """
    kind = cluster.dtype.kind
    if kind in ("f", "c"):
        return bool(np.any(~np.isfinite(cluster)))
    if kind in ("b", "i", "u"):
        # Boolean / integer storage cannot encode NaN, Inf, or None, so
        # there is nothing to scan.
        return False
    if kind == "O":
        # Scan element-wise: a single None or non-finite float is enough,
        # even when the other entries are non-numeric labels that would
        # make a whole-array float cast fail (e.g. ["a", nan, "b"]).
        for value in cluster.ravel():
            if value is None:
                return True
            if isinstance(value, (float, np.floating)) and not np.isfinite(value):
                return True
    try:
        # Remaining dtypes (and object arrays holding other numeric
        # types): cast to float and check finiteness.
        cluster_f = cluster.astype(np.float64, copy=False)
    except (TypeError, ValueError, OverflowError):
        # Not convertible to float (labels, or integers too large for a
        # float): no numeric missing-value sentinel can be present.
        return False
    return bool(np.any(~np.isfinite(cluster_f)))
| 112 | + |
| 113 | + |
87 | 114 | # ============================================================================= |
88 | 115 | # Kernel (W.fun, npfunctions.R:1-7) |
89 | 116 | # ============================================================================= |
@@ -684,25 +711,9 @@ def lpbwselect_mse_dpi( |
684 | 711 | # before dispatch; this port deliberately rejects instead so |
685 | 712 | # callers see the missingness rather than lose rows silently. |
686 | 713 | # The "reject" vs "filter" choice is documented in the module |
687 | | - # docstring deviations list. |
688 | | - has_missing = False |
689 | | - if cluster.dtype.kind in ("f", "c"): |
690 | | - has_missing = bool(np.any(~np.isfinite(cluster))) |
691 | | - else: |
692 | | - # object / string / None-containing arrays: treat None and |
693 | | - # NaN-like sentinels as missing. |
694 | | - try: |
695 | | - has_missing = bool(np.any([x is None for x in cluster])) |
696 | | - except TypeError: |
697 | | - has_missing = False |
698 | | - if not has_missing: |
699 | | - try: |
700 | | - # np.nan comparisons are False; use pd-style check. |
701 | | - cluster_f = cluster.astype(np.float64, copy=False) |
702 | | - has_missing = bool(np.any(~np.isfinite(cluster_f))) |
703 | | - except (TypeError, ValueError): |
704 | | - pass |
705 | | - if has_missing: |
| 714 | + # docstring deviations list. Dtype-agnostic via |
| 715 | + # `_cluster_has_missing`. |
| 716 | + if _cluster_has_missing(cluster): |
706 | 717 | raise ValueError( |
707 | 718 | "cluster contains missing values (NaN / None). Unlike " |
708 | 719 | "nprobust::lpbwselect which complete-case-filters " |
@@ -1130,13 +1141,17 @@ def lprobust( |
1130 | 1141 | raise ValueError( |
1131 | 1142 | f"cluster length ({cluster.shape[0]}) does not match x/y ({N})." |
1132 | 1143 | ) |
1133 | | - # Reject NaN cluster IDs (Phase 1b convention: surface missingness |
1134 | | - # rather than silently drop rows). |
1135 | | - cluster_float = np.asarray(cluster, dtype=np.float64).ravel() if np.issubdtype( |
1136 | | - cluster.dtype, np.floating |
1137 | | - ) else None |
1138 | | - if cluster_float is not None and np.any(~np.isfinite(cluster_float)): |
1139 | | - raise ValueError("cluster contains non-finite values (NaN or Inf).") |
| 1144 | + # Dtype-agnostic missingness check. Float NaN/Inf, object None, |
| 1145 | + # and object np.nan all get rejected here (shared with |
| 1146 | + # `lpbwselect_mse_dpi` via `_cluster_has_missing`) so the |
| 1147 | + # downstream `lprobust_vce` cluster grouping on `np.unique` |
| 1148 | + # cannot silently treat a missing sentinel as a real cluster. |
| 1149 | + if _cluster_has_missing(cluster): |
| 1150 | + raise ValueError( |
| 1151 | + "cluster contains missing values (NaN / None). " |
| 1152 | + "Filter your data before the call or drop missing " |
| 1153 | + "observations explicitly." |
| 1154 | + ) |
1140 | 1155 |
|
1141 | 1156 | # --- vce="nn" setup: sort ascending, precompute dups --- |
1142 | 1157 | dups: Optional[np.ndarray] = None |
|
0 commit comments