From a05beaa3be58ec293602e09cfceea70ef1d39b98 Mon Sep 17 00:00:00 2001 From: microslaw Date: Thu, 12 Jun 2025 17:34:38 +0000 Subject: [PATCH 1/4] implement the exception --- pandas/core/sample.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/sample.py b/pandas/core/sample.py index 4f12563e3c5e2..d44409d08abf4 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -150,6 +150,14 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") + if weights is not None: + is_max_weight_dominating = size * weights.max() > 1 + if is_max_weight_dominating and not replace: + raise ValueError( + "Invalid weights: If `replace`=False, " + "total unit probabilities have to be less than 1" + ) + return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( np.intp, copy=False ) From 92122f37f2ef675db492d359082791a7745fa655 Mon Sep 17 00:00:00 2001 From: microslaw Date: Thu, 12 Jun 2025 17:36:39 +0000 Subject: [PATCH 2/4] remove impossible test scenario --- pandas/tests/frame/methods/test_sample.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index a9d56cbfd2b46..95aa0c0f60778 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -113,9 +113,6 @@ def test_sample_invalid_weight_lengths(self, obj): with pytest.raises(ValueError, match=msg): obj.sample(n=3, weights=[0.5] * 11) - with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"): - obj.sample(n=4, weights=Series([0, 0, 0.2])) - def test_sample_negative_weights(self, obj): # Check won't accept negative weights bad_weights = [-0.1] * 10 From 23926f63a9e104dc64440d650ef136645637af85 Mon Sep 17 00:00:00 2001 From: microslaw Date: Thu, 12 Jun 2025 17:37:10 +0000 Subject: [PATCH 3/4] add new tests --- pandas/tests/frame/methods/test_sample.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 95aa0c0f60778..a7a916da14bc4 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -134,6 +134,28 @@ def test_sample_inf_weights(self, obj): with pytest.raises(ValueError, match=msg): obj.sample(n=3, weights=weights_with_ninf) + def test_sample_unit_probabilities_raises(self, obj): + # GH#61516 + high_variance_weights = [1] * 10 + high_variance_weights[0] = 100 + msg = ( + "Invalid weights: If `replace`=False, " + "total unit probabilities have to be less than 1" + ) + with pytest.raises(ValueError, match=msg): + obj.sample(n=2, weights=high_variance_weights, replace=False) + + # edge case, n*max(weights)/sum(weights) == 1 + edge_variance_weights = [1] * 10 + edge_variance_weights[0] = 9 + # should not raise + obj.sample(n=2, weights=edge_variance_weights, replace=False) + + low_variance_weights = [1] * 10 + low_variance_weights[0] = 8 + # should not raise + obj.sample(n=2, weights=low_variance_weights, replace=False) + def test_sample_zero_weights(self, obj): # All zeros raises errors From 11df61335c3b8cb8ae28f0a2cfcf5e79a246cd94 Mon Sep 17 00:00:00 2001 From: microslaw Date: Thu, 12 Jun 2025 22:32:24 +0000 Subject: [PATCH 4/4] update documentation --- doc/source/user_guide/indexing.rst | 4 ++-- doc/source/whatsnew/v0.16.1.rst | 6 +++--- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/generic.py | 2 ++ 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 605f9501c5b23..47ff92c163b01 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -700,7 +700,7 @@ to have different probabilities, you can pass the ``sample`` function sampling w s = pd.Series([0, 1, 2, 3, 4, 5]) example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] - s.sample(n=3, weights=example_weights) + s.sample(n=2, weights=example_weights) # Weights will be re-normalized automatically example_weights2 = [0.5, 0, 0, 0, 0, 0] @@ -714,7 +714,7 @@ as a string. df2 = pd.DataFrame({'col1': [9, 8, 7, 6], 'weight_column': [0.5, 0.4, 0.1, 0]}) - df2.sample(n=3, weights='weight_column') + df2.sample(n=2, weights='weight_column') ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index b376530358f53..be5d0b396fd51 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -196,7 +196,7 @@ facilitate replication. (:issue:`2419`) # weights are accepted. example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] - example_series.sample(n=3, weights=example_weights) + example_series.sample(n=2, weights=example_weights) # weights will also be normalized if they do not sum to one, # and missing values will be treated as zeros. @@ -209,8 +209,8 @@ when sampling from rows. .. ipython:: python - df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) - df.sample(n=3, weights="weight_column") + df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) + df.sample(n=2, weights="weight_column") .. _whatsnew_0161.enhancements.string: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 03a386708323d..0e2de66728e89 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -890,6 +890,7 @@ Other - Bug in :meth:`DataFrame.query` where using duplicate column names led to a ``TypeError``. (:issue:`59950`) - Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`) - Bug in :meth:`DataFrame.query` which raised an exception when querying integer column names using backticks. (:issue:`60494`) +- Bug in :meth:`DataFrame.sample` with ``replace=False`` and ``(n * max(weights) / sum(weights)) > 1``, the method would return biased results. Now raises ``ValueError``. (:issue:`61516`) - Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly. (:issue:`60102`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.sort_values` where sorting by a column explicitly named ``None`` raised a ``KeyError`` instead of sorting by the column as expected. (:issue:`61512`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8aae4609b1833..ec5e105b24020 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5815,6 +5815,8 @@ def sample( If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. + When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1``, + in order to avoid biased results. random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given.