From 2a0b343dea80df8ea0d11bcf914b46ae7dc9702d Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 27 May 2026 12:25:24 -0700
Subject: [PATCH 1/8] collider: switch phase-1a contact sort to
 qd.simt.subgroup.bitonic_sort_kv_tiled

Replaces the inlined 15-stage bitonic compare-exchange schedule in
``func_clamp_prune_and_sort_contacts_coop`` (phase 1a) with a one-line
call to the new Quadrants subgroup primitive:

    my_key, my_idx = qd.simt.subgroup.bitonic_sort_kv_tiled(my_key, my_idx, 5)

The primitive (added in quadrants hp/bitonic-sort-kv) is a @qd.func that
inlines at compile time and unrolls the same 15 compare-exchange stages
this code used to write inline, so the generated kernel IR is bit-identical
to today on CUDA. Net change: ~20 lines of hand-rolled bitonic code removed,
the sentinel load + write-back wrapper is unchanged, and the rest of the
kernel (clamp + key init + bucket walk + phase 2 + phase 3) is untouched.

``log2_size = 5`` pins the sort to a 32-lane tile, matching the kernel's
hard-coded ``block_dim = _K = 32``. Using the tiled form rather than the
bare ``bitonic_sort_kv(...)`` wrapper keeps the sort width fixed at 32 even
on AMDGPU wave64, where the bare wrapper would otherwise sort across all
64 lanes and mix in garbage from the inactive upper half.

Requires the matching quadrants change to be installed (the public symbol
``qd.simt.subgroup.bitonic_sort_kv_tiled`` is added by that PR).
---
 .../engine/solvers/rigid/collider/contact.py  | 29 +++++--------------
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index 900d894e3..4a6d0fc3f 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -981,8 +981,12 @@ def func_clamp_prune_and_sort_contacts_coop(
                 ii += _K
 
             # Phase 1a sort: parallel bitonic sort across 32 lanes when n_con <= 32; fall back to serial-on-lane-0
-            # insertion sort otherwise. Bitonic is 15 compare-exchange stages (k=2..32, j=k/2..1), each a single
-            # subgroup shuffle + compare, replacing the O(n^2/2) lane-0 insertion sort.
+            # insertion sort otherwise. The 15-stage compare-exchange schedule (k=2..32, j=k/2..1) is provided by
+            # ``qd.simt.subgroup.bitonic_sort_kv_tiled``; we pass ``log2_size = 5`` to pin the sort to a 32-lane
+            # tile, which matches the kernel's ``block_dim = _K = 32`` exactly regardless of the active subgroup
+            # width on the target backend (the tiled form sorts 32 lanes even on AMDGPU wave64, where the bare
+            # ``bitonic_sort_kv(...)`` wrapper would otherwise reach across all 64 lanes and mix in garbage from
+            # the inactive upper half).
             if n_con <= _K:
                 # Load with sentinel for out-of-range lanes (pushes them to the end of ascending sort).
                 my_key = qd.cast(gs.qd_float(1.0e30), gs.qd_float)
@@ -991,26 +995,7 @@ def func_clamp_prune_and_sort_contacts_coop(
                     my_key = collider_state.contact_sort_key[tid, i_b]
                     my_idx = collider_state.contact_sort_idx[tid, i_b]
 
-                # 15 bitonic stages: (k, j) pairs walking the standard schedule. Stable compare (tiebreak on idx).
-                for k_log2 in qd.static(range(1, 6)):
-                    k_mask = qd.static(1 << k_log2)
-                    for j_log2 in qd.static(range(k_log2 - 1, -1, -1)):
-                        j = qd.static(1 << j_log2)
-                        partner = qd.u32(tid ^ j)
-                        their_key = qd.simt.subgroup.shuffle(my_key, partner)
-                        their_idx = qd.simt.subgroup.shuffle(my_idx, partner)
-                        i_am_low = (tid & j) == 0
-                        asc = (tid & k_mask) == 0
-                        take_min = i_am_low == asc
-                        their_lt_mine = (their_key < my_key) or (their_key == my_key and their_idx < my_idx)
-                        if take_min:
-                            if their_lt_mine:
-                                my_key = their_key
-                                my_idx = their_idx
-                        else:
-                            if not their_lt_mine and (their_key != my_key or their_idx != my_idx):
-                                my_key = their_key
-                                my_idx = their_idx
+                my_key, my_idx = qd.simt.subgroup.bitonic_sort_kv_tiled(my_key, my_idx, 5)
 
                 # Write back the sorted values for the real range.
                 if tid < n_con:

From ddf6809e834bf9a82568d5ef4b1749110f58930f Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 27 May 2026 14:33:18 -0700
Subject: [PATCH 2/8] collider: shorten the bitonic-sort comment

7 lines of prose -> 3.  Same intent: explain why we use the tiled form with
``log2_size = 5`` rather than the bare ``bitonic_sort_kv`` wrapper.
---
 genesis/engine/solvers/rigid/collider/contact.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index 4a6d0fc3f..efe6e652e 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -980,13 +980,9 @@ def func_clamp_prune_and_sort_contacts_coop(
                 )
                 ii += _K
 
-            # Phase 1a sort: parallel bitonic sort across 32 lanes when n_con <= 32; fall back to serial-on-lane-0
-            # insertion sort otherwise. The 15-stage compare-exchange schedule (k=2..32, j=k/2..1) is provided by
-            # ``qd.simt.subgroup.bitonic_sort_kv_tiled``; we pass ``log2_size = 5`` to pin the sort to a 32-lane
-            # tile, which matches the kernel's ``block_dim = _K = 32`` exactly regardless of the active subgroup
-            # width on the target backend (the tiled form sorts 32 lanes even on AMDGPU wave64, where the bare
-            # ``bitonic_sort_kv(...)`` wrapper would otherwise reach across all 64 lanes and mix in garbage from
-            # the inactive upper half).
+            # Phase 1a sort: bitonic sort across 32 lanes when n_con <= _K, serial-on-lane-0 insertion sort
+            # otherwise.  ``log2_size = 5`` pins the tiled sort to 32 lanes to match ``block_dim = _K`` on every
+            # backend (correct on AMDGPU wave64 where the bare ``bitonic_sort_kv`` would reach all 64 lanes).
             if n_con <= _K:
                 # Load with sentinel for out-of-range lanes (pushes them to the end of ascending sort).
                 my_key = qd.cast(gs.qd_float(1.0e30), gs.qd_float)

From 0758d92a7873ae4fd34852455761b51a203a8238 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 27 May 2026 14:41:34 -0700
Subject: [PATCH 3/8] collider: use _LOG2_K rather than magic ``5`` in the
 bitonic-sort call

Pair ``_K = qd.static(32)`` with ``_LOG2_K = qd.static(5)`` at the top
of the kernel and pass ``_LOG2_K`` into ``bitonic_sort_kv_tiled``.  The
relationship between the sort width and ``_K`` is now visible at the
binding site instead of being a magic 5 sitting next to ``_K = 32``.
---
 genesis/engine/solvers/rigid/collider/contact.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index efe6e652e..3049c4a82 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -946,6 +946,7 @@ def func_clamp_prune_and_sort_contacts_coop(
     EPS = rigid_global_info.EPS[None]
 
     _K = qd.static(32)
+    _LOG2_K = qd.static(5)  # log2(_K); kept as a paired constant so the bitonic-sort call below is self-documenting.
     qd.loop_config(name="clamp_prune_and_sort_contacts_coop", block_dim=_K)
     for i_flat in range(_B * _K):
         tid = i_flat % _K
@@ -981,8 +982,8 @@ def func_clamp_prune_and_sort_contacts_coop(
                 ii += _K
 
             # Phase 1a sort: bitonic sort across 32 lanes when n_con <= _K, serial-on-lane-0 insertion sort
-            # otherwise.  ``log2_size = 5`` pins the tiled sort to 32 lanes to match ``block_dim = _K`` on every
-            # backend (correct on AMDGPU wave64 where the bare ``bitonic_sort_kv`` would reach all 64 lanes).
+            # otherwise.  ``log2_size = _LOG2_K`` pins the tiled sort to _K = 32 lanes on every backend (correct
+            # on AMDGPU wave64 where the bare ``bitonic_sort_kv`` would reach all 64 lanes).
             if n_con <= _K:
                 # Load with sentinel for out-of-range lanes (pushes them to the end of ascending sort).
                 my_key = qd.cast(gs.qd_float(1.0e30), gs.qd_float)
@@ -991,7 +992,7 @@ def func_clamp_prune_and_sort_contacts_coop(
                     my_key = collider_state.contact_sort_key[tid, i_b]
                     my_idx = collider_state.contact_sort_idx[tid, i_b]
 
-                my_key, my_idx = qd.simt.subgroup.bitonic_sort_kv_tiled(my_key, my_idx, 5)
+                my_key, my_idx = qd.simt.subgroup.bitonic_sort_kv_tiled(my_key, my_idx, _LOG2_K)
 
                 # Write back the sorted values for the real range.
                 if tid < n_con:

From eeba2c63bee523e8d83777f1d1fe18023d8ea8ce Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 27 May 2026 14:42:57 -0700
Subject: [PATCH 4/8] collider: drop redundant ``log2_size = _LOG2_K``
 explainer comment

With ``_K`` and ``_LOG2_K`` defined together at the top of the kernel
and ``_LOG2_K`` flowing straight into ``bitonic_sort_kv_tiled``, the
explainer ("pins the tiled sort to _K lanes on every backend ...")
just restates what the names already convey.
---
 genesis/engine/solvers/rigid/collider/contact.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index 3049c4a82..a8925a29f 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -981,9 +981,8 @@ def func_clamp_prune_and_sort_contacts_coop(
                 )
                 ii += _K
 
-            # Phase 1a sort: bitonic sort across 32 lanes when n_con <= _K, serial-on-lane-0 insertion sort
-            # otherwise.  ``log2_size = _LOG2_K`` pins the tiled sort to _K = 32 lanes on every backend (correct
-            # on AMDGPU wave64 where the bare ``bitonic_sort_kv`` would reach all 64 lanes).
+            # Phase 1a sort: bitonic sort across _K lanes when n_con <= _K, serial-on-lane-0 insertion sort
+            # otherwise.
             if n_con <= _K:
                 # Load with sentinel for out-of-range lanes (pushes them to the end of ascending sort).
                 my_key = qd.cast(gs.qd_float(1.0e30), gs.qd_float)

From 47bbe1969c32a4db60e41233404e2014558d6321 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 27 May 2026 14:43:46 -0700
Subject: [PATCH 5/8] collider: derive _LOG2_K from _K instead of hardcoding
 ``5``

``qd.static(32)`` is just the int ``32`` at compile time, so
``_K.bit_length() - 1`` evaluates to ``5`` and keeps _K and _LOG2_K
in sync if _K is ever retuned.
---
 genesis/engine/solvers/rigid/collider/contact.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index a8925a29f..f50036ad0 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -946,7 +946,7 @@ def func_clamp_prune_and_sort_contacts_coop(
     EPS = rigid_global_info.EPS[None]
 
     _K = qd.static(32)
-    _LOG2_K = qd.static(5)  # log2(_K); kept as a paired constant so the bitonic-sort call below is self-documenting.
+    _LOG2_K = qd.static(_K.bit_length() - 1)  # = log2(_K), assuming _K is a power of two.
     qd.loop_config(name="clamp_prune_and_sort_contacts_coop", block_dim=_K)
     for i_flat in range(_B * _K):
         tid = i_flat % _K

From 0b67d15e1e1d99af4ff0e8fe029281f1561456e1 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Wed, 27 May 2026 14:45:00 -0700
Subject: [PATCH 6/8] collider: drop redundant ``qd.static(...)`` wrappers on
 _K and _LOG2_K

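``qd.static()`` is a no-op on Python int literals -- it evaluates its
argument at compile time, and a plain ``32`` is already a Python
compile-time int.  Several other Genesis solver files wrap kernel-scope
``BLOCK_DIM`` / ``WARP_SIZE`` / ``_K`` constants this way as a defensive
marker, but it doesn't change codegen and the bare ints read more
directly.

NOTE: this reasoning turned out not to hold for ``_K`` inside this kernel.
PATCH 8/8 restores both wrappers because, without ``qd.static``, ``_K``
becomes a kernel-local Expr and ``_K.bit_length()`` fails with
AttributeError during kernel compilation.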
---
 genesis/engine/solvers/rigid/collider/contact.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index f50036ad0..59106e603 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -945,8 +945,8 @@ def func_clamp_prune_and_sort_contacts_coop(
     LP_KEY_STRIDE = gs.qd_float(1.0e7)
     EPS = rigid_global_info.EPS[None]
 
-    _K = qd.static(32)
-    _LOG2_K = qd.static(_K.bit_length() - 1)  # = log2(_K), assuming _K is a power of two.
+    _K = 32
+    _LOG2_K = _K.bit_length() - 1  # = log2(_K), assuming _K is a power of two.
     qd.loop_config(name="clamp_prune_and_sort_contacts_coop", block_dim=_K)
     for i_flat in range(_B * _K):
         tid = i_flat % _K

From 0af18cc7a800c14ec2b6118430747250ea563362 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Thu, 4 Jun 2026 08:14:03 -0400
Subject: [PATCH 7/8] pyproject: bump quadrants pin from 0.8.0 to 1.0.2

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d92ddca71..03cb36300 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ readme = "README.md"
 requires-python = ">=3.10,<3.14"
 dependencies = [
     "psutil",
-    "quadrants==0.8.0",
+    "quadrants==1.0.2",
     "pydantic>=2.11.0",
     "numpy>=1.26.4",
     "frozendict",

From 39caa417efd17d78ee1d5ccb48e3a647216373be Mon Sep 17 00:00:00 2001
From: hugh <hugh@slurm-login-0.slurm-login.tenant-slurm.svc.cluster.local>
Date: Thu, 4 Jun 2026 14:30:48 +0000
Subject: [PATCH 8/8] collider: restore qd.static on _K so _K.bit_length()
 folds at compile time

Reverts the qd.static() removal that broke kernel compilation. Without
qd.static, _K = 32 becomes a kernel-local Expr rather than a Python int, so
_K.bit_length() is routed to quadrants.lang.matrix_ops by the AST transformer
and fails with AttributeError. Wrapping in qd.static keeps _K a compile-time
Python int, letting int.bit_length() evaluate to a folded literal.
---
 genesis/engine/solvers/rigid/collider/contact.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index 6ae654c37..6bd80df50 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -945,8 +945,8 @@ def func_clamp_prune_and_sort_contacts_coop(
     LP_KEY_STRIDE = gs.qd_float(1.0e7)
     EPS = rigid_global_info.EPS[None]
 
-    _K = 32
-    _LOG2_K = _K.bit_length() - 1  # = log2(_K), assuming _K is a power of two.
+    _K = qd.static(32)
+    _LOG2_K = qd.static(_K.bit_length() - 1)  # = log2(_K), assuming _K is a power of two.
     qd.loop_config(name="clamp_prune_and_sort_contacts_coop", block_dim=_K)
     for i_flat in range(_B * _K):
         tid = i_flat % _K