From 7b227c36574e82ad26c5ade398cf4b9c7371eeed Mon Sep 17 00:00:00 2001
From: Trinity Chung <trinityjchung@gmail.com>
Date: Sun, 24 May 2026 16:26:21 -0400
Subject: [PATCH 1/7] rebase update

---
 examples/sensors/tactile_sandbox.py           | 100 ++-
 genesis/engine/sensors/kinematic_tactile.py   | 533 +++++++++++----
 genesis/engine/sensors/point_cloud_tactile.py | 499 +++++++-------
 genesis/engine/sensors/probe.py               | 369 +++++++++-
 .../engine/sensors/surface_distance_probe.py  |  49 +-
 genesis/engine/sensors/tactile_shared.py      | 296 ++++++++
 genesis/options/sensors/options.py            |  60 +-
 genesis/options/sensors/tactile.py            | 188 ++++-
 genesis/typing.py                             |   8 +
 tests/test_sensors.py                         | 647 +++++++++++++++++-
 10 files changed, 2224 insertions(+), 525 deletions(-)
 create mode 100644 genesis/engine/sensors/tactile_shared.py

diff --git a/examples/sensors/tactile_sandbox.py b/examples/sensors/tactile_sandbox.py
index 3cc52bbb02..20f2aa8c7e 100644
--- a/examples/sensors/tactile_sandbox.py
+++ b/examples/sensors/tactile_sandbox.py
@@ -1,6 +1,6 @@
 """
 Interactive demo of tactile sensors on a fixed taxel pad (box or dome) with controllable objects.
-Sensor types: ContactDepthProbe, ElastomerTaxel, KinematicTaxel, ProximityTaxel.
+Sensor types: ContactDepthProbe, ContactProbe, ElastomerTaxel, KinematicTaxel, ProximityTaxel.
 
 Note that the sensor readings here have not been calibrated to any units, and is purely for visualization purposes.
 """
@@ -24,9 +24,9 @@
     from genesis.engine.entities.rigid_entity import RigidEntity
     from genesis.engine.sensors.base_sensor import Sensor
 
-KEY_DPOS = 0.005
+KEY_DPOS = 0.001
 FORCE_SCALE = 100.0
-ROT_FORCE_SCALE = 200.0
+ROT_FORCE_SCALE = 100.0
 
 GRID_SIZE = 20  # 20x20 taxels for square
 PROBE_RADIUS = 0.004
@@ -48,6 +48,7 @@ def _add_tactile_sensor(
     probe_local_pos: np.ndarray,
     probe_normal: tuple[float, float, float] | np.ndarray,
     track_link_idx: tuple[int, ...],
+    noise: bool,
 ) -> "Sensor":
     common = dict(
         entity_idx=entity.idx,
@@ -55,6 +56,15 @@ def _add_tactile_sensor(
         draw_debug=True,
         probe_radius=PROBE_RADIUS,
     )
+    if noise:
+        # Sensor imperfections shared by every tactile sensor type: viscoelastic hysteresis on the measured
+        # branch, a noised sensing radius, and a per-taxel measured-branch depth gain.
+        common.update(
+            hysteresis_strength=0.5,  # viscoelastic overshoot fraction
+            hysteresis_tau=0.1,  # viscoelastic relaxation time constant (seconds)
+            probe_radius_noise=0.001,  # additive sensing-radius noise (meters)
+            probe_gain=1.5,  # per-taxel measured-branch depth gain
+        )
     if sensor_type == "elastomer":
         return scene.add_sensor(
             gs.sensors.ElastomerTaxel(
@@ -63,7 +73,8 @@ def _add_tactile_sensor(
                 track_link_idx=track_link_idx,
                 n_sample_points=2000,
                 dilate_scale=1.0,
-                shear_scale=100.0,
+                shear_scale=2.0,
+                normal_exponent=1.0,
                 **common,
             )
         )
@@ -76,6 +87,17 @@ def _add_tactile_sensor(
                 **common,
             )
         )
+    if sensor_type == "contact":
+        # Schmitt-trigger thresholds (contact depth in meters): a taxel latches on above contact_threshold and
+        # only releases once the depth drops back below the lower release_threshold.
+        return scene.add_sensor(
+            gs.sensors.ContactProbe(
+                probe_local_pos=probe_local_pos,
+                contact_threshold=0.004,
+                release_threshold=0.002,
+                **common,
+            )
+        )
     if sensor_type == "kinematic":
         return scene.add_sensor(
             gs.sensors.KinematicTaxel(
@@ -83,25 +105,26 @@ def _add_tactile_sensor(
                 probe_local_normal=probe_normal,
                 normal_stiffness=500.0,
                 normal_damping=1.0,
-                shear_scalar=5.0,
-                twist_scalar=5.0,
+                shear_scalar=4.0,
+                twist_scalar=4.0,
                 normal_exponent=1.5,
                 **common,
             )
         )
 
-    common["probe_radius"] = PROBE_RADIUS * 2
-    common["debug_point_cloud_radius"] = 0.001
+    common["probe_radius"] = PROBE_RADIUS * 5
     if sensor_type == "proximity":
         return scene.add_sensor(
             gs.sensors.ProximityTaxel(
                 probe_local_pos=probe_local_pos,
                 track_link_idx=track_link_idx,
                 n_sample_points=4000,
-                stiffness=200.0,
-                shear_coupling=100.0,
+                stiffness=40.0,
+                shear_coupling=10.0,
                 probe_local_normal=probe_normal,
-                probe_radius_noise=0.0001,
+                debug_point_cloud_radius=0.0005,
+                debug_probe_color=(0.2, 0.6, 1.0),
+                debug_contact_color=(1.0, 0.2, 0.2),
                 **common,
             )
         )
@@ -111,7 +134,6 @@ def _add_tactile_sensor(
 def _plot_tactile_sensor(
     scene: gs.Scene,
     sensor_type: str,
-    labels: tuple[str, ...],
     sensors: "tuple[Sensor, ...]",
     n_envs: int = 1,
     plot_normal: tuple[float, float, float] = (0.0, 0.0, -1.0),
@@ -122,24 +144,24 @@ def _plot_tactile_sensor(
 
     if sensor_type == "elastomer":
         for env_idx in range(n_envs):
-            for label, sensor in zip(labels, sensors):
+            for sensor in sensors:
                 scene.start_recording(
                     lambda s=sensor, i=env_idx: s.read()[i],
                     gs.recorders.MPLVectorFieldPlot(
-                        title=f"({label} {OBJ_PER_ENV_LABELS[env_idx]}) ElastomerTaxel marker displacements",
+                        title=f"({OBJ_PER_ENV_LABELS[env_idx]}) ElastomerTaxel marker displacements",
                         positions=sensor.probe_local_pos.reshape(-1, 3),
                         normal=plot_normal,
-                        scale_factor=1.0,
-                        max_magnitude=0.01,
+                        scale_factor=0.1,
+                        max_magnitude=0.1,
                     ),
                 )
     elif sensor_type == "kinematic":
         for env_idx in range(n_envs):
-            for label, sensor in zip(labels, sensors):
+            for sensor in sensors:
                 scene.start_recording(
                     lambda s=sensor, i=env_idx: s.read().force[i],
                     gs.recorders.MPLVectorFieldPlot(
-                        title=f"({label} {OBJ_PER_ENV_LABELS[env_idx]}) KinematicTaxel force",
+                        title=f"({OBJ_PER_ENV_LABELS[env_idx]}) KinematicTaxel force",
                         positions=sensor.probe_local_pos.reshape(-1, 3),
                         normal=plot_normal,
                         scale_factor=0.01,
@@ -148,14 +170,14 @@ def _plot_tactile_sensor(
                 )
     elif sensor_type == "proximity":
         for env_idx in range(n_envs):
-            for label, sensor in zip(labels, sensors):
+            for sensor in sensors:
                 scene.start_recording(
                     lambda s=sensor, i=env_idx: s.read().force[i],
                     gs.recorders.MPLVectorFieldPlot(
-                        title=f"({label} {OBJ_PER_ENV_LABELS[env_idx]}) ProximityTaxel force",
+                        title=f"({OBJ_PER_ENV_LABELS[env_idx]}) ProximityTaxel force",
                         positions=sensor.probe_local_pos.reshape(-1, 3),
                         normal=plot_normal,
-                        scale_factor=0.5,
+                        scale_factor=0.1,
                         max_magnitude=1.0,
                     ),
                 )
@@ -165,12 +187,22 @@ def _plot_tactile_sensor(
                 lambda i=env_idx: tuple(sensor.read()[i].max() for sensor in sensors),
                 gs.recorders.MPLLinePlot(
                     title=f"ContactDepthProbe max depth ({OBJ_PER_ENV_LABELS[env_idx]})",
-                    labels=labels,
                     x_label="step",
                     y_label="depth",
                     history_length=200,
                 ),
             )
+    elif sensor_type == "contact":
+        for env_idx in range(n_envs):
+            scene.start_recording(
+                lambda i=env_idx: tuple(sensor.read()[i].sum() for sensor in sensors),
+                gs.recorders.MPLLinePlot(
+                    title=f"ContactProbe taxels in contact ({OBJ_PER_ENV_LABELS[env_idx]})",
+                    x_label="step",
+                    y_label="# taxels",
+                    history_length=200,
+                ),
+            )
 
 
 def _print_sensor_reading(sensor_type: str, sensor: "Sensor", t: float) -> None:
@@ -183,6 +215,10 @@ def _print_sensor_reading(sensor_type: str, sensor: "Sensor", t: float) -> None:
         max_depth = data.max()
         if max_depth > gs.EPS:
             print(f"t={t:.2f}s  max depth={max_depth:.4f}")
+    elif sensor_type == "contact":
+        n_contact = int(data.sum())
+        if n_contact > 0:
+            print(f"t={t:.2f}s  taxels in contact={n_contact}")
     elif sensor_type == "kinematic":
         magnitude = torch.linalg.norm(data.force, axis=-1).max()
         if magnitude > gs.EPS:
@@ -204,10 +240,15 @@ def main() -> None:
     parser.add_argument("--dome", action="store_true", help="Change the sensor object to a dome instead of a box")
     parser.add_argument(
         "--sensor",
-        choices=("elastomer", "depth", "kinematic", "proximity"),
+        choices=("elastomer", "depth", "contact", "kinematic", "proximity"),
         default="elastomer",
         help="Type of tactile sensor to use.",
     )
+    parser.add_argument(
+        "--noise",
+        action="store_true",
+        help="Enable sensor imperfections (viscoelastic hysteresis, probe_radius_noise, probe_gain).",
+    )
     args = parser.parse_args()
 
     gs.init(
@@ -276,17 +317,17 @@ def main() -> None:
             ny=GRID_SIZE,
         )
 
-    # Procedural torus written to a temp .obj (avoids checking a 2k-line mesh into the repo).
     torus_path = os.path.join(tempfile.gettempdir(), "tactile_sandbox_torus.obj")
     if not os.path.exists(torus_path):
-        trimesh.creation.torus(major_radius=0.3, minor_radius=0.1).export(torus_path)
+        trimesh.creation.torus(major_radius=1.0, minor_radius=0.5).export(torus_path)
 
     obj = scene.add_entity(
         morph=[
             gs.morphs.Mesh(
                 file=torus_path,
-                euler=(90.0, 0.0, 0.0),
-                scale=OBJECT_SIZE,
+                euler=(0.0, 0.0, 0.0),
+                scale=OBJECT_SIZE * 2,
+                convexify=False,
             ),
             gs.morphs.Sphere(
                 radius=OBJECT_SIZE / 2,
@@ -314,9 +355,10 @@ def main() -> None:
         probe_local_pos,
         probe_normal,
         track_link_idx=(obj.base_link_idx,),
+        noise=args.noise,
     )
     if args.vis and "PYTEST_VERSION" not in os.environ:
-        _plot_tactile_sensor(scene, args.sensor, ("",), (sensor,), n_envs=4, plot_normal=probe_normal_axis)
+        _plot_tactile_sensor(scene, args.sensor, (sensor,), n_envs=4, plot_normal=probe_normal_axis)
     scene.build(n_envs=4, env_spacing=(SENSOR_OBJ_SIZE * 1.2, SENSOR_OBJ_SIZE * 1.2))
 
     obj_init_pos = tensor_to_array(obj.get_pos())
@@ -382,7 +424,7 @@ def rotate(axis_idx: int, is_negative: bool):
     print("\n=== Tactile Sensor Sandbox ===")
     n_taxels = probe_local_pos.reshape(-1, 3).shape[0]
     layout = f"dome ({GRID_SIZE} latitude rings)" if args.dome else f"plane grid {probe_local_pos.shape[:-1]}"
-    print(f"sensor={args.sensor}; taxels={n_taxels}; {layout}")
+    print(f"sensor={args.sensor}; taxels={n_taxels}; {layout}; noise={'on' if args.noise else 'off'}")
     if args.vis and IS_MATPLOTLIB_AVAILABLE:
         print("Matplotlib live plot enabled when supported.")
     if args.vis:
diff --git a/genesis/engine/sensors/kinematic_tactile.py b/genesis/engine/sensors/kinematic_tactile.py
index f4e74e9d2b..cc04228871 100644
--- a/genesis/engine/sensors/kinematic_tactile.py
+++ b/genesis/engine/sensors/kinematic_tactile.py
@@ -1,5 +1,6 @@
+import math
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Callable, NamedTuple
+from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar
 
 import numpy as np
 import quadrants as qd
@@ -23,6 +24,15 @@
     ProbesWithNormalSensorMixin,
     ProbesWithNormalSensorSharedMetadataT,
     func_noised_probe_radius,
+    get_measured_bufs,
+)
+from .tactile_shared import (
+    GridFFTConvMetadataMixin,
+    ViscoelasticHysteresisMetadataMixin,
+    ViscoelasticHysteresisMixin,
+    next_pow2,
+    normalize_grid_probe_layout,
+    register_grid_fft_sensor,
 )
 
 if TYPE_CHECKING:
@@ -149,12 +159,66 @@ def _func_query_contact_depth(
     return max_pen_gt, contact_link_gt, contact_normal_gt, max_pen_m, contact_link_m, contact_normal_m
 
 
+@qd.func
+def _func_kinematic_spring_damper(
+    i_b: int,
+    max_penetration: float,
+    contact_link: int,
+    contact_normal: qd.types.vector(3),
+    sensor_link_idx: int,
+    probe_pos: qd.types.vector(3),
+    probe_pos_local: qd.types.vector(3),
+    link_quat: qd.types.vector(4),
+    normal_stiffness: float,
+    normal_damping: float,
+    normal_exponent: float,
+    shear_scalar: float,
+    twist_scalar: float,
+    links_state: array_class.LinksState,
+):
+    """
+    Kinematic spring-damper force / torque in the sensor link frame from a single probe's contact query.
+
+    Shared by the GT and measured branches of ``_kernel_kinematic_taxel`` (they differ only in which dual-radius
+    query result is fed in). Returns ``(force_local, torque_local)``; both zero when ``max_penetration <= 0``.
+    """
+    force_local = qd.Vector.zero(gs.qd_float, 3)
+    torque_local = qd.Vector.zero(gs.qd_float, 3)
+    if max_penetration > 0:
+        contact_normal_local = gu.qd_inv_transform_by_quat(contact_normal, link_quat)
+        s = qd.pow(max_penetration, normal_exponent)
+        force_local = contact_normal_local * (normal_stiffness * s)
+
+        if contact_link >= 0:
+            contact_vel = links_state.cd_vel[contact_link, i_b] + links_state.cd_ang[contact_link, i_b].cross(
+                probe_pos - links_state.root_COM[contact_link, i_b]
+            )
+            sensor_vel = links_state.cd_vel[sensor_link_idx, i_b] + links_state.cd_ang[sensor_link_idx, i_b].cross(
+                probe_pos - links_state.root_COM[sensor_link_idx, i_b]
+            )
+            rel_vel_world = contact_vel - sensor_vel
+            rel_vel_local = gu.qd_inv_transform_by_quat(rel_vel_world, link_quat)
+
+            vn_dot = rel_vel_local.dot(contact_normal_local)
+            v_t_local = rel_vel_local - contact_normal_local * vn_dot
+            force_local += contact_normal_local * (normal_damping * s * vn_dot) - shear_scalar * v_t_local
+
+            rel_ang_world = links_state.cd_ang[contact_link, i_b] - links_state.cd_ang[sensor_link_idx, i_b]
+            omega_n = rel_ang_world.dot(contact_normal)
+            torque_local = probe_pos_local.cross(force_local) - contact_normal_local * (twist_scalar * omega_n)
+        else:
+            torque_local = probe_pos_local.cross(force_local)
+
+    return force_local, torque_local
+
+
 @qd.kernel
 def _kernel_kinematic_taxel(
     probe_positions_local: qd.types.ndarray(),
     probe_sensor_idx: qd.types.ndarray(),
     probe_radii: qd.types.ndarray(),
     probe_radii_noise: qd.types.ndarray(),
+    probe_gains: qd.types.ndarray(),
     normal_stiffness: qd.types.ndarray(),
     normal_damping: qd.types.ndarray(),
     normal_exponent: qd.types.ndarray(),
@@ -172,6 +236,7 @@ def _kernel_kinematic_taxel(
     rigid_global_info: array_class.RigidGlobalInfo,
     sdf_info: array_class.SDFInfo,
     eps: float,
+    measured_equals_gt: int,
     output_gt: qd.types.ndarray(),
     output_measured: qd.types.ndarray(),
 ):
@@ -180,6 +245,20 @@ def _kernel_kinematic_taxel(
 
     for i_p, i_b in qd.ndrange(total_n_probes, n_batches):
         i_s = probe_sensor_idx[i_p]
+        probe_idx_in_sensor = i_p - sensor_probe_start[i_s]
+        cache_start = sensor_cache_start[i_s]
+        n_probes = n_probes_per_sensor[i_s]
+        force_start = cache_start + probe_idx_in_sensor * 3
+        torque_start = cache_start + n_probes * 3 + probe_idx_in_sensor * 3
+
+        # Inactive filler probe (probe_radius == 0): reads zero force/torque, no contact query.
+        if probe_radii[i_p] <= gs.qd_float(0.0):
+            for j in qd.static(range(3)):
+                output_gt[force_start + j, i_b] = gs.qd_float(0.0)
+                output_gt[torque_start + j, i_b] = gs.qd_float(0.0)
+                output_measured[force_start + j, i_b] = gs.qd_float(0.0)
+                output_measured[torque_start + j, i_b] = gs.qd_float(0.0)
+            continue
 
         probe_pos_local = qd.Vector(
             [probe_positions_local[i_p, 0], probe_positions_local[i_p, 1], probe_positions_local[i_p, 2]]
@@ -220,77 +299,47 @@ def _kernel_kinematic_taxel(
             eps,
         )
 
-        force_local_gt = qd.Vector.zero(gs.qd_float, 3)
-        torque_local_gt = qd.Vector.zero(gs.qd_float, 3)
-        if max_penetration_gt > 0:
-            contact_normal_local = gu.qd_inv_transform_by_quat(contact_normal_gt, link_quat)
-            s = qd.pow(max_penetration_gt, normal_exponent[i_s])
-            force_local_gt = contact_normal_local * (normal_stiffness[i_s] * s)
-
-            if contact_link_gt >= 0:
-                contact_vel = links_state.cd_vel[contact_link_gt, i_b] + links_state.cd_ang[contact_link_gt, i_b].cross(
-                    probe_pos - links_state.root_COM[contact_link_gt, i_b]
-                )
-                sensor_vel = links_state.cd_vel[sensor_link_idx, i_b] + links_state.cd_ang[sensor_link_idx, i_b].cross(
-                    probe_pos - links_state.root_COM[sensor_link_idx, i_b]
-                )
-                rel_vel_world = contact_vel - sensor_vel
-                rel_vel_local = gu.qd_inv_transform_by_quat(rel_vel_world, link_quat)
-
-                vn_dot = rel_vel_local.dot(contact_normal_local)
-                v_t_local = rel_vel_local - contact_normal_local * vn_dot
-                force_local_gt += (
-                    contact_normal_local * (normal_damping[i_s] * s * vn_dot) - shear_scalar[i_s] * v_t_local
-                )
-
-                rel_ang_world = links_state.cd_ang[contact_link_gt, i_b] - links_state.cd_ang[sensor_link_idx, i_b]
-                omega_n = rel_ang_world.dot(contact_normal_gt)
-                torque_local_gt = probe_pos_local.cross(force_local_gt) - contact_normal_local * (
-                    twist_scalar[i_s] * omega_n
-                )
-            else:
-                torque_local_gt = probe_pos_local.cross(force_local_gt)
-
-        force_local_m = qd.Vector.zero(gs.qd_float, 3)
-        torque_local_m = qd.Vector.zero(gs.qd_float, 3)
-        if not use_noised_radius:
-            for j in qd.static(range(3)):
-                force_local_m[j] = force_local_gt[j]
-                torque_local_m[j] = torque_local_gt[j]
-        elif max_penetration_m > 0:
-            contact_normal_local = gu.qd_inv_transform_by_quat(contact_normal_m, link_quat)
-            s = qd.pow(max_penetration_m, normal_exponent[i_s])
-            force_local_m = contact_normal_local * (normal_stiffness[i_s] * s)
-
-            if contact_link_m >= 0:
-                contact_vel = links_state.cd_vel[contact_link_m, i_b] + links_state.cd_ang[contact_link_m, i_b].cross(
-                    probe_pos - links_state.root_COM[contact_link_m, i_b]
-                )
-                sensor_vel = links_state.cd_vel[sensor_link_idx, i_b] + links_state.cd_ang[sensor_link_idx, i_b].cross(
-                    probe_pos - links_state.root_COM[sensor_link_idx, i_b]
-                )
-                rel_vel_world = contact_vel - sensor_vel
-                rel_vel_local = gu.qd_inv_transform_by_quat(rel_vel_world, link_quat)
-
-                vn_dot = rel_vel_local.dot(contact_normal_local)
-                v_t_local = rel_vel_local - contact_normal_local * vn_dot
-                force_local_m += (
-                    contact_normal_local * (normal_damping[i_s] * s * vn_dot) - shear_scalar[i_s] * v_t_local
-                )
-
-                rel_ang_world = links_state.cd_ang[contact_link_m, i_b] - links_state.cd_ang[sensor_link_idx, i_b]
-                omega_n = rel_ang_world.dot(contact_normal_m)
-                torque_local_m = probe_pos_local.cross(force_local_m) - contact_normal_local * (
-                    twist_scalar[i_s] * omega_n
-                )
-            else:
-                torque_local_m = probe_pos_local.cross(force_local_m)
+        force_local_gt, torque_local_gt = _func_kinematic_spring_damper(
+            i_b,
+            max_penetration_gt,
+            contact_link_gt,
+            contact_normal_gt,
+            sensor_link_idx,
+            probe_pos,
+            probe_pos_local,
+            link_quat,
+            normal_stiffness[i_s],
+            normal_damping[i_s],
+            normal_exponent[i_s],
+            shear_scalar[i_s],
+            twist_scalar[i_s],
+            links_state,
+        )
+
+        force_local_m = force_local_gt
+        torque_local_m = torque_local_gt
+        if measured_equals_gt == 0:
+            # The measured branch differs from GT: either some probe has a noised sensing radius or a non-unit
+            # per-(env, probe) gain. Gain scales the measured penetration only; force / torque then scale as
+            # ``gain ** normal_exponent`` since they derive from ``s = max_penetration_m ** normal_exponent``.
+            max_penetration_m = max_penetration_m * probe_gains[i_b, i_p]
+            force_local_m, torque_local_m = _func_kinematic_spring_damper(
+                i_b,
+                max_penetration_m,
+                contact_link_m,
+                contact_normal_m,
+                sensor_link_idx,
+                probe_pos,
+                probe_pos_local,
+                link_quat,
+                normal_stiffness[i_s],
+                normal_damping[i_s],
+                normal_exponent[i_s],
+                shear_scalar[i_s],
+                twist_scalar[i_s],
+                links_state,
+            )
 
-        probe_idx_in_sensor = i_p - sensor_probe_start[i_s]
-        cache_start = sensor_cache_start[i_s]
-        n_probes = n_probes_per_sensor[i_s]
-        force_start = cache_start + probe_idx_in_sensor * 3
-        torque_start = cache_start + n_probes * 3 + probe_idx_in_sensor * 3
         for j in qd.static(range(3)):
             output_gt[force_start + j, i_b] = force_local_gt[j]
             output_gt[torque_start + j, i_b] = torque_local_gt[j]
@@ -304,6 +353,7 @@ def _kernel_contact_depth_probe(
     probe_sensor_idx: qd.types.ndarray(),
     probe_radii: qd.types.ndarray(),
     probe_radii_noise: qd.types.ndarray(),
+    probe_gains: qd.types.ndarray(),
     links_idx: qd.types.ndarray(),
     sensor_cache_start: qd.types.ndarray(),
     sensor_probe_start: qd.types.ndarray(),
@@ -321,6 +371,13 @@ def _kernel_contact_depth_probe(
     for i_p, i_b in qd.ndrange(total_n_probes, n_batches):
         i_s = probe_sensor_idx[i_p]
 
+        # Inactive filler probe (probe_radius == 0): reads zero depth (which contact-probe interprets as no contact).
+        if probe_radii[i_p] <= gs.qd_float(0.0):
+            cache_idx = sensor_cache_start[i_s] + i_p - sensor_probe_start[i_s]
+            output_gt[cache_idx, i_b] = gs.qd_float(0.0)
+            output_measured[cache_idx, i_b] = gs.qd_float(0.0)
+            continue
+
         probe_pos_local = qd.Vector(
             [probe_positions_local[i_p, 0], probe_positions_local[i_p, 1], probe_positions_local[i_p, 2]]
         )
@@ -348,47 +405,31 @@ def _kernel_contact_depth_probe(
             collider_state,
             sdf_info,
         )
+        # Per-(env, probe) gain on the measured-branch depth only.
+        max_penetration_m = max_penetration_m * probe_gains[i_b, i_p]
         cache_idx = sensor_cache_start[i_s] + i_p - sensor_probe_start[i_s]
         output_gt[cache_idx, i_b] = max_penetration_gt
         output_measured[cache_idx, i_b] = max_penetration_m
 
 
 class KinematicTactileSensorMixin(ProbeSensorMixin[ProbesWithNormalSensorSharedMetadataT]):
-    def __init__(self, sensor_options: "SensorOptions", sensor_idx: int, sensor_manager: "SensorManager"):
-        super().__init__(sensor_options, sensor_idx, sensor_manager)
-        self._debug_objects: list = []
-
     def build(self):
         super().build()
         self._shared_metadata.solver.collider.activate_sdf()
 
-    def _draw_debug_probes(self, context: "RasterizerContext", get_is_contact: Callable[[object], object]):
-        for obj in self._debug_objects:
-            context.clear_debug_object(obj)
-        self._debug_objects.clear()
-
-        envs_idx, n_debug_envs, _, probe_world = self._compute_probes_world_pos(context)
-        data = self.read_ground_truth(envs_idx)
-        is_contact = np.asarray(tensor_to_array(get_is_contact(data)), dtype=bool).reshape(-1)
-        probe_global_idx = int(self._shared_metadata.sensor_probe_start[self._idx])
-        probe_radius = float(self._shared_metadata.probe_radii[probe_global_idx])
-        for is_contact_state in (False, True):
-            (probes_idx,) = np.nonzero(is_contact == is_contact_state)
-            if probes_idx.size > 0:
-                spheres_obj = context.draw_debug_spheres(
-                    poss=probe_world[probes_idx],
-                    radius=probe_radius,
-                    color=self._options.debug_contact_color if is_contact_state else self._options.debug_probe_color,
-                )
-                self._debug_objects.append(spheres_obj)
-
 
 @dataclass
-class ContactDepthProbeMetadata(ProbeSensorMetadataMixin, RigidSensorMetadataMixin, SimpleSensorMetadata):
+class ContactDepthProbeMetadata(
+    ViscoelasticHysteresisMetadataMixin,
+    ProbeSensorMetadataMixin,
+    RigidSensorMetadataMixin,
+    SimpleSensorMetadata,
+):
     pass
 
 
 class ContactDepthProbeSensor(
+    ViscoelasticHysteresisMixin[ContactDepthProbeMetadata],
     KinematicTactileSensorMixin[ContactDepthProbeMetadata],
     RigidSensorMixin[ContactDepthProbeMetadata],
     SimpleSensor[ContactDepthProbeOptions, ContactDepthProbeMetadata, tuple],
@@ -396,7 +437,7 @@ class ContactDepthProbeSensor(
     """Returns contact depth in meters per probe."""
 
     def _get_return_format(self) -> tuple[int, ...]:
-        return (self._n_probes,)
+        return self._probe_layout_shape
 
     @classmethod
     def _get_cache_dtype(cls) -> torch.dtype:
@@ -411,19 +452,15 @@ def _update_current_timestep_data(
         measured_data_timeline: "TensorRingBuffer",
     ):
         solver = shared_metadata.solver
-
-        current_ground_truth_data_T.zero_()
-        measured = measured_data_timeline.at(0, copy=False)
-        measured.zero_()
-        if shared_metadata.measured_scratch_T.shape != current_ground_truth_data_T.shape:
-            shared_metadata.measured_scratch_T = torch.empty_like(current_ground_truth_data_T)
-        measured_cols_b = shared_metadata.measured_scratch_T
-
+        measured, measured_cols_b = get_measured_bufs(
+            shared_metadata, current_ground_truth_data_T, measured_data_timeline
+        )
         _kernel_contact_depth_probe(
             shared_metadata.probe_positions,
             shared_metadata.probe_sensor_idx,
             shared_metadata.probe_radii,
             shared_metadata.probe_radii_noise,
+            shared_metadata.probe_gains,
             shared_metadata.links_idx,
             shared_metadata.sensor_cache_start,
             shared_metadata.sensor_probe_start,
@@ -440,24 +477,48 @@ def _update_current_timestep_data(
         measured.copy_(measured_cols_b.T)
 
     def _draw_debug(self, context: "RasterizerContext"):
-        self._draw_debug_probes(context, lambda depth: depth >= gs.EPS)
+        def mask(envs_idx):
+            depth = self.read_ground_truth(envs_idx)
+            if self._options.history_length > 0:
+                depth = depth.select(1 if self._manager._sim.n_envs > 0 else 0, -1)
+            return depth >= gs.EPS
+
+        self._draw_debug_probes(context, self._tactile_color_groups_fn(mask))
 
 
 @dataclass
 class ContactProbeMetadata(ContactDepthProbeMetadata):
     contact_threshold: torch.Tensor = make_tensor_field((0,))
-    # Per-probe threshold scattered into intermediate-cache layout, computed lazily on first `_post_process`.
+    release_threshold: torch.Tensor = make_tensor_field((0,))
+    # Per-probe thresholds scattered into intermediate-cache layout, computed lazily on first `_post_process`.
     threshold_row: torch.Tensor = make_tensor_field((0,))
+    release_threshold_row: torch.Tensor = make_tensor_field((0,))
 
 
 class ContactProbeSensor(ContactDepthProbeSensor, SimpleSensor[ContactProbeOptions, ContactProbeMetadata, tuple]):
-    """Returns boolean contact per probe (depth > threshold). Shares the depth-probe kernel."""
+    """
+    Returns boolean contact per probe with optional Schmitt-trigger hysteresis. Shares the depth-probe kernel.
+
+    The contact bit latches on when depth exceeds ``contact_threshold`` and releases when depth drops to or below
+    ``release_threshold``. When ``release_threshold`` is left unset (the default; it then falls back to
+    ``contact_threshold``), the latch is degenerate and behavior matches a stateless threshold. Latch state is read
+    from the per-branch return-space ring, so GT and measured branches latch independently and reset cleanly with
+    the env (the manager zeros the ring on reset).
+    """
 
     def build(self):
         super().build()
         self._shared_metadata.contact_threshold = concat_with_tensor(
             self._shared_metadata.contact_threshold, self._options.contact_threshold, expand=(1,)
         )
+        release = (
+            self._options.contact_threshold
+            if self._options.release_threshold is None
+            else self._options.release_threshold
+        )
+        self._shared_metadata.release_threshold = concat_with_tensor(
+            self._shared_metadata.release_threshold, release, expand=(1,)
+        )
 
     @classmethod
     def _get_cache_dtype(cls) -> torch.dtype:
@@ -483,15 +544,26 @@ def _post_process(
             i_p = torch.arange(shared_metadata.total_n_probes, device=gs.device, dtype=gs.tc_int)
             i_s = shared_metadata.probe_sensor_idx
             cache_idx = shared_metadata.sensor_cache_start[i_s] + i_p - shared_metadata.sensor_probe_start[i_s]
-            row = torch.zeros((tensor.shape[1],), dtype=tensor.dtype, device=gs.device)
-            row.scatter_(
-                0, cache_idx.to(dtype=torch.int64), shared_metadata.contact_threshold[i_s].to(dtype=tensor.dtype)
-            )
-            shared_metadata.threshold_row = row
-        return tensor > shared_metadata.threshold_row.unsqueeze(0)
+            cache_idx_64 = cache_idx.to(dtype=torch.int64)
+            enter_row = torch.zeros((tensor.shape[1],), dtype=tensor.dtype, device=gs.device)
+            enter_row.scatter_(0, cache_idx_64, shared_metadata.contact_threshold[i_s].to(dtype=tensor.dtype))
+            release_row = torch.zeros((tensor.shape[1],), dtype=tensor.dtype, device=gs.device)
+            release_row.scatter_(0, cache_idx_64, shared_metadata.release_threshold[i_s].to(dtype=tensor.dtype))
+            shared_metadata.threshold_row = enter_row
+            shared_metadata.release_threshold_row = release_row
+        above_enter = tensor > shared_metadata.threshold_row.unsqueeze(0)
+        above_release = tensor > shared_metadata.release_threshold_row.unsqueeze(0)
+        prev_state = timeline.at(0, copy=False)
+        return above_enter | (prev_state & above_release)
 
     def _draw_debug(self, context: "RasterizerContext"):
-        self._draw_debug_probes(context, lambda data: data)
+        def mask(envs_idx):
+            contact = self.read_ground_truth(envs_idx)
+            if self._options.history_length > 0:
+                contact = contact.select(1 if self._manager._sim.n_envs > 0 else 0, -1)
+            return contact
+
+        self._draw_debug_probes(context, self._tactile_color_groups_fn(mask))
 
 
 class KinematicTaxelData(NamedTuple):
@@ -508,15 +580,188 @@ class KinematicTaxelData(NamedTuple):
 
 
 @dataclass
-class KinematicTaxelMetadata(ProbesWithNormalSensorMetadataMixin, RigidSensorMetadataMixin, SimpleSensorMetadata):
+class KinematicTaxelMetadata(
+    ViscoelasticHysteresisMetadataMixin,
+    GridFFTConvMetadataMixin,
+    ProbesWithNormalSensorMetadataMixin,
+    RigidSensorMetadataMixin,
+    SimpleSensorMetadata,
+):
     normal_stiffness: torch.Tensor = make_tensor_field((0,))
     normal_damping: torch.Tensor = make_tensor_field((0,))
     normal_exponent: torch.Tensor = make_tensor_field((0,))
     shear_scalar: torch.Tensor = make_tensor_field((0,))
     twist_scalar: torch.Tensor = make_tensor_field((0,))
 
+    # Spatial crosstalk reuses the shared ``GridFFTConvMetadataMixin`` state. ``grid_fft_meta`` tuples for this
+    # sensor are ``(sensor_idx, g_ny, g_nx, probe_start, cache_start, sigma, strength, spacing_u, spacing_v)``;
+    # the kernel is a combined ``(1 - strength) * identity + strength * Gaussian / sum(Gaussian)`` blur and the
+    # per-step buffer has 6 channels (force xyz + torque xyz).
+
+
+@torch.jit.script
+def _precompute_crosstalk_kernel_fft(
+    sigma: float,
+    strength: float,
+    grid_spacing: tuple[float, float],
+    fft_n: tuple[int, int],
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Combined ``(1 - strength) * identity + strength * Gaussian/sum(Gaussian)`` kernel, real-FFT'd.
+
+    Kernel is centered on the FFT origin via ``ifftshift`` so circular convolution is equivalent to convolution with
+    a kernel anchored at the taxel itself. The Gaussian is L1-normalized so a uniform field passes through unchanged
+    (DC bin = 1); the identity-blend keeps the response peaked at the source taxel and the rest leaked into the
+    Gaussian skirt. The output is a complex ``(fft_n[0], fft_n[1] // 2 + 1)`` half-spectrum ready to multiply against
+    ``rfft2(field)``.
+    """
+    i = torch.arange(fft_n[0], dtype=dtype, device=device)
+    j = torch.arange(fft_n[1], dtype=dtype, device=device)
+    yy, xx = torch.meshgrid((i - fft_n[0] // 2) * grid_spacing[0], (j - fft_n[1] // 2) * grid_spacing[1], indexing="ij")
+    sigma_t = torch.tensor(sigma, dtype=dtype, device=device)
+    g = torch.exp(-(xx * xx + yy * yy) / (2.0 * sigma_t * sigma_t))
+    g = g / g.sum()
+    # Identity in centered layout: 1 at the central cell. ``ifftshift`` then aligns it with FFT index 0.
+    identity = torch.zeros_like(g)
+    identity[fft_n[0] // 2, fft_n[1] // 2] = 1.0
+    combined = (1.0 - strength) * identity + strength * g
+    combined = torch.fft.ifftshift(combined, dim=(-2, -1))
+    return torch.fft.rfft2(combined)
+
+
+def _crosstalk_kernel_builder(meta_entry: tuple, fft_n: tuple[int, int]) -> torch.Tensor:
+    """``register_grid_fft_sensor`` kernel builder for spatial crosstalk: 1 plane (identity-blended Gaussian).
+
+    ``meta_entry`` is ``(sensor_idx, g_ny, g_nx, probe_start, cache_start, sigma, strength, spacing_u, spacing_v)``.
+    The crosstalk kernel's axis 0 spans ny / tangent_v and axis 1 spans nx / tangent_u, so spacing is passed as
+    ``(spacing_v, spacing_u)``.
+    """
+    _, _, _, _, _, sigma, strength, spacing_u, spacing_v = meta_entry
+    k = _precompute_crosstalk_kernel_fft(sigma, strength, (spacing_v, spacing_u), fft_n, gs.device, gs.tc_float)
+    return k.unsqueeze(0)  # (1, fft_ny, fft_nx) -- single kernel plane
+
+
+def _kinematic_taxel_grid_fft_crosstalk(
+    grid_fft_meta: list[tuple],
+    grid_fft_kernels_stacked: torch.Tensor,
+    cache_data: torch.Tensor,
+    grid_fft_buffer: torch.Tensor,
+    probe_radii: torch.Tensor,
+) -> None:
+    """
+    Apply per-sensor 2D-FFT spatial crosstalk to all 6 channels (force xyz + torque xyz) of every registered
+    grid-crosstalk KinematicTaxel sensor. Mutates ``cache_data`` in place.
+
+    ``cache_data`` is the per-class intermediate cache in ``(B, total_cols)`` layout. Each KinematicTaxel sensor's
+    slice spans ``2 * n_probes * 3`` columns: 3 force xyz cols per probe, then 3 torque xyz cols per probe.
+    """
+    if not grid_fft_meta:
+        return
+    B = cache_data.shape[0]
+    fft_ny, fft_nx = grid_fft_buffer.shape[-2], grid_fft_buffer.shape[-1]
+
+    # 1) Fill the active region of the buffer. The zero-padding region is never written here and stays zero from
+    # allocation (``register_grid_fft_sensor`` allocates with ``torch.zeros``); the active ``[:g_ny, :g_nx]`` region
+    # is fully overwritten every step, so no per-step ``zero_()`` is needed.
+    for grid_pos, (_, g_ny, g_nx, _, cache_start, _, _, _, _) in enumerate(grid_fft_meta):
+        n_probes = g_ny * g_nx
+        # Layout in cache: force.xyz for all probes, then torque.xyz for all probes. Each block is ``n_probes * 3``
+        # cols, with probe-major ordering matching probe flat index ``iy * nx + ix``.
+        force_block = cache_data[:, cache_start : cache_start + n_probes * 3]
+        torque_block = cache_data[:, cache_start + n_probes * 3 : cache_start + 2 * n_probes * 3]
+        # Reshape (B, ny, nx, 3) -> (B, 3, ny, nx); the slice-assignment accepts the non-contiguous permuted view.
+        grid_fft_buffer[:, grid_pos, 0:3, :g_ny, :g_nx] = force_block.view(B, g_ny, g_nx, 3).permute(0, 3, 1, 2)
+        grid_fft_buffer[:, grid_pos, 3:6, :g_ny, :g_nx] = torque_block.view(B, g_ny, g_nx, 3).permute(0, 3, 1, 2)
+
+    # 2) Batched real FFT over the last two dims; kernel is per-sensor, broadcast over B and 6 channels. Inputs are
+    # real so ``rfft2`` (half spectrum) is ~2x cheaper than the full complex ``fft2``.
+    H_fft = torch.fft.rfft2(grid_fft_buffer)  # (B, n_grid_xt, 6, fft_ny, fft_nx // 2 + 1) complex
+    # Stacked kernels: (n_grid_xt, 1, fft_ny, fft_nx // 2 + 1) -> (1, n_grid_xt, 1, ...) for broadcast.
+    K = grid_fft_kernels_stacked.unsqueeze(0)
+    smeared = torch.fft.irfft2(H_fft * K, s=(fft_ny, fft_nx))  # (B, n_grid_xt, 6, fft_ny, fft_nx)
+
+    # 3) Slice each sensor back to its (g_ny, g_nx) grid and write into the cache.
+    for grid_pos, (_, g_ny, g_nx, probe_start, cache_start, _, _, _, _) in enumerate(grid_fft_meta):
+        n_probes = g_ny * g_nx
+        # Zero inactive filler probes (probe_radius == 0): the blur smears neighbour force/torque into their cells.
+        active = (probe_radii[probe_start : probe_start + n_probes] > 0.0).to(smeared.dtype).view(1, 1, g_ny, g_nx)
+        force_smeared = smeared[:, grid_pos, 0:3, :g_ny, :g_nx] * active  # (B, 3, ny, nx)
+        torque_smeared = smeared[:, grid_pos, 3:6, :g_ny, :g_nx] * active
+        # Inverse of the permute used in step 1: (B, 3, ny, nx) -> (B, ny, nx, 3) -> flat (B, ny*nx*3).
+        cache_data[:, cache_start : cache_start + n_probes * 3] = force_smeared.permute(0, 2, 3, 1).reshape(
+            B, n_probes * 3
+        )
+        cache_data[:, cache_start + n_probes * 3 : cache_start + 2 * n_probes * 3] = torque_smeared.permute(
+            0, 2, 3, 1
+        ).reshape(B, n_probes * 3)
+
+
+CrosstalkSharedMetadataT = TypeVar("CrosstalkSharedMetadataT", bound=KinematicTaxelMetadata)
+
+
+class KinematicTaxelCrosstalkMixin(Generic[CrosstalkSharedMetadataT]):
+    """
+    Adds FFT-based spatial crosstalk (Gaussian blur, optionally mixed with identity) to KinematicTaxel on the
+    measured branch. Operates on all 6 channels (force xyz + torque xyz) of every grid-shaped sensor with
+    ``crosstalk_strength > 0``. Must come BEFORE ``SimpleSensor`` and AFTER ``ViscoelasticHysteresisMixin`` in MRO
+    so the data flow is: kernel output -> crosstalk -> hysteresis -> hardware imperfections.
+    """
+
+    _shared_metadata: CrosstalkSharedMetadataT
+
+    def _register_crosstalk(self):
+        """Register this sensor for FFT crosstalk via the shared ``register_grid_fft_sensor`` scaffolding.
+
+        Called only when this sensor has a validated grid layout AND ``crosstalk_strength > 0``. The FFT size is
+        ``(ny, nx)``, padded for the Gaussian tail so circular wrap stays below tolerance.
+        """
+        sm = self._shared_metadata
+        sensor_idx = sm.n_probes_per_sensor.shape[0] - 1  # this sensor was just registered
+        probe_start = int(sm.sensor_probe_start[sensor_idx].item())
+        cache_start = int(sm.sensor_cache_start[sensor_idx].item())
+        g_ny, g_nx = int(self._probe_layout_shape[0]), int(self._probe_layout_shape[1])
+        sigma = float(self._options.crosstalk_sigma)
+        strength = float(self._options.crosstalk_strength)
+        spacing_u = float(self._grid_spacing[0].item())
+        spacing_v = float(self._grid_spacing[1].item())
+        # FFT size per axis: grid extent + the 3-sigma Gaussian tail on each side, rounded up to a power of 2.
+        # Truncating at 3 sigma leaves circular wraparound below ~0.3% (sub-tolerance for a well-localized blur).
+        fft_ny = next_pow2(g_ny + 2 * int(math.ceil(3.0 * sigma / spacing_v)))
+        fft_nx = next_pow2(g_nx + 2 * int(math.ceil(3.0 * sigma / spacing_u)))
+        register_grid_fft_sensor(
+            sm,
+            meta_entry=(sensor_idx, g_ny, g_nx, probe_start, cache_start, sigma, strength, spacing_u, spacing_v),
+            this_fft_n=(fft_ny, fft_nx),
+            kernel_builder=_crosstalk_kernel_builder,
+            n_buffer_channels=6,
+            batch_size=self._manager._sim._B,
+        )
+
+    @classmethod
+    def _apply_transform(
+        cls,
+        shared_metadata: CrosstalkSharedMetadataT,
+        data: torch.Tensor,
+        timeline: "TensorRingBuffer",
+        *,
+        is_measured: bool,
+    ):
+        super()._apply_transform(shared_metadata, data, timeline, is_measured=is_measured)
+        if not is_measured or not shared_metadata.any_grid_fft:
+            return
+        _kinematic_taxel_grid_fft_crosstalk(
+            shared_metadata.grid_fft_meta,
+            shared_metadata.grid_fft_kernels_stacked,
+            data,
+            shared_metadata.grid_fft_buffer,
+            shared_metadata.probe_radii,
+        )
+
 
 class KinematicTaxelSensor(
+    ViscoelasticHysteresisMixin[KinematicTaxelMetadata],
+    KinematicTaxelCrosstalkMixin[KinematicTaxelMetadata],
     KinematicTactileSensorMixin[KinematicTaxelMetadata],
     ProbesWithNormalSensorMixin[KinematicTaxelMetadata],
     RigidSensorMixin[KinematicTaxelMetadata],
@@ -524,8 +769,32 @@ class KinematicTaxelSensor(
 ):
     """Kinematic taxels: spring-damper force and torque per probe from contact geometry and relative motion."""
 
+    # Two channel groups: force xyz followed by torque xyz (probe-major within each group). See
+    # ``ProbeSensorMixin._taxel_channel_groups`` for how this drives dead-taxel cache-col -> probe mapping.
+    _taxel_channel_groups: int = 2
+
     def __init__(self, sensor_options: KinematicTaxelOptions, sensor_idx: int, sensor_manager: "SensorManager"):
         super().__init__(sensor_options, sensor_idx, sensor_manager)
+        # FFT-grid eligibility: validates that a 2D layout has uniform spacing/normals/orthogonal tangents.
+        # Flat pos/normals are already populated by ProbeSensorMixin / ProbesWithNormalSensorMixin.
+        is_grid = len(self._probe_layout_shape) == 2
+        _, _, self._use_grid_fft, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing = (
+            normalize_grid_probe_layout(
+                np.asarray(sensor_options.probe_local_pos, dtype=gs.np_float),
+                np.asarray(sensor_options.probe_local_normal, dtype=gs.np_float),
+                is_grid,
+            )
+        )
+        self._grid_normal = torch.tensor(grid_normal, dtype=gs.tc_float, device=gs.device)
+        self._grid_tangent_u = torch.tensor(grid_tangent_u, dtype=gs.tc_float, device=gs.device)
+        self._grid_tangent_v = torch.tensor(grid_tangent_v, dtype=gs.tc_float, device=gs.device)
+        self._grid_spacing = torch.tensor(grid_spacing, dtype=gs.tc_float, device=gs.device)
+
+        if self._options.crosstalk_strength > 0.0 and not self._use_grid_fft:
+            gs.raise_exception(
+                "KinematicTaxel crosstalk requires a validated grid layout (probe_local_pos shape (ny, nx, 3) with "
+                "uniform spacing, uniform normals, and orthogonal tangents)."
+            )
 
     def build(self):
         super().build()
@@ -546,8 +815,12 @@ def build(self):
             self._shared_metadata.twist_scalar, float(self._options.twist_scalar), expand=(1,)
         )
 
+        if self._options.crosstalk_strength > 0.0:
+            self._register_crosstalk()
+
     def _get_return_format(self) -> tuple[tuple[int, ...], ...]:
-        return (self._n_probes, 3), (self._n_probes, 3)
+        shape = (*self._probe_layout_shape, 3)
+        return shape, shape
 
     @classmethod
     def _get_cache_dtype(cls) -> torch.dtype:
@@ -562,19 +835,20 @@ def _update_current_timestep_data(
         measured_data_timeline: "TensorRingBuffer",
     ):
         solver = shared_metadata.solver
-
-        current_ground_truth_data_T.zero_()
-        measured = measured_data_timeline.at(0, copy=False)
-        measured.zero_()
-        if shared_metadata.measured_scratch_T.shape != current_ground_truth_data_T.shape:
-            shared_metadata.measured_scratch_T = torch.empty_like(current_ground_truth_data_T)
-        measured_cols_b = shared_metadata.measured_scratch_T
-
+        measured, measured_cols_b = get_measured_bufs(
+            shared_metadata, current_ground_truth_data_T, measured_data_timeline
+        )
+        # The measured branch is provably identical to GT (and the kernel can skip recomputing it) when no probe
+        # has a noised sensing radius and no probe has a non-unit measured-branch gain.
+        measured_equals_gt = int(
+            not shared_metadata.has_any_probe_radius_noise and not shared_metadata.has_any_probe_gain
+        )
         _kernel_kinematic_taxel(
             shared_metadata.probe_positions,
             shared_metadata.probe_sensor_idx,
             shared_metadata.probe_radii,
             shared_metadata.probe_radii_noise,
+            shared_metadata.probe_gains,
             shared_metadata.normal_stiffness,
             shared_metadata.normal_damping,
             shared_metadata.normal_exponent,
@@ -592,6 +866,7 @@ def _update_current_timestep_data(
             solver._rigid_global_info,
             solver.collider._sdf._sdf_info,
             gs.EPS,
+            measured_equals_gt,
             current_ground_truth_data_T,
             measured_cols_b,
         )
@@ -600,4 +875,10 @@ def _update_current_timestep_data(
         measured.copy_(measured_cols_b.T)
 
     def _draw_debug(self, context: "RasterizerContext"):
-        self._draw_debug_probes(context, lambda data: torch.linalg.norm(data.force, dim=-1) >= gs.EPS)
+        def mask(envs_idx):
+            force = self.read_ground_truth(envs_idx).force
+            if self._options.history_length > 0:
+                force = force.select(1 if self._manager._sim.n_envs > 0 else 0, -1)
+            return torch.linalg.norm(force, dim=-1) >= gs.EPS
+
+        self._draw_debug_probes(context, self._tactile_color_groups_fn(mask))
diff --git a/genesis/engine/sensors/point_cloud_tactile.py b/genesis/engine/sensors/point_cloud_tactile.py
index 2037c09f65..d7f66a51f5 100644
--- a/genesis/engine/sensors/point_cloud_tactile.py
+++ b/genesis/engine/sensors/point_cloud_tactile.py
@@ -21,6 +21,15 @@
     ProbesWithNormalSensorMetadataMixin,
     ProbesWithNormalSensorMixin,
     func_noised_probe_radius,
+    get_measured_bufs,
+)
+from .tactile_shared import (
+    GridFFTConvMetadataMixin,
+    ViscoelasticHysteresisMetadataMixin,
+    ViscoelasticHysteresisMixin,
+    next_pow2,
+    normalize_grid_probe_layout,
+    register_grid_fft_sensor,
 )
 
 if TYPE_CHECKING:
@@ -411,6 +420,7 @@ def _kernel_point_cloud_proximity_taxel_bvh(
     pc_active_envs_mask: qd.types.ndarray(),
     probe_radii: qd.types.ndarray(),
     probe_radii_noise: qd.types.ndarray(),
+    probe_gains: qd.types.ndarray(),
     stiffness: qd.types.ndarray(),
     shear_coupling: qd.types.ndarray(),
     proximity_density_scale: qd.types.ndarray(),
@@ -550,6 +560,14 @@ def _kernel_point_cloud_proximity_taxel_bvh(
                 fv_m[j] = fv_gt[j]
                 tau_w_m[j] = tau_w_gt[j]
 
+        # Per-(env, probe) gain on the measured-branch accumulated penetration. Force and torque computed from
+        # these accumulators downstream scale linearly with gain because they're proportional to ``sum_p``.
+        gain_m = probe_gains[i_b, i_p]
+        sum_p_m = sum_p_m * gain_m
+        for j in qd.static(range(3)):
+            fv_m[j] = fv_m[j] * gain_m
+            tau_w_m[j] = tau_w_m[j] * gain_m
+
         taxel_signal_buf[i_p, i_b] = sum_p_m
 
         f_w_gt = qd.Vector.zero(gs.qd_float, 3)
@@ -613,7 +631,6 @@ class PointCloudTactileSharedMetadata(ProbeSensorMetadataMixin, RigidSensorMetad
 class PointCloudTactileSensorMixin(ProbeSensorMixin[PointCloudTactileSensorMetadataMixinT]):
     def __init__(self, sensor_options: "SensorOptions", sensor_idx: int, sensor_manager: "SensorManager"):
         super().__init__(sensor_options, sensor_idx, sensor_manager)
-        self._debug_objects: list = []
         self._probe_start_idx = -1
         self._debug_pc_chunks: list[tuple[int, torch.Tensor, torch.Tensor]] | None = None
 
@@ -664,38 +681,24 @@ def build(self):
         )
 
     def _draw_debug_probes(
-        self, context: "RasterizerContext", get_magnitude_1d: Callable[[list[int] | None], np.ndarray]
-    ) -> None:
-        for obj in self._debug_objects:
-            context.clear_debug_object(obj)
-        self._debug_objects.clear()
-
-        envs_idx, n_debug_envs, env_offsets, probe_world = self._compute_probes_world_pos(context)
-
-        magnitude = get_magnitude_1d(envs_idx).reshape(-1)
-        for is_contact in (False, True):
-            (probes_idx,) = np.nonzero(magnitude >= gs.EPS if is_contact else magnitude < gs.EPS)
-            if probes_idx.size == 0:
-                continue
-            spheres_obj = context.draw_debug_spheres(
-                poss=probe_world[probes_idx],
-                radius=self._shared_metadata.probe_radii[self._probe_start_idx].item(),
-                color=self._options.debug_contact_color if is_contact else self._options.debug_probe_color,
-            )
-            self._debug_objects.append(spheres_obj)
+        self,
+        context: "RasterizerContext",
+        color_groups_fn: Callable[[list[int] | None], list[tuple]] | None = None,
+    ) -> tuple[list[int] | None, int, np.ndarray | None]:
+        envs_idx, n_debug_envs, env_offsets = super()._draw_debug_probes(context, color_groups_fn)
 
         if self._debug_pc_chunks is None:
-            return
+            return envs_idx, n_debug_envs, env_offsets
         world_chunks: list[np.ndarray] = []
         for link_idx, pos_local, active_envs_mask in self._debug_pc_chunks:
-            trk_link = self._shared_metadata.solver.links[link_idx]
+            track_link = self._shared_metadata.solver.links[link_idx]
             if envs_idx is not None:
                 active_mask = tensor_to_array(active_envs_mask[:, envs_idx].T).astype(bool)
                 if not active_mask.any():
                     continue
-                trk_pos = trk_link.get_pos(envs_idx)[:, None, :]
-                trk_quat = trk_link.get_quat(envs_idx)[:, None, :]
-                pc_world = gu.transform_by_trans_quat(pos_local[None, :, :], trk_pos, trk_quat)
+                track_pos = track_link.get_pos(envs_idx)[:, None, :]
+                track_quat = track_link.get_quat(envs_idx)[:, None, :]
+                pc_world = gu.transform_by_trans_quat(pos_local[None, :, :], track_pos, track_quat)
                 pc_world = tensor_to_array(pc_world) + env_offsets[:, None, :]
                 world_chunks.append(pc_world[active_mask])
             else:
@@ -703,18 +706,18 @@ def _draw_debug_probes(
                 pos_active = pos_local[active_mask]
                 if pos_active.numel() == 0:
                     continue
-                trk_pos = trk_link.get_pos(envs_idx).reshape(3)
-                trk_quat = trk_link.get_quat(envs_idx).reshape(4)
-                world_chunks.append(tensor_to_array(gu.transform_by_trans_quat(pos_active, trk_pos, trk_quat)))
-        if not world_chunks:
-            return
-        pc_world = np.concatenate(world_chunks, axis=0)
-        pc_obj = context.draw_debug_spheres(
-            poss=pc_world,
-            radius=float(self._options.debug_point_cloud_radius),
-            color=self._options.debug_point_cloud_color,
-        )
-        self._debug_objects.append(pc_obj)
+                track_pos = track_link.get_pos(envs_idx).reshape(3)
+                track_quat = track_link.get_quat(envs_idx).reshape(4)
+                world_chunks.append(tensor_to_array(gu.transform_by_trans_quat(pos_active, track_pos, track_quat)))
+        if world_chunks:
+            self._debug_objects.append(
+                context.draw_debug_spheres(
+                    poss=np.concatenate(world_chunks, axis=0),
+                    radius=float(self._options.debug_point_cloud_radius),
+                    color=self._options.debug_point_cloud_color,
+                )
+            )
+        return envs_idx, n_debug_envs, env_offsets
 
     def _debug_probe_buffer_magnitudes(self, buffer: torch.Tensor, envs_idx: list[int] | None) -> np.ndarray:
         values = buffer[self._probe_start_idx : self._probe_start_idx + self._n_probes]
@@ -731,7 +734,11 @@ class ProximityTaxelData(NamedTuple):
 
 
 @dataclass
-class ProximityTaxelMetadata(PointCloudTactileSharedMetadata, ProbesWithNormalSensorMetadataMixin):
+class ProximityTaxelMetadata(
+    ViscoelasticHysteresisMetadataMixin,
+    PointCloudTactileSharedMetadata,
+    ProbesWithNormalSensorMetadataMixin,
+):
     stiffness: torch.Tensor = make_tensor_field((0,))
     shear_coupling: torch.Tensor = make_tensor_field((0,))
     proximity_density_scale: torch.Tensor = make_tensor_field((0, 0))
@@ -739,6 +746,7 @@ class ProximityTaxelMetadata(PointCloudTactileSharedMetadata, ProbesWithNormalSe
 
 
 class ProximityTaxelSensor(
+    ViscoelasticHysteresisMixin[ProximityTaxelMetadata],
     PointCloudTactileSensorMixin[ProximityTaxelMetadata],
     ProbesWithNormalSensorMixin[ProximityTaxelMetadata],
     RigidSensorMixin[ProximityTaxelMetadata],
@@ -746,6 +754,9 @@ class ProximityTaxelSensor(
 ):
     """Spherical point-cloud taxels: per-taxel force and torque in link-local frame vs tracked meshes."""
 
+    # Two channel groups: force xyz followed by torque xyz (probe-major within each group).
+    _taxel_channel_groups: int = 2
+
     def build(self):
         super().build()
         self._shared_metadata.stiffness = concat_with_tensor(
@@ -769,7 +780,8 @@ def build(self):
         )
 
     def _get_return_format(self) -> tuple[tuple[int, ...], ...]:
-        return ((self._n_probes, 3), (self._n_probes, 3))
+        shape = (*self._probe_layout_shape, 3)
+        return shape, shape
 
     @classmethod
     def _get_cache_dtype(cls) -> torch.dtype:
@@ -789,13 +801,9 @@ def _update_current_timestep_data(
         measured_data_timeline: "TensorRingBuffer",
     ):
         solver = shared_metadata.solver
-        current_ground_truth_data_T.zero_()
-        measured = measured_data_timeline.at(0, copy=False)
-        measured.zero_()
-        if shared_metadata.measured_scratch_T.shape != current_ground_truth_data_T.shape:
-            shared_metadata.measured_scratch_T = torch.empty_like(current_ground_truth_data_T)
-        measured_cols_b = shared_metadata.measured_scratch_T
-
+        measured, measured_cols_b = get_measured_bufs(
+            shared_metadata, current_ground_truth_data_T, measured_data_timeline
+        )
         bvh = shared_metadata.pc_bvh
         _kernel_point_cloud_proximity_taxel_bvh(
             shared_metadata.probe_positions,
@@ -820,6 +828,7 @@ def _update_current_timestep_data(
             shared_metadata.pc_active_envs_mask,
             shared_metadata.probe_radii,
             shared_metadata.probe_radii_noise,
+            shared_metadata.probe_gains,
             shared_metadata.stiffness,
             shared_metadata.shear_coupling,
             shared_metadata.proximity_density_scale,
@@ -836,99 +845,13 @@ def _update_current_timestep_data(
     def _draw_debug(self, context: "RasterizerContext"):
         self._draw_debug_probes(
             context,
-            lambda envs_idx: self._debug_probe_buffer_magnitudes(self._shared_metadata.taxel_signal_buf, envs_idx),
+            self._tactile_color_groups_fn(
+                lambda envs_idx: self._debug_probe_buffer_magnitudes(self._shared_metadata.taxel_signal_buf, envs_idx)
+                >= gs.EPS,
+            ),
         )
 
 
-_GRID_TOL = 1.0e-5
-
-
-def _next_pow2(n: int) -> int:
-    """Smallest power of 2 >= n (1 if n==0)."""
-    if n <= 1:
-        return 1
-    p = 1
-    while p < n:
-        p *= 2
-    return p
-
-
-def _expand_probe_normals(normals: np.ndarray, n_probes: int, probe_shape: tuple[int, ...]) -> np.ndarray:
-    normals = np.asarray(normals, dtype=gs.np_float)
-    if normals.ndim == 1:
-        return np.broadcast_to(normals, (n_probes, 3)).copy()
-    if normals.shape == (*probe_shape, 3):
-        return normals.reshape(n_probes, 3).copy()
-    if normals.shape == (n_probes, 3):
-        return normals.copy()
-    gs.raise_exception(
-        "ElastomerTaxel probe_local_normal must be one normal or match probe_local_pos shape. "
-        f"Got normal shape {normals.shape} for probe shape {probe_shape}."
-    )
-
-
-def _normalize_elastomer_probe_layout(
-    probe_pos: np.ndarray, probe_normals: np.ndarray, is_grid: bool
-) -> tuple[np.ndarray, np.ndarray, bool, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
-    probe_shape = probe_pos.shape[:-1]
-    flat = probe_pos.reshape(-1, 3)
-    normals = _expand_probe_normals(probe_normals, flat.shape[0], probe_shape)
-
-    normal_norms = np.linalg.norm(normals, axis=1)
-    if np.any(normal_norms < gs.EPS):
-        gs.raise_exception("ElastomerTaxel probe_local_normal entries must be non-zero.")
-    normals = normals / normal_norms[:, None]
-
-    use_grid_fft = False
-    grid_normal = np.zeros(3, dtype=gs.np_float)
-    tangent_u = np.zeros(3, dtype=gs.np_float)
-    tangent_v = np.zeros(3, dtype=gs.np_float)
-    grid_spacing = np.zeros(2, dtype=gs.np_float)
-
-    if is_grid:
-        if len(probe_shape) != 2:
-            gs.raise_exception("ElastomerTaxel grid probe_local_pos must have shape (ny, nx, 3).")
-        ny, nx = int(probe_shape[0]), int(probe_shape[1])
-        if nx >= 2 and ny >= 2:
-            grid = probe_pos.reshape(ny, nx, 3)
-            step_u = grid[0, 1] - grid[0, 0]
-            step_v = grid[1, 0] - grid[0, 0]
-            spacing_u = float(np.linalg.norm(step_u))
-            spacing_v = float(np.linalg.norm(step_v))
-            if spacing_u >= gs.EPS and spacing_v >= gs.EPS:
-                tangent_u_candidate = (step_u / spacing_u).astype(gs.np_float)
-                tangent_v_candidate = (step_v / spacing_v).astype(gs.np_float)
-                normal_candidate = normals[0].astype(gs.np_float, copy=False)
-                normals_are_uniform = bool(np.all(normals @ normal_candidate >= 1.0 - _GRID_TOL))
-                axes_are_orthogonal = abs(float(tangent_u_candidate @ tangent_v_candidate)) <= _GRID_TOL
-                axes_in_plane = (
-                    abs(float(tangent_u_candidate @ normal_candidate)) <= _GRID_TOL
-                    and abs(float(tangent_v_candidate @ normal_candidate)) <= _GRID_TOL
-                )
-                expected = (
-                    grid[0, 0]
-                    + np.arange(nx, dtype=gs.np_float)[None, :, None] * step_u[None, None, :]
-                    + np.arange(ny, dtype=gs.np_float)[:, None, None] * step_v[None, None, :]
-                )
-                is_regular = bool(np.max(np.linalg.norm(grid - expected, axis=-1)) <= _GRID_TOL)
-                use_grid_fft = normals_are_uniform and axes_are_orthogonal and axes_in_plane and is_regular
-                if use_grid_fft:
-                    grid_normal = normal_candidate
-                    tangent_u = tangent_u_candidate
-                    tangent_v = tangent_v_candidate
-                    grid_spacing = np.array((spacing_u, spacing_v), dtype=gs.np_float)
-
-    return (
-        flat.astype(gs.np_float, copy=False),
-        normals.astype(gs.np_float, copy=False),
-        use_grid_fft,
-        grid_normal.astype(gs.np_float, copy=False),
-        tangent_u.astype(gs.np_float, copy=False),
-        tangent_v.astype(gs.np_float, copy=False),
-        grid_spacing.astype(gs.np_float, copy=False),
-    )
-
-
 @qd.func
 def _func_elastomer_min_sdf_over_active_geoms(
     i_b: int,
@@ -1000,17 +923,18 @@ def _func_elastomer_update_surface_anchor(
 @qd.func
 def _func_elastomer_direct_dilate_contribution(
     source_pos: qd.types.vector(3),
-    source_normal: qd.types.vector(3),
     target_pos: qd.types.vector(3),
     target_normal: qd.types.vector(3),
     depth: float,
     lam: float,
     scale: float,
+    normal_exponent: float,
 ) -> qd.types.vector(3):
-    source_contact_pos = source_pos - source_normal * depth
-    diff = target_pos - source_contact_pos
-    planar_diff = _func_elastomer_tangent(diff, target_normal)
-    return diff * depth * qd.exp(-lam * planar_diff.dot(planar_diff)) * scale
+    # Tangential marker spreading is linear in penetration depth; the out-of-plane bulge follows a
+    # ``depth ** normal_exponent`` power law (mirrors the FFT path's H / H**normal_exponent channel split).
+    planar_diff = _func_elastomer_tangent(target_pos - source_pos, target_normal)
+    falloff = qd.exp(-lam * planar_diff.dot(planar_diff)) * scale
+    return (planar_diff * depth + target_normal * qd.pow(depth, normal_exponent)) * falloff
 
 
 @qd.func
@@ -1054,19 +978,37 @@ def _collect_collision_geom_idx(solver, track_link_idx: np.ndarray) -> tuple[tor
 def _precompute_hydroshear_dilate_kernel_fft(
     lambda_d: float, grid_spacing: tuple[float, float], fft_n: tuple[int, int], device: torch.device, dtype: torch.dtype
 ) -> torch.Tensor:
-    i = torch.arange(fft_n[0], dtype=dtype, device=device)
-    j = torch.arange(fft_n[1], dtype=dtype, device=device)
-    xx, yy = torch.meshgrid((i - fft_n[0] // 2) * grid_spacing[0], (j - fft_n[1] // 2) * grid_spacing[1], indexing="ij")
-    g = torch.exp(torch.tensor(-lambda_d, dtype=dtype, device=device) * (xx * xx + yy * yy))
-    k = torch.stack((xx * g, yy * g, g), dim=0)
+    """Real FFT of the 3-plane HydroShear dilation kernel ``(Ku, Kv, Kn)``.
+
+    ``fft_n`` is ``(fft_ny, fft_nx)`` row-major: axis 0 spans the tangent_v direction, axis 1 the tangent_u
+    direction. ``grid_spacing`` is ``(spacing_u, spacing_v)``. The output is a complex
+    ``(3, fft_ny, fft_nx // 2 + 1)`` half-spectrum ready to multiply against ``rfft2(field)``.
+    """
+    iv = torch.arange(fft_n[0], dtype=dtype, device=device)
+    iu = torch.arange(fft_n[1], dtype=dtype, device=device)
+    vv, uu = torch.meshgrid(
+        (iv - fft_n[0] // 2) * grid_spacing[1], (iu - fft_n[1] // 2) * grid_spacing[0], indexing="ij"
+    )
+    g = torch.exp(torch.tensor(-lambda_d, dtype=dtype, device=device) * (uu * uu + vv * vv))
+    k = torch.stack((uu * g, vv * g, g), dim=0)
     k = torch.fft.ifftshift(k, dim=(-2, -1))
-    return torch.fft.fft2(k)
+    return torch.fft.rfft2(k)
+
+
+def _dilate_kernel_builder(meta_entry: tuple, fft_n: tuple[int, int]) -> torch.Tensor:
+    """``register_grid_fft_sensor`` kernel builder for HydroShear dilation: 3 planes ``(Ku, Kv, Kn)``.
+
+    ``meta_entry`` is ``(sensor_idx, g_ny, g_nx, probe_start, cache_start, lambda_d, spacing_u, spacing_v)``.
+    """
+    _, _, _, _, _, lambda_d, spacing_u, spacing_v = meta_entry
+    return _precompute_hydroshear_dilate_kernel_fft(lambda_d, (spacing_u, spacing_v), fft_n, gs.device, gs.tc_float)
 
 
 @qd.kernel(fastcache=True)
 def _kernel_elastomer_probe_depth(
     probe_positions_local: qd.types.ndarray(),
     probe_sensor_idx: qd.types.ndarray(),
+    probe_radii: qd.types.ndarray(),
     links_idx: qd.types.ndarray(),
     sensor_track_geom_start: qd.types.ndarray(),
     sensor_track_geom_n: qd.types.ndarray(),
@@ -1085,6 +1027,10 @@ def _kernel_elastomer_probe_depth(
     n_batches = probe_depth_buf.shape[0]
 
     for i_b, i_p in qd.ndrange(n_batches, total_n_probes):
+        # Inactive filler probe (probe_radius == 0): no SDF query, contributes no dilation.
+        if probe_radii[i_p] <= gs.qd_float(0.0):
+            probe_depth_buf[i_b, i_p] = gs.qd_float(0.0)
+            continue
         i_s = probe_sensor_idx[i_p]
         sensor_link_idx = links_idx[i_s]
         link_pos = links_state.pos[sensor_link_idx, i_b]
@@ -1113,11 +1059,13 @@ def _kernel_elastomer_dilate_accumulate(
     probe_positions_local: qd.types.ndarray(),
     probe_local_normal: qd.types.ndarray(),
     probe_sensor_idx: qd.types.ndarray(),
+    probe_radii: qd.types.ndarray(),
     sensor_cache_start: qd.types.ndarray(),
     sensor_probe_start: qd.types.ndarray(),
     n_probes_per_sensor: qd.types.ndarray(),
     lambda_d: qd.types.ndarray(),
     dilate_scale: qd.types.ndarray(),
+    normal_exponent: qd.types.ndarray(),
     probe_depth_buf: qd.types.ndarray(),
     output: qd.types.ndarray(),
 ):
@@ -1138,8 +1086,15 @@ def _kernel_elastomer_dilate_accumulate(
         cache_start = sensor_cache_start[i_s]
         lam = lambda_d[i_s]
         scale = dilate_scale[i_s]
+        n_exp = normal_exponent[i_s]
         _i_p = i_p - probe_start
 
+        # Inactive filler probe (probe_radius == 0): reads zero, no dilation accumulated.
+        if probe_radii[i_p] <= gs.qd_float(0.0):
+            for k in qd.static(range(3)):
+                output[cache_start + _i_p * 3 + k, i_b] = gs.qd_float(0.0)
+            continue
+
         target_local = _func_vec3_at(probe_positions_local, i_p)
         target_normal = _func_vec3_at(probe_local_normal, i_p)
 
@@ -1151,12 +1106,12 @@ def _kernel_elastomer_dilate_accumulate(
                 continue
             contribution = _func_elastomer_direct_dilate_contribution(
                 _func_vec3_at(probe_positions_local, j_p),
-                _func_vec3_at(probe_local_normal, j_p),
                 target_local,
                 target_normal,
                 src_depth,
                 lam,
                 scale,
+                n_exp,
             )
             for k in qd.static(range(3)):
                 acc[k] = acc[k] + contribution[k]
@@ -1330,6 +1285,7 @@ def _kernel_elastomer_shear_accumulate(
     probe_positions_local: qd.types.ndarray(),
     probe_local_normal: qd.types.ndarray(),
     probe_sensor_idx: qd.types.ndarray(),
+    probe_radii: qd.types.ndarray(),
     sensor_cache_start: qd.types.ndarray(),
     sensor_probe_start: qd.types.ndarray(),
     sensor_pc_start: qd.types.ndarray(),
@@ -1347,8 +1303,8 @@ def _kernel_elastomer_shear_accumulate(
     that are flagged ``surface_initialized`` and sum Gaussian contributions into a register, then
     += the result into ``output``. No atomic_add (each (i_b, i_p) thread owns its output slot).
 
-    Must run after the surface-state kernel AND after the Patch-3 torch cleanup that invalidates
-    ``surface_initialized_buf`` for BVH-pruned points -- otherwise stale True flags from prior
+    Must run after the surface-state kernel AND after the post-kernel ``surface_initialized_buf &= candidate``
+    cleanup that invalidates the flag for BVH-pruned points -- otherwise stale True flags from prior
     steps would corrupt this step's accumulation.
     """
     total_n_probes = probe_positions_local.shape[0]
@@ -1359,6 +1315,9 @@ def _kernel_elastomer_shear_accumulate(
         scale = shear_scale[i_s]
         if scale <= gs.qd_float(0.0):
             continue
+        # Inactive filler probe (probe_radius == 0): reads zero (dilate already wrote 0 to this output slot).
+        if probe_radii[i_p] <= gs.qd_float(0.0):
+            continue
         lam = lambda_s[i_s]
         cache_start = sensor_cache_start[i_s]
         _i_p = i_p - sensor_probe_start[i_s]
@@ -1409,11 +1368,13 @@ def _kernel_elastomer_shear_accumulate(
 
 
 def _elastomer_taxel_grid_fft_dilate(
-    fft_grid_meta: list[tuple[int, int, int, int, int, float, float, float]],
-    fft_grid_kernels_stacked: torch.Tensor,
+    grid_fft_meta: list[tuple],
+    grid_fft_kernels_stacked: torch.Tensor,
     probe_depth_buf: torch.Tensor,
-    fft_depth_buffer: torch.Tensor,
+    probe_radii: torch.Tensor,
+    grid_fft_buffer: torch.Tensor,
     dilate_scale: torch.Tensor,
+    normal_exponent: torch.Tensor,
     grid_normal: torch.Tensor,
     grid_tangent_u: torch.Tensor,
     grid_tangent_v: torch.Tensor,
@@ -1423,59 +1384,73 @@ def _elastomer_taxel_grid_fft_dilate(
     """
     Elastomer marker dilation via 2D FFT in the validated probe tangent basis.
 
-    All grid sensors share the global ``fft_max_n`` (= last two dims of ``fft_depth_buffer``); their
-    kernels are stacked into ``fft_grid_kernels_stacked`` of shape (n_grid, 3, fft_max_n[0],
-    fft_max_n[1]). The four heavy FFTs (fft of H, fft of H*H, ifft for Kx/Ky/Kn) thus run as
-    batched ops over the grid-sensor axis, dropping launches from 4·n_grid to 4. The H-fill and
-    write-back stages remain per-sensor (small Python loops over view/copy and per-sensor tangent
-    decomposition).
+    All grid sensors share the global ``grid_fft_max_n`` (= last two dims of ``grid_fft_buffer``); their
+    kernels are stacked into ``grid_fft_kernels_stacked`` of shape (n_grid, 3, fft_ny, fft_nx). The four heavy
+    FFTs (fft of H, fft of H**normal_exponent, ifft for Ku/Kv/Kn) thus run as batched ops over the grid-sensor
+    axis, dropping
+    launches from 4·n_grid to 4. The H-fill and write-back stages remain per-sensor (small Python loops over
+    view/copy and per-sensor tangent decomposition). Grid axes are ``(ny, nx)`` row-major throughout (matching
+    the probe flat index ``iy * nx + ix``), so no transpose is needed on either the fill or write-back side.
     """
-    if not fft_grid_meta:
+    if not grid_fft_meta:
         return
     n_batches = probe_depth_buf.shape[0]
-
-    # 1) Fill the (B, n_grid, fft_max_nx, fft_max_ny) depth buffer. Per-sensor view+copy only.
-    fft_depth_buffer.zero_()
-    for grid_pos, (_, g_nx, g_ny, probe_start, _, _, _, _) in enumerate(fft_grid_meta):
-        depth_slice = probe_depth_buf[:, probe_start : probe_start + g_nx * g_ny]
-        fft_depth_buffer[:, grid_pos, :g_nx, :g_ny].copy_(depth_slice.view(n_batches, g_ny, g_nx).transpose(1, 2))
-
-    # 2) Batched FFTs across (B, n_grid). Broadcast over B when multiplying by per-sensor kernels.
-    H_fft = torch.fft.fft2(fft_depth_buffer)
-    H2_fft = torch.fft.fft2(fft_depth_buffer * fft_depth_buffer)
-    Kx_all = fft_grid_kernels_stacked[:, 0]  # (n_grid, fft_max_nx, fft_max_ny) complex
-    Ky_all = fft_grid_kernels_stacked[:, 1]
-    Kn_all = fft_grid_kernels_stacked[:, 2]
-    disp_u_all = torch.fft.ifft2(H_fft * Kx_all).real  # (B, n_grid, fft_max_nx, fft_max_ny)
-    disp_v_all = torch.fft.ifft2(H_fft * Ky_all).real
-    disp_n_all = torch.fft.ifft2(H2_fft * Kn_all).real
-
-    # 3) Per-sensor write-back: slice to (g_nx, g_ny), apply scale + tangent decomposition, copy
+    fft_ny, fft_nx = grid_fft_buffer.shape[-2], grid_fft_buffer.shape[-1]
+
+    # 1) Fill the active region of the (B, n_grid, fft_ny, fft_nx) depth buffer. The zero-padding region is never
+    # written here and stays zero from allocation, so no per-step ``zero_()`` is needed.
+    for grid_pos, (_, g_ny, g_nx, probe_start, _, _, _, _) in enumerate(grid_fft_meta):
+        depth_slice = probe_depth_buf[:, probe_start : probe_start + g_ny * g_nx]
+        grid_fft_buffer[:, grid_pos, :g_ny, :g_nx].copy_(depth_slice.view(n_batches, g_ny, g_nx))
+
+    # 2) Batched real FFTs across (B, n_grid). Inputs are real so ``rfft2`` (half spectrum) is ~2x cheaper than the
+    # full complex ``fft2``. Kernels broadcast over B when multiplying.
+    H_fft = torch.fft.rfft2(grid_fft_buffer)
+    # The normal channel follows depth ** normal_exponent, so it convolves the per-grid powered depth field;
+    # the tangential (u, v) channels stay linear in depth and convolve the raw field H.
+    exps = normal_exponent[[meta[0] for meta in grid_fft_meta]].reshape(1, -1, 1, 1)
+    Hp_fft = torch.fft.rfft2(grid_fft_buffer.pow(exps))
+    Ku_all = grid_fft_kernels_stacked[:, 0]  # (n_grid, fft_ny, fft_nx // 2 + 1) complex
+    Kv_all = grid_fft_kernels_stacked[:, 1]
+    Kn_all = grid_fft_kernels_stacked[:, 2]
+    disp_u_all = torch.fft.irfft2(H_fft * Ku_all, s=(fft_ny, fft_nx))  # (B, n_grid, fft_ny, fft_nx)
+    disp_v_all = torch.fft.irfft2(H_fft * Kv_all, s=(fft_ny, fft_nx))
+    disp_n_all = torch.fft.irfft2(Hp_fft * Kn_all, s=(fft_ny, fft_nx))
+
+    # 3) Per-sensor write-back: slice to (g_ny, g_nx), apply scale + tangent decomposition, copy
     # into the sensor's output range. Tangent vectors are per-sensor so can't trivially batch here.
-    for grid_pos, meta in enumerate(fft_grid_meta):
-        sensor_idx, g_nx, g_ny, _, cache_start, _, _, _ = meta
+    for grid_pos, meta in enumerate(grid_fft_meta):
+        sensor_idx, g_ny, g_nx, probe_start, cache_start, _, _, _ = meta
         scale_s = dilate_scale[sensor_idx]
-        disp_u = disp_u_all[:, grid_pos, :g_nx, :g_ny] * scale_s
-        disp_v = disp_v_all[:, grid_pos, :g_nx, :g_ny] * scale_s
-        disp_n = disp_n_all[:, grid_pos, :g_nx, :g_ny] * scale_s
-        # Cache order is probe flat index iy*nx+ix; (g_nx, g_ny) transpose(1, 2).reshape gives (g_ny, g_nx) -> iy*nx+ix.
-        disp_u_flat = disp_u.transpose(1, 2).reshape(n_batches, -1)
-        disp_v_flat = disp_v.transpose(1, 2).reshape(n_batches, -1)
-        disp_n_flat = disp_n.transpose(1, 2).reshape(n_batches, -1)
-        grid_size = g_nx * g_ny * 3
+        disp_u = disp_u_all[:, grid_pos, :g_ny, :g_nx] * scale_s
+        disp_v = disp_v_all[:, grid_pos, :g_ny, :g_nx] * scale_s
+        disp_n = disp_n_all[:, grid_pos, :g_ny, :g_nx] * scale_s
+        # (B, g_ny, g_nx) reshapes directly to the probe flat index iy*nx+ix -- no transpose.
+        disp_u_flat = disp_u.reshape(n_batches, -1)
+        disp_v_flat = disp_v.reshape(n_batches, -1)
+        disp_n_flat = disp_n.reshape(n_batches, -1)
+        grid_size = g_ny * g_nx * 3
         out_block = grid_dilate_out_buffer[:, :grid_size]
         tangent_u = grid_tangent_u[sensor_idx]
         tangent_v = grid_tangent_v[sensor_idx]
         normal = grid_normal[sensor_idx]
+        # Zero inactive filler probes (probe_radius == 0): they are non-sources, but the FFT still smears
+        # neighbour dilation into their cells, so mask the per-probe write-back.
+        active = (probe_radii[probe_start : probe_start + g_ny * g_nx] > 0.0).to(disp_u_flat.dtype)
         for k in range(3):
             out_block[:, k:grid_size:3] = (
                 disp_u_flat * tangent_u[k] + disp_v_flat * tangent_v[k] + disp_n_flat * normal[k]
-            )
+            ) * active
         output[cache_start : cache_start + grid_size].copy_(out_block.T)
 
 
 @dataclass
-class ElastomerTaxelSensorMetadata(PointCloudTactileSharedMetadata, ProbesWithNormalSensorMetadataMixin):
+class ElastomerTaxelSensorMetadata(
+    ViscoelasticHysteresisMetadataMixin,
+    GridFFTConvMetadataMixin,
+    PointCloudTactileSharedMetadata,
+    ProbesWithNormalSensorMetadataMixin,
+):
     track_geom_idx: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
     track_geom_active_envs_mask: torch.Tensor = make_tensor_field((0, 0), dtype_factory=lambda: gs.tc_bool)
     sensor_track_geom_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
@@ -1490,6 +1465,7 @@ class ElastomerTaxelSensorMetadata(PointCloudTactileSharedMetadata, ProbesWithNo
     lambda_s: torch.Tensor = make_tensor_field((0,))
     dilate_scale: torch.Tensor = make_tensor_field((0,))
     shear_scale: torch.Tensor = make_tensor_field((0,))
+    normal_exponent: torch.Tensor = make_tensor_field((0,))
     elastomer_contact_sdf_enter: torch.Tensor = make_tensor_field((0,))
     elastomer_contact_sdf_exit: torch.Tensor = make_tensor_field((0,))
 
@@ -1504,27 +1480,15 @@ class ElastomerTaxelSensorMetadata(PointCloudTactileSharedMetadata, ProbesWithNo
     # stale surface_initialized / surface_entry_pos for points the BVH skipped this step.
     surface_candidate_buf: torch.Tensor = make_tensor_field((0, 0), dtype_factory=lambda: gs.tc_bool)
 
-    is_grid: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_bool)
+    # Per-sensor flag selecting the FFT dilation path vs the direct (non-grid) dilation kernel.
     use_grid_fft: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_bool)
-    grid_n: torch.Tensor = make_tensor_field((0, 2), dtype_factory=lambda: gs.tc_int)
-    grid_spacing: torch.Tensor = make_tensor_field((0, 2))
+    # Per-grid-FFT-sensor tangent basis, consumed by the dilation write-back. ``grid_fft_meta`` tuples for this
+    # sensor are ``(sensor_idx, g_ny, g_nx, probe_start, cache_start, lambda_d, spacing_u, spacing_v)``.
     grid_normal: torch.Tensor = make_tensor_field((0, 3))
     grid_tangent_u: torch.Tensor = make_tensor_field((0, 3))
     grid_tangent_v: torch.Tensor = make_tensor_field((0, 3))
-    # Stacked complex FFT kernels for all grid-FFT sensors, shape (n_grid, 3, fft_max_n[0],
-    # fft_max_n[1]). All sensors share fft_max_n so torch.fft.fft2 batches across the grid axis
-    # in a single launch. When a new grid sensor expands fft_max_n at build time, prior sensors'
-    # kernels are recomputed at the new size so the stack stays uniform.
-    fft_grid_kernels_stacked: torch.Tensor = make_tensor_field((0, 0, 0, 0), dtype_factory=lambda: torch.complex64)
-    fft_depth_buffer: torch.Tensor = make_tensor_field((0, 0, 0, 0))
+    # Scratch for the per-sensor tangent-decomposition write-back, lazily grown to the largest grid.
     grid_dilate_out_buffer: torch.Tensor = make_tensor_field((0, 0))
-    # Per-grid-FFT-sensor metadata captured at build time. Tuple fields:
-    # (sensor_idx, g_nx, g_ny, probe_start, cache_start, lambda_d, spacing_u, spacing_v).
-    # Indexed positionally to match rows of fft_grid_kernels_stacked / sensor axis of
-    # fft_depth_buffer. Iterating it directly avoids per-step .item() device syncs.
-    fft_grid_meta: list[tuple[int, int, int, int, int, float, float, float]] = field(default_factory=list)
-    # Global max FFT size across all grid sensors. Mutated only at build time.
-    fft_max_n: tuple[int, int] = (0, 0)
 
     # True iff at least one configured ElastomerTaxel has shear_scale > 0. Set during build by OR-ing
     # each sensor's value, so per-step gating avoids an O(n_sensors) reduction + device sync.
@@ -1532,6 +1496,7 @@ class ElastomerTaxelSensorMetadata(PointCloudTactileSharedMetadata, ProbesWithNo
 
 
 class ElastomerTaxelSensor(
+    ViscoelasticHysteresisMixin[ElastomerTaxelSensorMetadata],
     PointCloudTactileSensorMixin[ElastomerTaxelSensorMetadata],
     ProbesWithNormalSensorMixin[ElastomerTaxelSensorMetadata],
     RigidSensorMixin[ElastomerTaxelSensorMetadata],
@@ -1539,20 +1504,15 @@ class ElastomerTaxelSensor(
 ):
     def __init__(self, sensor_options: ElastomerTaxelSensorOptions, sensor_idx: int, sensor_manager: "SensorManager"):
         super().__init__(sensor_options, sensor_idx, sensor_manager)
-
-        self._is_grid = self._probe_local_pos.ndim > 2
-        self._shape = self._probe_local_pos.shape[:-1]
-
-        (probe_pos, probe_normals, use_grid_fft, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing) = (
-            _normalize_elastomer_probe_layout(
+        # FFT-grid eligibility check (flat pos/normals are already populated by the base mixins).
+        is_grid = len(self._probe_layout_shape) == 2
+        _, _, self._use_grid_fft, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing = (
+            normalize_grid_probe_layout(
                 np.asarray(sensor_options.probe_local_pos, dtype=gs.np_float),
                 np.asarray(sensor_options.probe_local_normal, dtype=gs.np_float),
-                self._is_grid,
+                is_grid,
             )
         )
-        self._probe_local_pos = torch.tensor(probe_pos, dtype=gs.tc_float, device=gs.device)
-        self._probe_local_normal = torch.tensor(probe_normals, dtype=gs.tc_float, device=gs.device)
-        self._use_grid_fft = use_grid_fft
         self._grid_normal = torch.tensor(grid_normal, dtype=gs.tc_float, device=gs.device)
         self._grid_tangent_u = torch.tensor(grid_tangent_u, dtype=gs.tc_float, device=gs.device)
         self._grid_tangent_v = torch.tensor(grid_tangent_v, dtype=gs.tc_float, device=gs.device)
@@ -1613,11 +1573,12 @@ def build(self):
         self._shared_metadata.dilate_scale = concat_with_tensor(
             self._shared_metadata.dilate_scale, float(self._options.dilate_scale), expand=(1,)
         )
+        self._shared_metadata.normal_exponent = concat_with_tensor(
+            self._shared_metadata.normal_exponent, float(self._options.normal_exponent), expand=(1,)
+        )
         self._shared_metadata.shear_scale = concat_with_tensor(
             self._shared_metadata.shear_scale, float(self._options.shear_scale), expand=(1,)
         )
-        if float(self._options.shear_scale) > 0.0:
-            self._shared_metadata.any_shear = True
         self._shared_metadata.elastomer_contact_sdf_enter = concat_with_tensor(
             self._shared_metadata.elastomer_contact_sdf_enter,
             float(self._options.elastomer_contact_sdf_enter),
@@ -1628,6 +1589,8 @@ def build(self):
             float(self._options.elastomer_contact_sdf_exit),
             expand=(1,),
         )
+        if float(self._options.shear_scale) > 0.0:
+            self._shared_metadata.any_shear = True
 
         self._shared_metadata.probe_depth_buf = torch.zeros(
             (B, self._shared_metadata.total_n_probes), dtype=gs.tc_float, device=gs.device
@@ -1648,61 +1611,40 @@ def build(self):
             (B, total_n_surface), dtype=gs.tc_bool, device=gs.device
         )
 
-        self._shared_metadata.is_grid = concat_with_tensor(self._shared_metadata.is_grid, self._is_grid, expand=(1,))
         self._shared_metadata.use_grid_fft = concat_with_tensor(
             self._shared_metadata.use_grid_fft, self._use_grid_fft, expand=(1,)
         )
 
-        grid_n = torch.tensor((0, 0), dtype=gs.tc_int, device=gs.device)
-        grid_spacing = torch.tensor((0.0, 0.0), dtype=gs.tc_float, device=gs.device)
         grid_normal = torch.zeros(3, dtype=gs.tc_float, device=gs.device)
         grid_tangent_u = torch.zeros(3, dtype=gs.tc_float, device=gs.device)
         grid_tangent_v = torch.zeros(3, dtype=gs.tc_float, device=gs.device)
         if self._use_grid_fft:
-            nx, ny = int(self._shape[1]), int(self._shape[0])
-            grid_n = torch.tensor((nx, ny), dtype=gs.tc_int, device=gs.device)
-            grid_spacing = self._grid_spacing
+            nx, ny = int(self._probe_layout_shape[1]), int(self._probe_layout_shape[0])
             grid_normal = self._grid_normal
             grid_tangent_u = self._grid_tangent_u
             grid_tangent_v = self._grid_tangent_v
-            spacing_u, spacing_v = float(grid_spacing[0].item()), float(grid_spacing[1].item())
-            this_fft_n = tuple(_next_pow2(2 * n - 1) for n in (nx, ny))
+            spacing_u, spacing_v = float(self._grid_spacing[0].item()), float(self._grid_spacing[1].item())
+            # FFT size is (ny, nx) row-major. Sizing each axis to ``2n - 1`` (the full linear-convolution support)
+            # rounded up to a power of 2 guarantees zero circular wraparound regardless of the dilation kernel's
+            # decay -- the ``x*g`` / ``y*g`` first-moment kernels decay slower than the Gaussian itself.
+            this_fft_n = (next_pow2(2 * ny - 1), next_pow2(2 * nx - 1))
             cache_start_py = int(self._shared_metadata.sensor_cache_start[self._idx].item())
-            self._shared_metadata.fft_grid_meta.append(
-                (
+            register_grid_fft_sensor(
+                self._shared_metadata,
+                meta_entry=(
                     self._idx,
-                    nx,
                     ny,
+                    nx,
                     self._probe_start_idx,
                     cache_start_py,
                     float(self._options.lambda_d),
                     spacing_u,
                     spacing_v,
-                )
-            )
-
-            # Expand the global FFT size if this sensor needs more padding. When that happens, all
-            # prior grid sensors' kernels are rebuilt at the new size (their FFTs depend on the
-            # transform length, so frequency-domain padding wouldn't be equivalent).
-            prev_max = self._shared_metadata.fft_max_n
-            new_max = (max(prev_max[0], this_fft_n[0]), max(prev_max[1], this_fft_n[1]))
-            self._shared_metadata.fft_max_n = new_max
-            n_grid = len(self._shared_metadata.fft_grid_meta)
-            stacked = torch.empty(
-                (n_grid, 3, new_max[0], new_max[1]),
-                dtype=torch.complex64,
-                device=gs.device,
-            )
-            for grid_pos, (_, _, _, _, _, lam_d, sp_u, sp_v) in enumerate(self._shared_metadata.fft_grid_meta):
-                stacked[grid_pos] = _precompute_hydroshear_dilate_kernel_fft(
-                    lam_d, (sp_u, sp_v), new_max, gs.device, gs.tc_float
-                )
-            self._shared_metadata.fft_grid_kernels_stacked = stacked
-
-            # fft_depth_buffer is keyed by grid-sensor position (not raw sensor i_s), sized at the
-            # current global max FFT size. Reallocate every time we add a grid sensor.
-            self._shared_metadata.fft_depth_buffer = torch.zeros(
-                (B, n_grid, new_max[0], new_max[1]), dtype=gs.tc_float, device=gs.device
+                ),
+                this_fft_n=this_fft_n,
+                kernel_builder=_dilate_kernel_builder,
+                n_buffer_channels=0,
+                batch_size=B,
             )
             grid_size = nx * ny * 3
             out_buf = self._shared_metadata.grid_dilate_out_buffer
@@ -1713,10 +1655,6 @@ def build(self):
                     device=gs.device,
                 )
 
-        self._shared_metadata.grid_n = concat_with_tensor(self._shared_metadata.grid_n, grid_n, expand=(1, 2))
-        self._shared_metadata.grid_spacing = concat_with_tensor(
-            self._shared_metadata.grid_spacing, grid_spacing, expand=(1, 2)
-        )
         self._shared_metadata.grid_normal = concat_with_tensor(
             self._shared_metadata.grid_normal, grid_normal, expand=(1, 3)
         )
@@ -1728,7 +1666,7 @@ def build(self):
         )
 
     def _get_return_format(self) -> tuple[int, ...]:
-        return (self._n_probes, 3)
+        return (*self._probe_layout_shape, 3)
 
     @classmethod
     def _get_cache_dtype(cls) -> torch.dtype:
@@ -1742,6 +1680,27 @@ def reset(cls, shared_metadata: ElastomerTaxelSensorMetadata, shared_ground_trut
         # implicitly invalidated by clearing it; surface_candidate_buf is .zero_()'d at step start.
         shared_metadata.surface_initialized_buf[envs_idx, :] = False
 
+    @classmethod
+    def _apply_transform(
+        cls,
+        shared_metadata: ElastomerTaxelSensorMetadata,
+        data: torch.Tensor,
+        timeline: "TensorRingBuffer",
+        *,
+        is_measured: bool,
+    ):
+        super()._apply_transform(shared_metadata, data, timeline, is_measured=is_measured)
+        if not is_measured:
+            return
+        # ElastomerTaxel's kernel writes a single output used for both GT and measured (measured is .copy_'d from
+        # GT), so per-probe gain is applied here as a post-step multiplication on the measured branch only.
+        # Approximation note: tangential dilation and shear scale linearly with gain (exact), but the H^2
+        # normal-dilation term ideally scales as gain^2 -- here we apply gain^1 across all components. For typical
+        # gains near 1 this is a small error; for large deviations the normal component will be slightly off.
+        cls._maybe_build_cache_col_probe_idx(shared_metadata, data)
+        gain_per_col = shared_metadata.probe_gains[:, shared_metadata.cache_col_probe_idx]
+        data.mul_(gain_per_col)
+
     @classmethod
     def _update_current_timestep_data(
         cls,
@@ -1760,6 +1719,7 @@ def _update_current_timestep_data(
         _kernel_elastomer_probe_depth(
             shared_metadata.probe_positions,
             shared_metadata.probe_sensor_idx,
+            shared_metadata.probe_radii,
             shared_metadata.links_idx,
             shared_metadata.sensor_track_geom_start,
             shared_metadata.sensor_track_geom_n,
@@ -1776,22 +1736,26 @@ def _update_current_timestep_data(
             shared_metadata.probe_positions,
             shared_metadata.probe_local_normal,
             shared_metadata.probe_sensor_idx,
+            shared_metadata.probe_radii,
             shared_metadata.sensor_cache_start,
             shared_metadata.sensor_probe_start,
             shared_metadata.n_probes_per_sensor,
             shared_metadata.lambda_d,
             shared_metadata.dilate_scale,
+            shared_metadata.normal_exponent,
             shared_metadata.probe_depth_buf,
             current_ground_truth_data_T,
         )
         # FFT runs after the qd dilate kernel: on Metal, write-only kernel outputs zero unwritten slots on copy-back,
         # which would erase the grid range the FFT just wrote.
         _elastomer_taxel_grid_fft_dilate(
-            shared_metadata.fft_grid_meta,
-            shared_metadata.fft_grid_kernels_stacked,
+            shared_metadata.grid_fft_meta,
+            shared_metadata.grid_fft_kernels_stacked,
             shared_metadata.probe_depth_buf,
-            shared_metadata.fft_depth_buffer,
+            shared_metadata.probe_radii,
+            shared_metadata.grid_fft_buffer,
             shared_metadata.dilate_scale,
+            shared_metadata.normal_exponent,
             shared_metadata.grid_normal,
             shared_metadata.grid_tangent_u,
             shared_metadata.grid_tangent_v,
@@ -1844,6 +1808,7 @@ def _update_current_timestep_data(
                 shared_metadata.probe_positions,
                 shared_metadata.probe_local_normal,
                 shared_metadata.probe_sensor_idx,
+                shared_metadata.probe_radii,
                 shared_metadata.sensor_cache_start,
                 shared_metadata.sensor_probe_start,
                 shared_metadata.sensor_pc_start,
@@ -1863,6 +1828,10 @@ def _update_current_timestep_data(
         measured.copy_(current_ground_truth_data_T.T)
 
     def _draw_debug(self, context: "RasterizerContext"):
-        self._draw_debug_probes(
-            context, lambda envs_idx: tensor_to_array(torch.linalg.norm(self.read_ground_truth(envs_idx), dim=-1))
-        )
+        def mask(envs_idx):
+            disp = self.read_ground_truth(envs_idx)
+            if self._options.history_length > 0:
+                disp = disp.select(1 if self._manager._sim.n_envs > 0 else 0, -1)
+            return torch.linalg.norm(disp, dim=-1) >= gs.EPS
+
+        self._draw_debug_probes(context, self._tactile_color_groups_fn(mask))
diff --git a/genesis/engine/sensors/probe.py b/genesis/engine/sensors/probe.py
index 92964c69cd..0adda1a94c 100644
--- a/genesis/engine/sensors/probe.py
+++ b/genesis/engine/sensors/probe.py
@@ -1,5 +1,5 @@
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Generic, TypeVar
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Callable, Generic, TypeVar
 
 import numpy as np
 import quadrants as qd
@@ -7,10 +7,12 @@
 
 import genesis as gs
 import genesis.utils.geom as gu
+from genesis.options.sensors.tactile import TactileProbeSensorOptionsMixin
 from genesis.utils.misc import concat_with_tensor, make_tensor_field, tensor_to_array
 
 if TYPE_CHECKING:
     from genesis.options.sensors.options import SensorOptions
+    from genesis.utils.ring_buffer import TensorRingBuffer
     from genesis.vis.rasterizer_context import RasterizerContext
 
     from .sensor_manager import SensorManager
@@ -35,25 +37,65 @@ class ProbeSensorMetadataMixin:
     probe_positions: torch.Tensor = make_tensor_field((0, 3))
     probe_radii: torch.Tensor = make_tensor_field((0,))
     probe_radii_noise: torch.Tensor = make_tensor_field((0,))
+    has_any_probe_radius_noise: bool = False
+    has_any_probe_gain: bool = False
     n_probes_per_sensor: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
     probe_sensor_idx: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
     sensor_cache_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
     sensor_probe_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-    # Class-level scratch for the kernel-writes-(cols, B) -> ring-slot-(B, cols) transpose-copy pattern. Lazy-allocated
-    # on first hot-path call to avoid per-step `torch.empty_like` allocations.
+
     measured_scratch_T: torch.Tensor = make_tensor_field((0, 0))
 
+    probe_gains: torch.Tensor = make_tensor_field((0, 0))
+    probe_gain_resample_low: torch.Tensor = make_tensor_field((0,))
+    probe_gain_resample_high: torch.Tensor = make_tensor_field((0,))
+    probe_has_gain_resample: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_bool)
+    any_gain_resample: bool = False
+
+    dead_taxel_mask: torch.Tensor = make_tensor_field((0, 0), dtype_factory=lambda: gs.tc_bool)
+    dead_taxel_values: torch.Tensor = make_tensor_field((0, 0))
+    dead_taxel_probability: torch.Tensor = make_tensor_field((0,))
+    dead_taxel_value_low: torch.Tensor = make_tensor_field((0,))
+    dead_taxel_value_high: torch.Tensor = make_tensor_field((0,))
+    any_dead_taxel: bool = False
+    dead_mask_per_col: torch.Tensor = make_tensor_field((0, 0), dtype_factory=lambda: gs.tc_bool)
+    dead_values_per_col: torch.Tensor = make_tensor_field((0, 0))
+    dead_dirty: bool = True
+    cache_col_probe_idx: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: torch.long)
+    cache_col_n_channel_groups: list[int] = field(default_factory=list)
+
 
 ProbeSensorSharedMetadataT = TypeVar("ProbeSensorSharedMetadataT", bound=ProbeSensorMetadataMixin)
 
 
+def get_measured_bufs(
+    shared_metadata: "ProbeSensorMetadataMixin",
+    current_ground_truth_data_T: torch.Tensor,
+    measured_data_timeline: "TensorRingBuffer",
+) -> tuple[torch.Tensor, torch.Tensor]:
+    current_ground_truth_data_T.zero_()
+    measured_slot = measured_data_timeline.at(0, copy=False)
+    measured_slot.zero_()
+    if shared_metadata.measured_scratch_T.shape != current_ground_truth_data_T.shape:
+        shared_metadata.measured_scratch_T = torch.empty_like(current_ground_truth_data_T)
+    return measured_slot, shared_metadata.measured_scratch_T
+
+
 class ProbeSensorMixin(Generic[ProbeSensorSharedMetadataT]):
     """Shared logic for registering this sensor's probes in ``ProbeSensorMetadataMixin`` fields."""
 
+    # Number of channel groups per probe in the cache layout. Used by the per-cache-col probe-index builder.
+    _taxel_channel_groups: int = 1
+
     def __init__(self, sensor_options: "SensorOptions", sensor_idx: int, sensor_manager: "SensorManager"):
-        # `_get_return_format` runs inside `super().__init__`, so `_probe_local_pos` / `_n_probes` must already be set.
-        self._probe_local_pos = torch.tensor(sensor_options.probe_local_pos, dtype=gs.tc_float, device=gs.device)
-        self._n_probes = int(np.prod(self._probe_local_pos.shape[:-1]))
+        # `_get_return_format` runs inside `super().__init__`, so `_probe_local_pos` / `_n_probes` /
+        # `_probe_layout_shape` must already be set. ``_probe_layout_shape`` is the input layout sans the trailing
+        # ``xyz`` axis: ``(N,)`` for a flat probe list or ``(M, N)`` for a 2D grid. Probe-axis storage is flat.
+        raw_pos = torch.tensor(sensor_options.probe_local_pos, dtype=gs.tc_float, device=gs.device)
+        self._probe_layout_shape = raw_pos.shape[:-1]
+        self._n_probes = int(np.prod(self._probe_layout_shape))
+        self._probe_local_pos = raw_pos.reshape(self._n_probes, 3).contiguous()
+        self._debug_objects: list = []
         super().__init__(sensor_options, sensor_idx, sensor_manager)
 
     def build(self) -> None:
@@ -81,7 +123,9 @@ def build(self) -> None:
         if isinstance(self._options.probe_radius, float):
             probe_radii = torch.full((self._n_probes,), self._options.probe_radius, dtype=gs.tc_float, device=gs.device)
         else:
-            probe_radii = torch.tensor(self._options.probe_radius, dtype=gs.tc_float, device=gs.device)
+            probe_radii = torch.tensor(self._options.probe_radius, dtype=gs.tc_float, device=gs.device).reshape(
+                self._n_probes
+            )
         self._shared_metadata.probe_radii = concat_with_tensor(
             self._shared_metadata.probe_radii, probe_radii, expand=(self._n_probes,)
         )
@@ -90,6 +134,186 @@ def build(self) -> None:
             torch.full((self._n_probes,), self._options.probe_radius_noise, dtype=gs.tc_float, device=gs.device),
             expand=(self._n_probes,),
         )
+        if self._options.probe_radius_noise > 0.0:
+            self._shared_metadata.has_any_probe_radius_noise = True
+
+        # Tactile-specific options (probe_gain, dead_taxel_*) live on ``TactileProbeSensorOptionsMixin``; generic
+        # probe sensors (e.g. SurfaceDistanceProbe) don't carry them and register defaults (gain 1, no dead).
+        B = self._manager._sim._B
+        opts = self._options
+        is_tactile = isinstance(opts, TactileProbeSensorOptionsMixin)
+        # Initial per-probe gain (probe_gain may be scalar or per-probe array).
+        gain_value = opts.probe_gain if is_tactile else 1.0
+        if isinstance(gain_value, float) or isinstance(gain_value, int):
+            init_gain = torch.full((B, self._n_probes), float(gain_value), dtype=gs.tc_float, device=gs.device)
+            if float(gain_value) != 1.0:
+                self._shared_metadata.has_any_probe_gain = True
+        else:
+            init_gain = (
+                torch.tensor(gain_value, dtype=gs.tc_float, device=gs.device)
+                .reshape(self._n_probes)
+                .unsqueeze(0)
+                .expand(B, self._n_probes)
+                .contiguous()
+            )
+            if not bool((init_gain == 1.0).all().item()):
+                self._shared_metadata.has_any_probe_gain = True
+        self._shared_metadata.probe_gains = concat_with_tensor(
+            self._shared_metadata.probe_gains, init_gain, expand=(B, self._n_probes), dim=1
+        )
+
+        # Per-probe gain resample range (constant across envs). When option is None, write zeros + has_resample=False;
+        # the reset hook gates on ``has_gain_resample`` per probe.
+        resample_range = opts.probe_gain_resample_range if is_tactile else None
+        if resample_range is None:
+            low, high = 0.0, 0.0
+            has_resample = False
+        else:
+            low, high = float(resample_range[0]), float(resample_range[1])
+            has_resample = True
+            self._shared_metadata.any_gain_resample = True
+            # Resampled gain is generally != 1, so the measured branch can't be assumed equal to GT.
+            self._shared_metadata.has_any_probe_gain = True
+        self._shared_metadata.probe_gain_resample_low = concat_with_tensor(
+            self._shared_metadata.probe_gain_resample_low,
+            torch.full((self._n_probes,), low, dtype=gs.tc_float, device=gs.device),
+            expand=(self._n_probes,),
+        )
+        self._shared_metadata.probe_gain_resample_high = concat_with_tensor(
+            self._shared_metadata.probe_gain_resample_high,
+            torch.full((self._n_probes,), high, dtype=gs.tc_float, device=gs.device),
+            expand=(self._n_probes,),
+        )
+        self._shared_metadata.probe_has_gain_resample = concat_with_tensor(
+            self._shared_metadata.probe_has_gain_resample,
+            torch.full((self._n_probes,), has_resample, dtype=gs.tc_bool, device=gs.device),
+            expand=(self._n_probes,),
+        )
+
+        # Per-probe dead taxel configuration (constant across envs).
+        dead_prob = float(opts.dead_taxel_probability) if is_tactile else 0.0
+        dead_range = opts.dead_taxel_value_range if is_tactile else (0.0, 0.0)
+        s_low, s_high = float(dead_range[0]), float(dead_range[1])
+        self._shared_metadata.dead_taxel_probability = concat_with_tensor(
+            self._shared_metadata.dead_taxel_probability,
+            torch.full((self._n_probes,), dead_prob, dtype=gs.tc_float, device=gs.device),
+            expand=(self._n_probes,),
+        )
+        self._shared_metadata.dead_taxel_value_low = concat_with_tensor(
+            self._shared_metadata.dead_taxel_value_low,
+            torch.full((self._n_probes,), s_low, dtype=gs.tc_float, device=gs.device),
+            expand=(self._n_probes,),
+        )
+        self._shared_metadata.dead_taxel_value_high = concat_with_tensor(
+            self._shared_metadata.dead_taxel_value_high,
+            torch.full((self._n_probes,), s_high, dtype=gs.tc_float, device=gs.device),
+            expand=(self._n_probes,),
+        )
+        if dead_prob > 0.0:
+            self._shared_metadata.any_dead_taxel = True
+        # Allocate / extend the per-(env, probe) dead buffers to the new total probe count.
+        self._shared_metadata.dead_taxel_mask = torch.zeros(
+            (B, self._shared_metadata.total_n_probes), dtype=gs.tc_bool, device=gs.device
+        )
+        self._shared_metadata.dead_taxel_values = torch.zeros(
+            (B, self._shared_metadata.total_n_probes), dtype=gs.tc_float, device=gs.device
+        )
+        # Invalidate the lazy cache-col probe index; rebuilt on next dead apply.
+        self._shared_metadata.cache_col_probe_idx = torch.empty((0,), dtype=torch.long, device=gs.device)
+        self._shared_metadata.cache_col_n_channel_groups.append(self._taxel_channel_groups)
+
+    @classmethod
+    def reset(cls, shared_metadata, shared_ground_truth_cache, envs_idx):
+        super().reset(shared_metadata, shared_ground_truth_cache, envs_idx)
+        # Resample per-(env, probe) gain for probes whose sensor configured a resample range.
+        if shared_metadata.any_gain_resample and shared_metadata.probe_gains.numel() > 0:
+            mask = shared_metadata.probe_has_gain_resample.unsqueeze(0)  # (1, total_n_probes)
+            low = shared_metadata.probe_gain_resample_low.unsqueeze(0)
+            high = shared_metadata.probe_gain_resample_high.unsqueeze(0)
+            sub = shared_metadata.probe_gains[envs_idx]
+            new_gain = torch.rand_like(sub) * (high - low) + low
+            shared_metadata.probe_gains[envs_idx] = torch.where(mask, new_gain, sub)
+        # Resample dead mask + values per env for affected probes.
+        if shared_metadata.any_dead_taxel and shared_metadata.dead_taxel_mask.numel() > 0:
+            prob = shared_metadata.dead_taxel_probability.unsqueeze(0)  # (1, total_n_probes)
+            n_envs = shared_metadata.dead_taxel_mask[envs_idx].shape[0]
+            rolls = torch.rand((n_envs, shared_metadata.total_n_probes), device=gs.device, dtype=gs.tc_float)
+            new_mask = rolls < prob
+            shared_metadata.dead_taxel_mask[envs_idx] = new_mask
+            low = shared_metadata.dead_taxel_value_low.unsqueeze(0)
+            high = shared_metadata.dead_taxel_value_high.unsqueeze(0)
+            uniforms = torch.rand((n_envs, shared_metadata.total_n_probes), device=gs.device, dtype=gs.tc_float)
+            shared_metadata.dead_taxel_values[envs_idx] = uniforms * (high - low) + low
+            # The per-cache-column broadcast is now stale; rebuilt on the next `_apply_hardware_imperfections`.
+            shared_metadata.dead_dirty = True
+
+    @gs.assert_built
+    def set_probe_gain(self, value, envs_idx=None):
+        """Set the per-probe measured-branch contact-depth gain for the given envs.
+
+        ``value`` may be a scalar (broadcast to all probes of this sensor), or an array of length ``n_probes``.
+        Affects only the probes registered by this sensor instance.
+        """
+        envs_idx = self._sanitize_envs_idx(envs_idx)
+        probe_start = int(self._shared_metadata.sensor_probe_start[self._idx].item())
+        probe_slice = slice(probe_start, probe_start + self._n_probes)
+        if isinstance(value, (int, float)):
+            row = torch.full((len(envs_idx), self._n_probes), float(value), dtype=gs.tc_float, device=gs.device)
+        else:
+            t = torch.as_tensor(value, dtype=gs.tc_float, device=gs.device).reshape(-1)
+            if t.numel() != self._n_probes:
+                gs.raise_exception(f"set_probe_gain expected {self._n_probes} values, got {t.numel()}.")
+            row = t.unsqueeze(0).expand(len(envs_idx), self._n_probes).contiguous()
+        self._shared_metadata.probe_gains[envs_idx, probe_slice] = row
+        # Conservatively mark gain in use (a user-set gain may be non-unit); never reset to False.
+        self._shared_metadata.has_any_probe_gain = True
+
+    @classmethod
+    def _apply_hardware_imperfections(cls, shared_metadata, measured_slot_0):
+        super()._apply_hardware_imperfections(shared_metadata, measured_slot_0)
+        if not shared_metadata.any_dead_taxel:
+            return
+        cls._maybe_build_cache_col_probe_idx(shared_metadata, measured_slot_0)
+        # The per-(env, probe) dead state only changes on reset; broadcast it to per-(env, cache_col) layout once
+        # (when dirty) instead of gathering every step.
+        if shared_metadata.dead_dirty or shared_metadata.dead_mask_per_col.shape != measured_slot_0.shape:
+            idx = shared_metadata.cache_col_probe_idx  # (total_cache_size,)
+            shared_metadata.dead_mask_per_col = shared_metadata.dead_taxel_mask[:, idx]
+            shared_metadata.dead_values_per_col = shared_metadata.dead_taxel_values[:, idx].to(
+                dtype=measured_slot_0.dtype
+            )
+            shared_metadata.dead_dirty = False
+        torch.where(
+            shared_metadata.dead_mask_per_col,
+            shared_metadata.dead_values_per_col,
+            measured_slot_0,
+            out=measured_slot_0,
+        )
+
+    @classmethod
+    def _maybe_build_cache_col_probe_idx(cls, shared_metadata, tensor):
+        n_cols = tensor.shape[1]
+        if shared_metadata.cache_col_probe_idx.shape == (n_cols,):
+            return
+        sizes = shared_metadata.cache_sizes
+        n_probes_per = shared_metadata.n_probes_per_sensor.tolist()
+        probe_starts = shared_metadata.sensor_probe_start.tolist()
+        groups = shared_metadata.cache_col_n_channel_groups
+        # Each sensor's cache columns are ordered (group, probe, component); only the probe axis indexes a probe,
+        # so its slice is a strided arange: arange(n_p) repeated per-component, tiled over the k groups.
+        per_sensor = []
+        for i_s, cache_size in enumerate(sizes):
+            n_p = n_probes_per[i_s]
+            if n_p == 0:
+                continue
+            k = groups[i_s] if i_s < len(groups) else 1
+            components_per_group = cache_size // (k * n_p)
+            cols = torch.arange(n_p, dtype=torch.long, device=gs.device)
+            cols = cols.repeat_interleave(components_per_group).repeat(k)
+            per_sensor.append(cols + probe_starts[i_s])
+        shared_metadata.cache_col_probe_idx = (
+            torch.cat(per_sensor) if per_sensor else torch.empty((0,), dtype=torch.long, device=gs.device)
+        )
 
     @property
     def probe_local_pos(self) -> torch.Tensor:
@@ -128,6 +352,127 @@ def _compute_probes_world_pos(self, context: "RasterizerContext"):
             )
         return envs_idx, n_debug_envs, env_offsets, probe_world.reshape(-1, 3)
 
+    def _draw_probe_spheres(
+        self,
+        context: "RasterizerContext",
+        probe_world: np.ndarray,
+        rgb,
+        probe_radii: np.ndarray | None = None,
+        probe_radii_noise: np.ndarray | None = None,
+    ) -> list:
+        """
+        Draw a small opaque center sphere and a translucent outer sensing sphere at each ``probe_world`` position.
+
+        ``probe_world`` is ``(N, 3)`` (already tiled over rendered envs). ``probe_radii`` and ``probe_radii_noise``
+        are the matching ``(N,)`` per-position nominal sensing radius and additive uniform noise; both default to
+        the per-probe values from shared metadata, tiled to match ``probe_world``. When noise is positive, each
+        outer sphere is drawn at a fresh sample ``clip(r + U(-noise, +noise), 0, inf)`` rounded to the nearest
+        ``noise`` magnitude so the unique-radius batches stay small. Returns the created debug objects.
+        """
+        options = self._options
+        rgb = tuple(float(c) for c in rgb)
+        center_color = (*rgb, 1.0)
+        objs = [
+            context.draw_debug_spheres(
+                poss=probe_world,
+                radius=float(options.debug_probe_center_radius),
+                color=center_color,
+            )
+        ]
+        if options.debug_probe_sphere_opacity <= 0.0:
+            return objs
+        outer_color = (*rgb, float(options.debug_probe_sphere_opacity))
+        probe_start = int(self._shared_metadata.sensor_probe_start[self._idx].item())
+        probe_slice = slice(probe_start, probe_start + self._n_probes)
+        n_tile = probe_world.shape[0] // self._n_probes if self._n_probes > 0 else 0
+        n_tile = max(n_tile, 1)
+        if probe_radii is None:
+            per_probe = tensor_to_array(self._shared_metadata.probe_radii[probe_slice]).reshape(-1)
+            probe_radii = np.tile(per_probe, n_tile)
+        if probe_radii_noise is None:
+            per_probe_noise = tensor_to_array(self._shared_metadata.probe_radii_noise[probe_slice]).reshape(-1)
+            probe_radii_noise = np.tile(per_probe_noise, n_tile)
+        nz = probe_radii_noise > 0.0
+        if nz.any():
+            jitter = np.random.uniform(-1.0, 1.0, size=probe_radii.shape) * probe_radii_noise
+            noisy = np.maximum(0.0, probe_radii + jitter)
+            rounded = probe_radii.astype(float, copy=True)
+            rounded[nz] = np.round(noisy[nz] / probe_radii_noise[nz]) * probe_radii_noise[nz]
+            probe_radii = rounded
+        for r in np.unique(probe_radii):
+            if r <= 0.0:
+                continue
+            mask = probe_radii == r
+            objs.append(
+                context.draw_debug_spheres(
+                    poss=probe_world[mask],
+                    radius=float(r),
+                    color=outer_color,
+                )
+            )
+        return objs
+
+    def _draw_debug_probes(
+        self,
+        context: "RasterizerContext",
+        color_groups_fn: Callable[[list[int] | None], list[tuple]] | None = None,
+    ) -> tuple[list[int] | None, int, np.ndarray | None]:
+        """
+        Generic per-probe debug renderer. Clears prior debug objects, then for each provided color group draws the
+        two-sphere marker (small opaque center + translucent outer sensing sphere) on the selected probe positions.
+
+        ``color_groups_fn(envs_idx)`` returns a list of ``(rgb, mask)`` pairs, where ``rgb`` is a length-3 sequence
+        and ``mask`` is a flat ``(n_debug_envs * n_probes,)`` bool array (or tensor castable to bool) selecting
+        which probe positions take that color. Passing ``None`` falls back to a single group covering every probe
+        in the sensor's ``debug_probe_color`` (no contact-state assumption -- usable by any probe sensor).
+
+        Returns ``(envs_idx, n_debug_envs, env_offsets)`` so subclasses can extend the drawing with additional
+        debug geometry without recomputing the env layout.
+        """
+        for obj in self._debug_objects:
+            context.clear_debug_object(obj)
+        self._debug_objects.clear()
+
+        envs_idx, n_debug_envs, env_offsets, probe_world = self._compute_probes_world_pos(context)
+        probe_start = int(self._shared_metadata.sensor_probe_start[self._idx].item())
+        probe_slice = slice(probe_start, probe_start + self._n_probes)
+        n_tile = max(n_debug_envs, 1)
+        all_radii = np.tile(tensor_to_array(self._shared_metadata.probe_radii[probe_slice]).reshape(-1), n_tile)
+        all_noise = np.tile(tensor_to_array(self._shared_metadata.probe_radii_noise[probe_slice]).reshape(-1), n_tile)
+        if color_groups_fn is None:
+            groups = [(self._options.debug_probe_color, np.ones(probe_world.shape[0], dtype=bool))]
+        else:
+            groups = color_groups_fn(envs_idx)
+        for rgb, mask in groups:
+            mask_arr = np.asarray(tensor_to_array(mask), dtype=bool).reshape(-1)
+            (probes_idx,) = np.nonzero(mask_arr)
+            if probes_idx.size == 0:
+                continue
+            self._debug_objects.extend(
+                self._draw_probe_spheres(
+                    context, probe_world[probes_idx], rgb, all_radii[probes_idx], all_noise[probes_idx]
+                )
+            )
+        return envs_idx, n_debug_envs, env_offsets
+
+    def _tactile_color_groups_fn(
+        self, get_is_contact_flat: Callable[[list[int] | None], object]
+    ) -> Callable[[list[int] | None], list[tuple]]:
+        """
+        Build a ``color_groups_fn`` for the common tactile split: not-in-contact probes get ``debug_probe_color``
+        and in-contact probes get ``debug_contact_color``. The sensor's options must expose
+        ``debug_contact_color`` (i.e. inherit ``TactileProbeSensorOptionsMixin``).
+        """
+
+        def fn(envs_idx):
+            is_contact = np.asarray(tensor_to_array(get_is_contact_flat(envs_idx)), dtype=bool).reshape(-1)
+            return [
+                (self._options.debug_probe_color, ~is_contact),
+                (self._options.debug_contact_color, is_contact),
+            ]
+
+        return fn
+
 
 @dataclass
 class ProbesWithNormalSensorMetadataMixin(ProbeSensorMetadataMixin):
@@ -146,9 +491,11 @@ class ProbesWithNormalSensorMixin(ProbeSensorMixin[ProbesWithNormalSensorSharedM
 
     def __init__(self, sensor_options: "SensorOptions", sensor_idx: int, sensor_manager: "SensorManager"):
         super().__init__(sensor_options, sensor_idx, sensor_manager)
-        self._probe_local_normal = torch.tensor(self._options.probe_local_normal, dtype=gs.tc_float, device=gs.device)
-        if self._probe_local_normal.ndim == 1:
-            self._probe_local_normal = self._probe_local_normal.expand(self._n_probes, 3).contiguous()
+        raw_normal = torch.tensor(self._options.probe_local_normal, dtype=gs.tc_float, device=gs.device)
+        if raw_normal.ndim == 1:
+            self._probe_local_normal = raw_normal.expand(self._n_probes, 3).contiguous()
+        else:
+            self._probe_local_normal = raw_normal.reshape(self._n_probes, 3).contiguous()
 
     def build(self) -> None:
         super().build()
diff --git a/genesis/engine/sensors/surface_distance_probe.py b/genesis/engine/sensors/surface_distance_probe.py
index 858719a4fd..66eaa4ad5f 100644
--- a/genesis/engine/sensors/surface_distance_probe.py
+++ b/genesis/engine/sensors/surface_distance_probe.py
@@ -14,7 +14,12 @@
 from genesis.utils.raycast_qd import get_triangle_vertices
 
 from .base_sensor import RigidSensorMetadataMixin, RigidSensorMixin, SimpleSensor, SimpleSensorMetadata
-from .probe import ProbeSensorMetadataMixin, ProbeSensorMixin, func_noised_probe_radius
+from .probe import (
+    ProbeSensorMetadataMixin,
+    ProbeSensorMixin,
+    func_noised_probe_radius,
+    get_measured_bufs,
+)
 
 if TYPE_CHECKING:
     from genesis.utils.ring_buffer import TensorRingBuffer
@@ -179,13 +184,12 @@ def _kernel_surface_distance_probe(
 
         probe_idx_in_sensor = i_p - sensor_probe_start[i_s]
         cache_start = sensor_cache_start[i_s]
-        probe_global_idx = sensor_probe_start[i_s] + probe_idx_in_sensor
 
         output_gt[cache_start + probe_idx_in_sensor, i_b] = best_dist_gt
         output_measured[cache_start + probe_idx_in_sensor, i_b] = best_dist_m
         for j in qd.static(range(3)):
-            positions_gt[i_b, probe_global_idx, j] = best_point_gt[j]
-            positions_measured[i_b, probe_global_idx, j] = best_point_m[j]
+            positions_gt[i_b, i_p, j] = best_point_gt[j]
+            positions_measured[i_b, i_p, j] = best_point_m[j]
 
 
 @dataclass
@@ -215,7 +219,6 @@ class SurfaceDistanceProbeSensor(
 
     def __init__(self, sensor_options: SurfaceDistanceProbeOptions, sensor_idx: int, sensor_manager: "SensorManager"):
         super().__init__(sensor_options, sensor_idx, sensor_manager)
-        self._debug_objects: list = []
         self._nearest_points_slice: slice | None = None
 
     def _get_return_format(self) -> tuple[tuple[int, ...], ...]:
@@ -272,13 +275,9 @@ def _update_current_timestep_data(
         measured_data_timeline: "TensorRingBuffer",
     ):
         solver = shared_metadata.solver
-        current_ground_truth_data_T.zero_()
-        measured = measured_data_timeline.at(0, copy=False)
-        measured.zero_()
-        if shared_metadata.measured_scratch_T.shape != current_ground_truth_data_T.shape:
-            shared_metadata.measured_scratch_T = torch.empty_like(current_ground_truth_data_T)
-        measured_cols_b = shared_metadata.measured_scratch_T
-
+        measured, measured_cols_b = get_measured_bufs(
+            shared_metadata, current_ground_truth_data_T, measured_data_timeline
+        )
         _kernel_surface_distance_probe(
             shared_metadata.probe_positions,
             shared_metadata.probe_radii,
@@ -316,24 +315,30 @@ def _draw_debug(self, context: "RasterizerContext"):
 
         link_pos = self._link.get_pos(env_idx).squeeze()
         link_quat = self._link.get_quat(env_idx).squeeze()
-        probe_world = tensor_to_array(gu.transform_by_trans_quat(self._probe_local_pos, link_pos, link_quat))
+        probe_world = tensor_to_array(
+            gu.transform_by_trans_quat(self._probe_local_pos.reshape(-1, 3), link_pos, link_quat)
+        ).reshape(-1, 3)
         points = tensor_to_array(self.nearest_points[env_idx]).reshape(-1, 3)
 
+        rgb = tuple(float(c) for c in self._options.debug_probe_color)
+        line_color = (*rgb, 1.0)
+        self._debug_objects.extend(self._draw_probe_spheres(context, probe_world, rgb))
         self._debug_objects.append(
             context.draw_debug_spheres(
-                poss=np.concatenate([probe_world, points]),
-                radius=self._options.debug_sphere_radius,
-                color=self._options.debug_probe_color,
+                poss=points,
+                radius=float(self._options.debug_probe_center_radius),
+                color=line_color,
             )
         )
         for i in range(len(probe_world)):
-            line_obj = context.draw_debug_line(
-                probe_world[i],
-                points[i],
-                radius=self._options.debug_sphere_radius / 4.0,
-                color=self._options.debug_probe_color,
+            self._debug_objects.append(
+                context.draw_debug_line(
+                    probe_world[i],
+                    points[i],
+                    radius=float(self._options.debug_probe_center_radius) / 4.0,
+                    color=line_color,
+                )
             )
-            self._debug_objects.append(line_obj)
 
     @property
     def nearest_points(self) -> torch.Tensor:
diff --git a/genesis/engine/sensors/tactile_shared.py b/genesis/engine/sensors/tactile_shared.py
new file mode 100644
index 0000000000..7613db64a1
--- /dev/null
+++ b/genesis/engine/sensors/tactile_shared.py
@@ -0,0 +1,296 @@
+import math
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Callable, Generic, TypeVar
+
+import numpy as np
+import torch
+
+import genesis as gs
+from genesis.utils.misc import concat_with_tensor, make_tensor_field
+
+if TYPE_CHECKING:
+    from genesis.utils.ring_buffer import TensorRingBuffer
+
+
+_GRID_TOL = 1.0e-5  # Tolerance for grid-regularity / orthogonality / normal-uniformity checks.
+
+
+def next_pow2(n: int) -> int:
+    """Smallest power of 2 >= ``n`` (1 if ``n == 0``)."""
+    if n <= 1:
+        return 1
+    p = 1
+    while p < n:
+        p *= 2
+    return p
+
+
+# ============================ FFT helpers ============================
+
+
+@dataclass
+class GridFFTConvMetadataMixin:
+    """
+    Shared per-sensor-class state for the per-grid 2D-FFT convolution passes.
+
+    Parameters
+    ----------
+    grid_fft_meta : list of tuple
+        Per-grid-FFT-sensor metadata tuples. The leading 5 fields are always
+        ``(sensor_idx, g_ny, g_nx, probe_start, cache_start)``; sensors append their kernel params after that.
+    grid_fft_max_n : (int, int)
+        Global FFT size ``(fft_ny, fft_nx)``, the elementwise max over all registered grid sensors. Build-time only.
+    grid_fft_kernels_stacked : torch.Tensor
+        Stacked complex ``rfft2`` kernels (half spectrum), shape ``(n_grid, n_planes, fft_ny, fft_nx // 2 + 1)``.
+        Recomputed when the FFT size grows.
+    grid_fft_buffer : torch.Tensor
+        Reused per-step real buffer: ``(B, n_grid, n_channels, fft_ny, fft_nx)`` when registered with
+        ``n_buffer_channels > 0``, else ``(B, n_grid, fft_ny, fft_nx)``. Reallocated on each registration.
+    any_grid_fft : bool
+        Python fast-path flag; True iff at least one grid-FFT sensor is registered.
+    """
+
+    grid_fft_meta: list[tuple] = field(default_factory=list)
+    grid_fft_max_n: tuple[int, int] = (0, 0)
+    grid_fft_kernels_stacked: torch.Tensor = make_tensor_field((0, 0, 0, 0), dtype_factory=lambda: torch.complex64)
+    grid_fft_buffer: torch.Tensor = make_tensor_field((0, 0, 0, 0))
+    any_grid_fft: bool = False
+
+
+def register_grid_fft_sensor(
+    metadata: GridFFTConvMetadataMixin,
+    meta_entry: tuple,
+    this_fft_n: tuple[int, int],
+    kernel_builder: Callable[[tuple, tuple[int, int]], torch.Tensor],
+    n_buffer_channels: int,
+    batch_size: int,
+) -> None:
+    """
+    Register one grid-shaped sensor for FFT convolution; (re)build the stacked kernels and the per-step buffer.
+
+    Parameters
+    ----------
+    metadata : GridFFTConvMetadataMixin
+        Shared per-sensor-class FFT state to register into and (re)build.
+    meta_entry : tuple
+        Metadata tuple appended to ``grid_fft_meta``; its leading 5 fields must be
+        ``(sensor_idx, g_ny, g_nx, probe_start, cache_start)``, followed by any sensor-specific kernel params.
+    this_fft_n : (int, int)
+        The ``(ny, nx)`` FFT size this sensor needs. The shared ``grid_fft_max_n`` grows to the elementwise max;
+        when it grows, every prior sensor's kernel is recomputed at the new size (frequency-domain padding is not
+        equivalent to spatial zero-padding).
+    kernel_builder : callable
+        ``kernel_builder(meta_entry, fft_n) -> (n_planes, fft_ny, fft_nx // 2 + 1)`` complex tensor (an ``rfft2``
+        half spectrum). Must be deterministic from the meta tuple, since it is re-invoked whenever the FFT size grows.
+    n_buffer_channels : int
+        When ``> 0``, allocate a 5D ``(B, n_grid, n_buffer_channels, ny, nx)`` per-step buffer; else a 4D
+        ``(B, n_grid, ny, nx)`` one.
+    batch_size : int
+        Number of environments ``B``; the leading dimension of the per-step buffer.
+    """
+    metadata.grid_fft_meta.append(meta_entry)
+    cur = metadata.grid_fft_max_n
+    new_n = (max(cur[0], this_fft_n[0]), max(cur[1], this_fft_n[1]))
+    metadata.grid_fft_max_n = new_n
+    n_grid = len(metadata.grid_fft_meta)
+    metadata.grid_fft_kernels_stacked = torch.stack([kernel_builder(m, new_n) for m in metadata.grid_fft_meta], dim=0)
+    buffer_shape = (
+        (batch_size, n_grid, n_buffer_channels, new_n[0], new_n[1])
+        if n_buffer_channels > 0
+        else (batch_size, n_grid, new_n[0], new_n[1])
+    )
+    metadata.grid_fft_buffer = torch.zeros(buffer_shape, dtype=gs.tc_float, device=gs.device)
+    metadata.any_grid_fft = True
+
+
+def expand_probe_normals(normals: np.ndarray, n_probes: int, probe_shape: tuple[int, ...]) -> np.ndarray:
+    """Broadcast ``normals`` to a flat ``(n_probes, 3)`` array.
+
+    Accepts a single shared normal of shape ``(3,)``, a grid-shaped ``(*probe_shape, 3)`` array, or an already-flat
+    ``(n_probes, 3)``. Any other shape raises.
+    """
+    normals = np.asarray(normals, dtype=gs.np_float)
+    if normals.ndim == 1:
+        return np.broadcast_to(normals, (n_probes, 3)).copy()
+    if normals.shape == (*probe_shape, 3):
+        return normals.reshape(n_probes, 3).copy()
+    if normals.shape == (n_probes, 3):
+        return normals.copy()
+    gs.raise_exception(
+        "probe_local_normal must be one normal or match probe_local_pos shape. "
+        f"Got normal shape {normals.shape} for probe shape {probe_shape}."
+    )
+
+
+def normalize_grid_probe_layout(
+    probe_pos: np.ndarray, probe_normals: np.ndarray, is_grid: bool
+) -> tuple[np.ndarray, np.ndarray, bool, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Validate a probe layout and extract grid-FFT metadata when the layout qualifies.
+
+    Returns ``(flat_positions, flat_normals, use_grid_fft, grid_normal, tangent_u, tangent_v, grid_spacing)``. When
+    the layout is flat (``is_grid=False``) or fails any grid-FFT precondition, ``use_grid_fft`` is False and the
+    tangent / spacing entries are zero.
+
+    Grid-FFT preconditions: shape ``(ny, nx, 3)`` with ``ny, nx >= 2``, normals uniform within tolerance, tangents
+    orthogonal, both tangents in the plane perpendicular to the normal, and all interior probes laid out on a
+    regular ``(spacing_u, spacing_v)`` rectangle.
+    """
+    probe_shape = probe_pos.shape[:-1]
+    flat = probe_pos.reshape(-1, 3)
+    normals = expand_probe_normals(probe_normals, flat.shape[0], probe_shape)
+
+    normal_norms = np.linalg.norm(normals, axis=1)
+    if np.any(normal_norms < gs.EPS):
+        gs.raise_exception("probe_local_normal entries must be non-zero.")
+    normals = normals / normal_norms[:, None]
+
+    use_grid_fft = False
+    grid_normal = np.zeros(3, dtype=gs.np_float)
+    tangent_u = np.zeros(3, dtype=gs.np_float)
+    tangent_v = np.zeros(3, dtype=gs.np_float)
+    grid_spacing = np.zeros(2, dtype=gs.np_float)
+
+    if is_grid:
+        if len(probe_shape) != 2:
+            gs.raise_exception("Grid probe_local_pos must have shape (ny, nx, 3).")
+        ny, nx = int(probe_shape[0]), int(probe_shape[1])
+        if nx >= 2 and ny >= 2:
+            grid = probe_pos.reshape(ny, nx, 3)
+            step_u = grid[0, 1] - grid[0, 0]
+            step_v = grid[1, 0] - grid[0, 0]
+            spacing_u = float(np.linalg.norm(step_u))
+            spacing_v = float(np.linalg.norm(step_v))
+            if spacing_u >= gs.EPS and spacing_v >= gs.EPS:
+                tangent_u_candidate = (step_u / spacing_u).astype(gs.np_float)
+                tangent_v_candidate = (step_v / spacing_v).astype(gs.np_float)
+                normal_candidate = normals[0].astype(gs.np_float, copy=False)
+                normals_are_uniform = bool(np.all(normals @ normal_candidate >= 1.0 - _GRID_TOL))
+                axes_are_orthogonal = abs(float(tangent_u_candidate @ tangent_v_candidate)) <= _GRID_TOL
+                axes_in_plane = (
+                    abs(float(tangent_u_candidate @ normal_candidate)) <= _GRID_TOL
+                    and abs(float(tangent_v_candidate @ normal_candidate)) <= _GRID_TOL
+                )
+                expected = (
+                    grid[0, 0]
+                    + np.arange(nx, dtype=gs.np_float)[None, :, None] * step_u[None, None, :]
+                    + np.arange(ny, dtype=gs.np_float)[:, None, None] * step_v[None, None, :]
+                )
+                is_regular = bool(np.max(np.linalg.norm(grid - expected, axis=-1)) <= _GRID_TOL)
+                use_grid_fft = normals_are_uniform and axes_are_orthogonal and axes_in_plane and is_regular
+                if use_grid_fft:
+                    grid_normal = normal_candidate
+                    tangent_u = tangent_u_candidate
+                    tangent_v = tangent_v_candidate
+                    grid_spacing = np.array((spacing_u, spacing_v), dtype=gs.np_float)
+
+    return (
+        flat.astype(gs.np_float, copy=False),
+        normals.astype(gs.np_float, copy=False),
+        use_grid_fft,
+        grid_normal.astype(gs.np_float, copy=False),
+        tangent_u.astype(gs.np_float, copy=False),
+        tangent_v.astype(gs.np_float, copy=False),
+        grid_spacing.astype(gs.np_float, copy=False),
+    )
+
+
+# ============================ ViscoelasticHysteresis ============================
+
+
+@dataclass
+class ViscoelasticHysteresisMetadataMixin:
+    hysteresis_strength: torch.Tensor = make_tensor_field((0,))
+    hysteresis_alpha: torch.Tensor = make_tensor_field((0,))
+    viscoelastic_xi: torch.Tensor = make_tensor_field((0, 0))
+    viscoelastic_prev_input: torch.Tensor = make_tensor_field((0, 0))
+    viscoelastic_strength_row: torch.Tensor = make_tensor_field((0,))
+    viscoelastic_alpha_row: torch.Tensor = make_tensor_field((0,))
+    has_any_hysteresis: bool = False
+
+
+ViscoelasticHysteresisSharedMetadataT = TypeVar(
+    "ViscoelasticHysteresisSharedMetadataT", bound=ViscoelasticHysteresisMetadataMixin
+)
+
+
+class ViscoelasticHysteresisMixin(Generic[ViscoelasticHysteresisSharedMetadataT]):
+    """
+    Viscoelastic hysteresis (single Maxwell element, equilibrium gain normalized to 1).
+
+    Per simulation step::
+        alpha = exp(-dt / tau)
+        xi    <- alpha * xi + (x - x_prev)
+        output = x + strength * xi
+        x_prev <- x
+
+    After a step input from 0 to X the output jumps to ``X * (1 + strength)`` and decays back to ``X`` with time
+    constant ``tau``. On cyclic input the output overshoots on rising edges and undershoots on falling edges,
+    """
+
+    _shared_metadata: ViscoelasticHysteresisSharedMetadataT
+
+    def build(self):
+        super().build()
+        # ``getattr`` so sensor classes whose options don't declare the hysteresis fields (e.g. ``ContactProbe``,
+        # which inherits the sensor mixin via ``ContactDepthProbeSensor`` but has bool output and isn't a viscoelastic
+        # target) still build cleanly with hysteresis disabled.
+        strength = float(getattr(self._options, "hysteresis_strength", 0.0))
+        tau = float(getattr(self._options, "hysteresis_tau", 0.0))
+        alpha = math.exp(-self._dt / tau) if tau > 0.0 else 0.0
+        self._shared_metadata.hysteresis_strength = concat_with_tensor(
+            self._shared_metadata.hysteresis_strength, strength, expand=(1,)
+        )
+        self._shared_metadata.hysteresis_alpha = concat_with_tensor(
+            self._shared_metadata.hysteresis_alpha, alpha, expand=(1,)
+        )
+        if strength > 0.0 and tau > 0.0:
+            self._shared_metadata.has_any_hysteresis = True
+        # Invalidate lazy rows so they rebuild on first apply against the final cache width. Per-column state tensors
+        # are allocated lazily at the same time, so sensor classes that never enable hysteresis pay no memory cost.
+        self._shared_metadata.viscoelastic_strength_row = torch.empty((0,), dtype=gs.tc_float, device=gs.device)
+        self._shared_metadata.viscoelastic_alpha_row = torch.empty((0,), dtype=gs.tc_float, device=gs.device)
+
+    @classmethod
+    def reset(
+        cls,
+        shared_metadata: ViscoelasticHysteresisSharedMetadataT,
+        shared_ground_truth_cache: torch.Tensor,
+        envs_idx,
+    ):
+        super().reset(shared_metadata, shared_ground_truth_cache, envs_idx)
+        if shared_metadata.viscoelastic_xi.numel() > 0:
+            shared_metadata.viscoelastic_xi[envs_idx] = 0.0
+            shared_metadata.viscoelastic_prev_input[envs_idx] = 0.0
+
+    @classmethod
+    def _apply_transform(
+        cls,
+        shared_metadata: ViscoelasticHysteresisSharedMetadataT,
+        data: torch.Tensor,
+        timeline: "TensorRingBuffer",
+        *,
+        is_measured: bool,
+    ):
+        super()._apply_transform(shared_metadata, data, timeline, is_measured=is_measured)
+        if not is_measured or not shared_metadata.has_any_hysteresis:
+            return
+
+        B, n_cols, *_ = data.shape
+        # Lazily build the per-cache-column strength/alpha rows and state buffers
+        if shared_metadata.viscoelastic_strength_row.shape != (n_cols,):
+            sensor_col_idx = []
+            for i_s, size in enumerate(shared_metadata.cache_sizes):
+                sensor_col_idx.extend([i_s] * size)
+            idx_t = torch.tensor(sensor_col_idx, dtype=torch.long, device=gs.device)
+            shared_metadata.viscoelastic_strength_row = shared_metadata.hysteresis_strength[idx_t].to(dtype=data.dtype)
+            shared_metadata.viscoelastic_alpha_row = shared_metadata.hysteresis_alpha[idx_t].to(dtype=data.dtype)
+            shared_metadata.viscoelastic_xi = torch.zeros((B, n_cols), dtype=data.dtype, device=gs.device)
+            shared_metadata.viscoelastic_prev_input = torch.zeros((B, n_cols), dtype=data.dtype, device=gs.device)
+
+        xi = shared_metadata.viscoelastic_xi
+        prev = shared_metadata.viscoelastic_prev_input
+        xi.mul_(shared_metadata.viscoelastic_alpha_row.unsqueeze(0))
+        xi.add_(data).sub_(prev)
+        prev.copy_(data)
+        data.addcmul_(xi, shared_metadata.viscoelastic_strength_row.unsqueeze(0))
diff --git a/genesis/options/sensors/options.py b/genesis/options/sensors/options.py
index 6ea1223ce3..9d21644114 100644
--- a/genesis/options/sensors/options.py
+++ b/genesis/options/sensors/options.py
@@ -13,15 +13,19 @@
     NonNegativeInt,
     OptionalIArrayType,
     PositiveFArrayType,
+    PositiveFGridType,
     PositiveFloat,
     PositiveVec3IType,
     RotationMatrixType,
+    UnitInterval,
     UnitIntervalVec3Type,
     UnitIntervalVec4Type,
     UnitVec3FArrayType,
+    UnitVec3FGridType,
     UnitVec3FType,
     Vec2FType,
     Vec3FArrayType,
+    Vec3FGridType,
     Vec3FType,
     Vec4FType,
     is_sequence,
@@ -201,25 +205,39 @@ class ProbeSensorOptionsMixin(SensorOptions[SensorT]):
 
     Parameters
     ----------
-    probe_local_pos : array-like[array-like[float, float, float]]
-        Probe positions in link-local frame. One ``(x, y, z)`` per probe.
-    probe_radius : float | array-like[float]
-        Probe sensing radius in meters. A scalar is shared by every probe; an array must match the probe count.
+    probe_local_pos : array-like[array-like[float, float, float]] or shape ``(M, N, 3)`` grid
+        Probe positions in link-local frame. Either a flat ``(N, 3)`` set or a 2D grid ``(M, N, 3)``; the
+        ``read()`` output is reshaped back to match this layout.
+    probe_radius : float | array-like[float] or shape ``(M, N)`` grid
+        Probe sensing radius in meters. A scalar is shared by every probe; an array (or grid) must match the
+        layout of ``probe_local_pos``.
     probe_radius_noise : float
         Additive radius noise in meters used by kernels whose measured branch depends on effective probe radius.
-    debug_probe_color : array-like[float, float, float, float]
-        RGBA color for inactive debug probe spheres.
+    debug_probe_color : array-like[float, float, float]
+        RGB color for debug probe spheres (no alpha; the center sphere is drawn opaque and the outer sphere uses
+        ``debug_probe_sphere_opacity``).
+    debug_probe_center_radius : float
+        Radius in meters of the small opaque marker sphere drawn at each probe position.
+    debug_probe_sphere_opacity : float
+        Alpha (0..1) of the outer translucent sphere drawn at each probe's sensing radius. Set to ``0.0`` to skip.
     """
 
-    probe_local_pos: Vec3FArrayType = ((0.0, 0.0, 0.0),)
-    probe_radius: PositiveFArrayType | PositiveFloat = 0.01
+    probe_local_pos: Vec3FArrayType | Vec3FGridType = ((0.0, 0.0, 0.0),)
+    probe_radius: PositiveFloat | PositiveFArrayType | PositiveFGridType = 0.01
     probe_radius_noise: NonNegativeFloat = 0.0
-    debug_probe_color: UnitIntervalVec4Type = (0.2, 0.6, 1.0, 0.6)
+    debug_probe_color: UnitIntervalVec3Type = (0.2, 0.4, 1.0)
+    debug_probe_center_radius: PositiveFloat = 0.0008
+    debug_probe_sphere_opacity: UnitInterval = 0.3
 
     def model_post_init(self, context: Any) -> None:
         super().model_post_init(context)
-        n_probes = np.array(self.probe_local_pos).reshape(-1, 3).shape[0]
-        _check_len_match(self.probe_radius, n_probes, "probe_radius", "probe_local_pos")
+        n_probes = int(np.prod(np.asarray(self.probe_local_pos).shape[:-1]))
+        if isinstance(self.probe_radius, Sequence):
+            if np.asarray(self.probe_radius).size != n_probes:
+                gs.raise_exception(
+                    f"probe_radius shape {np.asarray(self.probe_radius).shape} must contain "
+                    f"{n_probes} entries to match probe_local_pos."
+                )
 
 
 class ProbesWithNormalSensorOptionsMixin(ProbeSensorOptionsMixin[SensorT]):
@@ -227,16 +245,16 @@ class ProbesWithNormalSensorOptionsMixin(ProbeSensorOptionsMixin[SensorT]):
     Probe options for sensors that also define one normal per probe, or one shared normal.
     """
 
-    probe_local_normal: UnitVec3FArrayType | UnitVec3FType = (0.0, 0.0, 1.0)
+    probe_local_normal: UnitVec3FType | UnitVec3FArrayType | UnitVec3FGridType = (0.0, 0.0, 1.0)
 
     def model_post_init(self, context: Any) -> None:
         super().model_post_init(context)
-        n_probes = np.array(self.probe_local_pos).reshape(-1, 3).shape[0]
-        normals = np.array(self.probe_local_normal)
-        if normals.ndim > 1 and normals.reshape(-1, 3).shape[0] != n_probes:
+        n_probes = int(np.prod(np.asarray(self.probe_local_pos).shape[:-1]))
+        normals = np.asarray(self.probe_local_normal)
+        if normals.ndim > 1 and normals.size // 3 != n_probes:
             gs.raise_exception(
-                "probe_local_normal must be one normal or match probe_local_pos length. "
-                f"Got {normals.reshape(-1, 3).shape[0]} normals and {n_probes} probe positions."
+                "probe_local_normal must be one normal or contain one normal per probe. "
+                f"Got normal shape {normals.shape} for {n_probes} probes."
             )
 
 
@@ -492,18 +510,14 @@ class SurfaceDistanceProbe(
         Probe positions in link-local frame. One (x, y, z) per probe.
     probe_radius : float | array-like[float]
         Maximum sensing range in meters. When no mesh is within this distance, distance is clamped to the probe
-        radius and nearest points is the probe position. Default: 10.0.
+        radius and nearest points is the probe position. Default: 0.5. Also controls the outer debug sphere.
     track_link_idx : array-like[int]
         Global link indices (solver link space) whose mesh geoms are used for distance queries.
-    debug_sphere_radius: float, optional
-        The radius of each debug sphere drawn in the scene. Defaults to 0.008.
     """
 
-    probe_radius: PositiveFArrayType | PositiveFloat = 10.0
+    probe_radius: PositiveFArrayType | PositiveFloat = 0.5
     track_link_idx: IArrayType = Field(default_factory=tuple)
 
-    debug_sphere_radius: PositiveFloat = 0.008
-
     def validate_scene(self, scene: "Scene"):
         super().validate_scene(scene)
         n_links = scene.sim.rigid_solver.n_links
diff --git a/genesis/options/sensors/tactile.py b/genesis/options/sensors/tactile.py
index 230e0599cd..337a98af6a 100644
--- a/genesis/options/sensors/tactile.py
+++ b/genesis/options/sensors/tactile.py
@@ -1,27 +1,32 @@
-from typing import TYPE_CHECKING, Annotated, Any, Sequence
+from typing import TYPE_CHECKING, Any
 
+import numpy as np
 from pydantic import Field, StrictBool
 
 import genesis as gs
 from genesis.typing import (
+    FArrayType,
+    FGridType,
     IArrayType,
     NonNegativeFloat,
     NonNegativeInt,
-    NumericType,
+    PositiveFArrayType,
     PositiveFloat,
     PositiveInt,
+    PositiveVec2FType,
+    UnitIntervalVec3Type,
     UnitIntervalVec4Type,
-    UnitVec3FArrayType,
-    UnitVec3FType,
-    Vec3FArrayType,
+    Vec2FType,
 )
 
 from .options import (
     ProbeSensorOptionsMixin,
     ProbesWithNormalSensorOptionsMixin,
     RigidSensorOptionsMixin,
+    SensorOptions,
     SensorT,
     SimpleSensorOptions,
+    _check_len_match,
 )
 
 if TYPE_CHECKING:
@@ -33,11 +38,46 @@
         ProximityTaxelSensor,
     )
 
-    Vec3FGridType = Sequence[Sequence[Sequence[NumericType]]]
-    UnitVec3FGridType = Sequence[Sequence[Sequence[NumericType]]]
-else:
-    Vec3FGridType = Annotated[tuple[Vec3FArrayType, ...], Field(min_length=1, strict=False)]
-    UnitVec3FGridType = Annotated[tuple[UnitVec3FArrayType, ...], Field(min_length=1, strict=False)]
+
+def _validate_filler_probe_radius(probe_radius, sensor_name: str) -> None:
+    """
+    Validate a ``probe_radius`` that permits 0-valued (inactive padding for grid) entries.
+    """
+    radii = np.atleast_1d(np.asarray(probe_radius, dtype=float))
+    if np.any(radii < 0.0):
+        gs.raise_exception(f"{sensor_name} probe_radius entries must be non-negative. Got {probe_radius}.")
+    if not np.any(radii > 0.0):
+        gs.raise_exception(f"{sensor_name} requires at least one positive probe_radius. Got {probe_radius}.")
+
+
+class ViscoelasticHysteresisOptionsMixin(SensorOptions[SensorT]):
+    """
+    Single-Maxwell viscoelastic hysteresis applied on the measured branch only.
+
+    Output equals ``x + hysteresis_strength * xi``, where ``xi`` is a per-cache-column state with
+    ``xi_k = exp(-dt / hysteresis_tau) * xi_{k-1} + (x_k - x_{k-1})``. Equilibrium gain is 1 (steady-state output =
+    steady-state input). On a step input, output transiently overshoots by ``strength``, decaying with time constant
+    ``tau``. On cyclic input this gives a loading-unloading loop in output-vs-input space.
+
+    Parameters
+    ----------
+    hysteresis_strength : float, optional
+        Dimensionless ratio of the Maxwell branch to the equilibrium branch (``E_1 / E_inf`` with ``E_inf = 1``).
+        ``0`` disables hysteresis. Default ``0``.
+    hysteresis_tau : float, optional
+        Relaxation time constant in seconds. Must be positive when ``hysteresis_strength > 0``.
+    """
+
+    hysteresis_strength: NonNegativeFloat = 0.0
+    hysteresis_tau: NonNegativeFloat = 0.0
+
+    def model_post_init(self, context: Any) -> None:
+        super().model_post_init(context)
+        if self.hysteresis_strength > 0.0 and self.hysteresis_tau <= 0.0:
+            gs.raise_exception(
+                f"hysteresis_tau ({self.hysteresis_tau}) must be > 0 when hysteresis_strength "
+                f"({self.hysteresis_strength}) > 0."
+            )
 
 
 class TactileProbeSensorOptionsMixin(ProbeSensorOptionsMixin[SensorT]):
@@ -51,11 +91,45 @@ class TactileProbeSensorOptionsMixin(ProbeSensorOptionsMixin[SensorT]):
 
     Parameters
     ----------
-    debug_contact_color: array-like[float, float, float, float]
-        The color of the debug contact. Defaults to (1.0, 0.2, 0.0, 0.8).
+    debug_contact_color: array-like[float, float, float]
+        RGB color of the debug probe spheres while in contact.
+    probe_gain : float | array-like[float], optional
+        Per-taxel multiplicative gain applied to the measured-branch contact depth. Default ``1.0`` (no gain). Accepts
+        a scalar (applied to all probes) or an array matching the probe count. Force/torque scale as
+        ``gain**normal_exponent`` because the spring-damper sees the gained depth.
+    probe_gain_resample_range : (float, float), optional
+        If set, the per-probe gain is resampled uniformly in ``(low, high)`` on every ``scene.reset()``. Disables the
+        static ``probe_gain`` after the first reset. Default ``None`` (no resampling; gain stays at initial value).
+    dead_taxel_probability : float, optional
+        Per-probe Bernoulli probability that the taxel becomes dead on each ``scene.reset()``. Default ``0.0``
+        (no dead taxels). When set, the intermediate-cache value for dead probes is overwritten by a fresh
+        per-channel uniform sample in ``dead_taxel_value_range`` at the hardware-imperfections stage; the GT branch
+        is untouched.
+    dead_taxel_value_range : (float, float), optional
+        Uniform range for the dead value sampled per channel on reset. Default ``(0.0, 0.0)``.
     """
 
-    debug_contact_color: UnitIntervalVec4Type = (1.0, 0.2, 0.0, 0.8)
+    debug_contact_color: UnitIntervalVec3Type = (1.0, 0.2, 0.0)
+
+    probe_gain: PositiveFArrayType | PositiveFloat = 1.0
+    probe_gain_resample_range: PositiveVec2FType | None = None
+    dead_taxel_probability: NonNegativeFloat = 0.0
+    dead_taxel_value_range: Vec2FType = (0.0, 0.0)
+
+    def model_post_init(self, context: Any) -> None:
+        super().model_post_init(context)
+        n_probes = int(np.prod(np.asarray(self.probe_local_pos).shape[:-1]))
+        _check_len_match(self.probe_gain, n_probes, "probe_gain", "probe_local_pos")
+
+        if self.probe_gain_resample_range is not None:
+            low, high = float(self.probe_gain_resample_range[0]), float(self.probe_gain_resample_range[1])
+            if low > high:
+                gs.raise_exception(f"probe_gain_resample_range must satisfy low <= high. Got ({low}, {high}).")
+        if self.dead_taxel_probability > 1.0:
+            gs.raise_exception(f"dead_taxel_probability must be in [0, 1]. Got {self.dead_taxel_probability}.")
+        low, high = float(self.dead_taxel_value_range[0]), float(self.dead_taxel_value_range[1])
+        if low > high:
+            gs.raise_exception(f"dead_taxel_value_range must satisfy low <= high. Got ({low}, {high}).")
 
 
 class PointCloudTactileSensorMixin(TactileProbeSensorOptionsMixin[SensorT]):
@@ -86,23 +160,44 @@ class ContactProbe(
     RigidSensorOptionsMixin["ContactProbeSensor"],
     SimpleSensorOptions["ContactProbeSensor"],
     TactileProbeSensorOptionsMixin["ContactProbeSensor"],
+    ViscoelasticHysteresisOptionsMixin["ContactProbeSensor"],
 ):
     """
     Returns boolean contact per probe based on the contact depth threshold.
 
     Parameters
     ----------
+    probe_radius : float | array-like[float] or shape ``(M, N)`` grid
+        Probe sensing radius in meters. A scalar is shared by every probe; an array (or grid) must match the
+        layout of ``probe_local_pos``. Array entries of ``0`` mark inactive filler probes -- they always read
+        ``False`` and skip the SDF query -- so an irregular taxel set can be padded into a regular grid.
     contact_threshold: float
-        A probe is considered in contact if the penetration depth is greater than or equal to this threshold (meters).
+        Penetration depth (meters) at or above which a probe latches into contact.
+    release_threshold: float, optional
+        Penetration depth (meters) at or below which a latched probe releases (Schmitt-trigger hysteresis). Must be
+        ``<= contact_threshold``. Defaults to ``contact_threshold`` (no hysteresis).
     """
 
+    # Permits 0-valued (inactive filler) entries; see _validate_filler_probe_radius.
+    probe_radius: PositiveFloat | FArrayType | FGridType = 0.01
+
     contact_threshold: NonNegativeFloat = 0.0001
+    release_threshold: NonNegativeFloat | None = None
+
+    def model_post_init(self, context: Any) -> None:
+        super().model_post_init(context)
+        _validate_filler_probe_radius(self.probe_radius, "ContactProbe")
+        if self.release_threshold is not None and self.release_threshold > self.contact_threshold:
+            gs.raise_exception(
+                f"release_threshold ({self.release_threshold}) must be <= contact_threshold ({self.contact_threshold})."
+            )
 
 
 class ContactDepthProbe(
     RigidSensorOptionsMixin["ContactDepthProbeSensor"],
     SimpleSensorOptions["ContactDepthProbeSensor"],
     TactileProbeSensorOptionsMixin["ContactDepthProbeSensor"],
+    ViscoelasticHysteresisOptionsMixin["ContactDepthProbeSensor"],
 ):
     """
     Returns contact depth in meters per probe.
@@ -114,6 +209,7 @@ class KinematicTaxel(
     SimpleSensorOptions["KinematicTaxelSensor"],
     TactileProbeSensorOptionsMixin["KinematicTaxelSensor"],
     ProbesWithNormalSensorOptionsMixin["KinematicTaxelSensor"],
+    ViscoelasticHysteresisOptionsMixin["KinematicTaxelSensor"],
 ):
     """
     A tactile sensor which estimates force and torque per taxel by querying contact depth relative to given probe
@@ -132,8 +228,16 @@ class KinematicTaxel(
     ----
     If this sensor is attached to a fixed entity, it will not detect contacts with other fixed entities.
 
+    ``probe_local_pos`` may be either an arbitrary set of probes with shape ``(N, 3)`` or a grid-shaped set with shape
+    ``(M, N, 3)``. Regular planar grids enable FFT-based spatial crosstalk on the measured branch (see
+    ``crosstalk_strength``). A probe whose ``probe_radius`` is 0 is treated as an inactive filler -- it reads 0
+    force/torque and is skipped -- so an irregular taxel set can be padded into a regular grid for FFT crosstalk.
+
     Parameters
     ----------
+    probe_radius : float | array-like[float]
+        Probe sensing radius in meters. A scalar is shared by every probe; an array must match the probe count.
+        Array entries of 0 mark inactive filler probes (see the grid note above); at least one must be positive.
     normal_stiffness : float
         Stiffness for normal force estimation based on contact penetration depth and spring-damper model.
     normal_damping : float
@@ -145,19 +249,38 @@ class KinematicTaxel(
         Coefficient for shear force estimation based on relative linear velocity of the probe and entity in contact.
     twist_scalar : float, optional
         Coefficient for twist torque estimation based on relative angular velocity of the probe and entity in contact.
+    crosstalk_strength : float, optional
+        Spatial crosstalk mixing fraction applied on the measured branch. ``0`` (default) disables; ``1`` is pure
+        Gaussian blur with sigma ``crosstalk_sigma``. Requires a validated regular grid layout for
+        ``probe_local_pos`` and ``crosstalk_sigma > 0``.
+    crosstalk_sigma : float, optional
+        Gaussian crosstalk standard deviation in meters (same units as ``probe_local_pos`` spacing). Must be > 0
+        when ``crosstalk_strength > 0``.
     """
 
+    # Permits 0-valued (inactive filler) entries; see _validate_filler_probe_radius.
+    probe_radius: PositiveFloat | FArrayType | FGridType = 0.01
+
     normal_stiffness: NonNegativeFloat = 1000.0
     normal_damping: NonNegativeFloat = 1.0
     normal_exponent: NonNegativeFloat = 1.0
     shear_scalar: NonNegativeFloat = 1.0
     twist_scalar: NonNegativeFloat = 1.0
 
+    crosstalk_strength: NonNegativeFloat = 0.0
+    crosstalk_sigma: NonNegativeFloat = 0.0
+
     def model_post_init(self, context: Any) -> None:
         super().model_post_init(context)
 
+        _validate_filler_probe_radius(self.probe_radius, "KinematicTaxel")
         if self.normal_exponent < 1.0:
             gs.raise_exception(f"normal_exponent must be greater than or equal to 1.0. Got {self.normal_exponent}.")
+        if self.crosstalk_strength > 0.0 and self.crosstalk_sigma <= 0.0:
+            gs.raise_exception(
+                f"crosstalk_sigma ({self.crosstalk_sigma}) must be > 0 when crosstalk_strength "
+                f"({self.crosstalk_strength}) > 0."
+            )
 
 
 class ElastomerTaxel(
@@ -165,6 +288,7 @@ class ElastomerTaxel(
     SimpleSensorOptions["ElastomerTaxelSensor"],
     PointCloudTactileSensorMixin["ElastomerTaxelSensor"],
     ProbesWithNormalSensorOptionsMixin["ElastomerTaxelSensor"],
+    ViscoelasticHysteresisOptionsMixin["ElastomerTaxelSensor"],
 ):
     """
     An elastomer tactile sensor that implements HydroShear-style marker displacement from Genesis SDF queries.
@@ -175,7 +299,17 @@ class ElastomerTaxel(
     ----
     ``probe_local_pos`` may be either an arbitrary set of probes with shape ``(N, 3)`` or a grid-shaped set with shape
     ``(M, N, 3)``. Regular planar grids with one shared normal use FFT acceleration for dilation; other layouts use the
-    direct dilation path. Shear is computed directly.
+    direct dilation path. Shear is computed directly. A probe whose ``probe_radius`` is 0 is treated as an inactive
+    filler -- it reads 0 and is excluded from dilation/shear -- so an irregular taxel set can be padded into a
+    regular grid for FFT acceleration.
+
+    Note
+    ----
+    ``probe_gain`` is applied to ElastomerTaxel as a post-step linear scale of the measured marker displacement
+    (the dilation kernel writes a single shared field for both branches). This is exact for the tangential
+    dilation and shear components but approximate for the normal dilation term, which scales as
+    ``depth**normal_exponent`` and would ideally scale as ``gain**normal_exponent`` rather than ``gain``. For
+    gains near 1 the error is small.
 
     Parameters
     ----------
@@ -184,18 +318,29 @@ class ElastomerTaxel(
     probe_local_normal : array-like[float, float, float] or array-like[array-like[float, float, float]]
         Unit direction(s) in link-local frame: one normal for all probes, or one normal per probe matching
         ``probe_local_pos``.
+    probe_radius : float | array-like[float]
+        Probe sensing radius in meters. A scalar is shared by every probe; an array must match the probe count.
+        Array entries of 0 mark inactive filler probes (see the grid note above); at least one must be positive.
     track_link_idx : array-like[int]
         Global rigid link indices whose collision geometry is queried by SDF and whose mesh is sampled for shear.
     n_sample_points: int | array-like[int]
         Total surface samples split across ``track_link_idx``, or one count per tracked link.
     lambda_d: float
-        Exponential coefficient for dilation spread.
+        Gaussian falloff coefficient (in 1/m^2) for the dilation kernel ``exp(-lambda_d * r^2)`` that smears each
+        in-contact probe's normal/tangential bulge across its neighbors. Larger values give sharper, more localized
+        markers; smaller values smear the bulge across more probes.
     lambda_s: float
-        Exponential coefficient for shear spread from tracked surface points.
+        Gaussian falloff coefficient (in 1/m^2) for the shear kernel ``exp(-lambda_s * r^2)`` that spreads each
+        anchored tracked-surface point's tangential displacement to nearby probes. Larger values keep shear tightly
+        local to the contact patch; smaller values produce a softer, more diffuse shear response.
     dilate_scale: float
         Scalar gain applied to dilation displacement.
     shear_scale: float
         Scalar gain applied to shear displacement.
+    normal_exponent: float
+        Exponent of the penetration-depth power law for the normal (out-of-plane) marker dilation: the normal
+        bulge scales as ``depth ** normal_exponent``. Must be >= 1.0. Default ``2.0`` (the HydroShear quadratic
+        normal response). Tangential dilation and shear stay linear in depth regardless of this value.
     elastomer_contact_sdf_enter: float
         Positive margin on signed distance: a tracked surface point starts anchoring shear when its elastomer SDF
         value is below ``-elastomer_contact_sdf_enter``.
@@ -210,21 +355,25 @@ class ElastomerTaxel(
     should represent the compliant contact surface.
     """
 
-    probe_local_pos: Vec3FArrayType | Vec3FGridType = ((0.0, 0.0, 0.0),)
-    probe_local_normal: UnitVec3FArrayType | UnitVec3FGridType | UnitVec3FType = (0.0, 0.0, 1.0)
+    # Permits 0-valued (inactive filler) entries; see _validate_filler_probe_radius.
+    probe_radius: PositiveFloat | FArrayType | FGridType = 0.01
 
     lambda_d: NonNegativeFloat = 700.0
     lambda_s: NonNegativeFloat = 300.0
     dilate_scale: NonNegativeFloat = 1.0
     shear_scale: NonNegativeFloat = 1.0
+    normal_exponent: NonNegativeFloat = 2.0
 
     elastomer_contact_sdf_enter: NonNegativeFloat = 1e-5
     elastomer_contact_sdf_exit: NonNegativeFloat = 1e-4
 
     def model_post_init(self, context: Any) -> None:
         super().model_post_init(context)
+        _validate_filler_probe_radius(self.probe_radius, "ElastomerTaxel")
         if len(self.track_link_idx) == 0:
             gs.raise_exception("ElastomerTaxel requires at least one tracked link in track_link_idx.")
+        if self.normal_exponent < 1.0:
+            gs.raise_exception(f"normal_exponent must be greater than or equal to 1.0. Got {self.normal_exponent}.")
 
 
 class ProximityTaxel(
@@ -232,6 +381,7 @@ class ProximityTaxel(
     SimpleSensorOptions["ProximityTaxelSensor"],
     PointCloudTactileSensorMixin["ProximityTaxelSensor"],
     ProbesWithNormalSensorOptionsMixin["ProximityTaxelSensor"],
+    ViscoelasticHysteresisOptionsMixin["ProximityTaxelSensor"],
 ):
     """
     A tactile sensor which estimates force and torque per taxel from proximity to point clouds sampled on tracked
diff --git a/genesis/typing.py b/genesis/typing.py
index ec79a39f85..e852376f53 100644
--- a/genesis/typing.py
+++ b/genesis/typing.py
@@ -104,6 +104,10 @@ def __get_pydantic_core_schema__(cls, source_type: Any, handler: GetCoreSchemaHa
     UnitVec3FArrayType = Vec3FArrayType
     Vec3FLaxArrayType = Vec3FArrayType | Vec3FType
     UnitVec3FLaxArrayType = Vec3FLaxArrayType
+    FGridType = Sequence[Sequence[NumericType]] | np.ndarray
+    PositiveFGridType = FGridType
+    Vec3FGridType = Sequence[Sequence[Sequence[NumericType]]] | np.ndarray
+    UnitVec3FGridType = Vec3FGridType
     RotationMatrixType = Vec3FArrayType
     Matrix3x3Type = Sequence[Sequence[NumericType]] | np.ndarray
     Matrix4x4Type = Sequence[Sequence[NumericType]] | np.ndarray
@@ -165,6 +169,10 @@ def __get_pydantic_core_schema__(cls, source_type: Any, handler: GetCoreSchemaHa
     StrArrayType = Annotated[tuple[str, ...], Field(strict=False)]
     Vec3FArrayType = Annotated[tuple[Vec3FType, ...], Field(min_length=1, strict=False)]
     UnitVec3FArrayType = Annotated[tuple[UnitVec3FType, ...], Field(min_length=1, strict=False)]
+    FGridType = Annotated[tuple[FArrayType, ...], Field(min_length=1, strict=False)]
+    PositiveFGridType = Annotated[tuple[PositiveFArrayType, ...], Field(min_length=1, strict=False)]
+    Vec3FGridType = Annotated[tuple[Vec3FArrayType, ...], Field(min_length=1, strict=False)]
+    UnitVec3FGridType = Annotated[tuple[UnitVec3FArrayType, ...], Field(min_length=1, strict=False)]
     Vec3FLaxArrayType = Annotated[
         tuple[Vec3FType, ...],
         BeforeValidator(lambda v: v if is_sequence(v) and len(v) > 0 and is_sequence(v[0]) else (v,)),
diff --git a/tests/test_sensors.py b/tests/test_sensors.py
index 4e1c9043a1..1614012a3e 100644
--- a/tests/test_sensors.py
+++ b/tests/test_sensors.py
@@ -1682,11 +1682,6 @@ def test_surface_distance_sensor_box_sphere(show_viewer, tol, n_envs):
     )
 
 
-def _as_env_batch(data, n_envs: int) -> torch.Tensor:
-    data = torch.as_tensor(data, device=gs.device)
-    return data.unsqueeze(0) if n_envs == 0 else data
-
-
 # ------------------------------------------------------------------------------------------
 # ----------------------------------- Tactile Sensors --------------------------------------
 # ------------------------------------------------------------------------------------------
@@ -1702,6 +1697,7 @@ def test_kinematic_contact_probe_box_sphere_support(show_viewer, tol, n_envs):
     CONTACT_THRESHOLD = 0.002
     STIFFNESS = 100.0
     SPHERE_RADIUS = 0.1
+    GAIN = 1.5
 
     scene = gs.Scene(
         sim_options=gs.options.SimOptions(
@@ -1758,23 +1754,43 @@ def test_kinematic_contact_probe_box_sphere_support(show_viewer, tol, n_envs):
             **common_kwargs,
         )
     )
-    depth_probe = scene.add_sensor(gs.sensors.ContactDepthProbe(**common_kwargs))
+    depth_probe = scene.add_sensor(
+        gs.sensors.ContactDepthProbe(
+            **common_kwargs,
+        ),
+    )
     noisy_radius_depth_probe = scene.add_sensor(
         gs.sensors.ContactDepthProbe(
             probe_radius_noise=0.25,
             **common_kwargs,
         )
     )
-    taxel = scene.add_sensor(
-        gs.sensors.KinematicTaxel(
-            probe_local_normal=probe_normals,
-            normal_stiffness=STIFFNESS,
-            normal_damping=0.0,
-            shear_scalar=0.0,
-            twist_scalar=0.0,
+    # probe_gain variants: depth/force should scale by the gain on the measured branch only.
+    gained_depth_probe = scene.add_sensor(
+        gs.sensors.ContactDepthProbe(
+            probe_gain=GAIN,
             **common_kwargs,
         )
     )
+    taxel_kwargs = dict(
+        probe_local_normal=probe_normals,
+        normal_stiffness=STIFFNESS,
+        normal_damping=0.0,
+        shear_scalar=0.0,
+        twist_scalar=0.0,
+        **common_kwargs,
+    )
+    taxel = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            **taxel_kwargs,
+        ),
+    )
+    gained_taxel = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            probe_gain=GAIN,
+            **taxel_kwargs,
+        ),
+    )
     sphere_taxel = scene.add_sensor(
         gs.sensors.KinematicTaxel(
             entity_idx=sphere.idx,
@@ -1792,13 +1808,13 @@ def test_kinematic_contact_probe_box_sphere_support(show_viewer, tol, n_envs):
     scene.build(n_envs=n_envs)
     scene.step()
 
-    depth = _as_env_batch(depth_probe.read_ground_truth(), n_envs)
-    contact = _as_env_batch(contact_probe.read_ground_truth(), n_envs)
-    force = _as_env_batch(taxel.read_ground_truth().force, n_envs)
-    torque = _as_env_batch(taxel.read_ground_truth().torque, n_envs)
+    depth = depth_probe.read_ground_truth()
+    contact = contact_probe.read_ground_truth()
+    force = taxel.read_ground_truth().force
+    torque = taxel.read_ground_truth().torque
 
     assert_equal(contact, depth > CONTACT_THRESHOLD)
-    assert _as_env_batch(noisy_radius_depth_probe.read(), n_envs).shape == depth.shape
+    assert noisy_radius_depth_probe.read().shape == depth.shape
     # Check that the box's bottom probe (idx 3) detects the ground.
     assert (depth[..., 3] > tol).all(), "Bottom probe should detect the ground."
     assert (force[..., 3, 2] > tol).all(), "Bottom taxel force should point upward."
@@ -1811,15 +1827,25 @@ def test_kinematic_contact_probe_box_sphere_support(show_viewer, tol, n_envs):
     expected_normals = -torch.tensor(probe_normals, dtype=gs.tc_float, device=gs.device)
     assert_allclose(force, depth.unsqueeze(-1) * STIFFNESS * expected_normals, tol=tol)
 
+    # probe_gain scales the measured branch only; GT is untouched. normal_exponent defaults to 1, so the measured
+    # force is linear in the gained depth and scales by the same factor.
+    gained_depth = gained_depth_probe.read()
+    gained_force = gained_taxel.read().force
+    assert (depth[..., 3] > tol).all()  # sanity: the bottom probe is in contact
+    assert_allclose(gained_depth[..., 3], depth[..., 3] * GAIN, tol=tol)
+    assert_allclose(gained_depth_probe.read_ground_truth(), depth, tol=gs.EPS)
+    assert_allclose(gained_force[..., 3, :], force[..., 3, :] * GAIN, tol=tol)
+    assert_allclose(gained_taxel.read_ground_truth().force, force, tol=gs.EPS)
+
     # Now position the sphere to penetrate the top of the box.
     box_top_z = BOX_SIZE - PENETRATION
     sphere.set_pos((0.0, 0.0, box_top_z + SPHERE_RADIUS - PENETRATION))
     scene.step()
 
-    depth = _as_env_batch(depth_probe.read_ground_truth(), n_envs)
-    contact = _as_env_batch(contact_probe.read_ground_truth(), n_envs)
-    force = _as_env_batch(taxel.read_ground_truth().force, n_envs)
-    sphere_force = _as_env_batch(sphere_taxel.read_ground_truth().force, n_envs)
+    depth = depth_probe.read_ground_truth()
+    contact = contact_probe.read_ground_truth()
+    force = taxel.read_ground_truth().force
+    sphere_force = sphere_taxel.read_ground_truth().force
 
     assert_equal(contact, depth > CONTACT_THRESHOLD)
     assert (depth[..., 0] > tol).all(), "Top center probe should detect the sphere."
@@ -1834,6 +1860,377 @@ def test_kinematic_contact_probe_box_sphere_support(show_viewer, tol, n_envs):
     assert_allclose(sphere_taxel.read_ground_truth().force, 0.0, tol=gs.EPS)
 
 
+@pytest.mark.required
+def test_contact_probe_hysteresis(show_viewer):
+    """ContactProbe with release_threshold < contact_threshold latches like a Schmitt trigger.
+
+    Depth-probe semantics: ``depth = probe_radius - sd(probe, geom)``. With the probe placed at the box center
+    (link-local origin) and the box descending into the ground plane, ``sd = box.z`` and
+    ``depth = probe_radius - box.z``, giving smooth control over the reported depth.
+    """
+    n_envs = 0
+    BOX_SIZE = 0.2
+    # Place probe 0.05m above the box bottom; reported depth = probe_radius - probe.z. With probe_radius = 0.060,
+    # depth = 0.010 at zero penetration and grows linearly with penetration p.
+    PROBE_LOCAL_Z = -BOX_SIZE / 2 + 0.05
+    PROBE_RADIUS = 0.060
+    ENTER = 0.030  # triggered at p ≈ 0.020
+    RELEASE = 0.015  # triggered at p ≈ 0.005
+
+    # box.z values; box.z = BOX_SIZE/2 - p gives penetration p.
+    BOX_Z_OFF = 1.0  # well above plane → no contact → depth = 0
+    BOX_Z_BELOW_RELEASE = 0.099  # p = 0.001 → depth = 0.011 (< RELEASE)
+    BOX_Z_IN_BAND = 0.090  # p = 0.010 → depth = 0.020 (RELEASE < d < ENTER)
+    BOX_Z_ABOVE_ENTER = 0.070  # p = 0.030 → depth = 0.040 (> ENTER)
+
+    scene = gs.Scene(
+        sim_options=gs.options.SimOptions(gravity=(0.0, 0.0, 0.0)),
+        profiling_options=gs.options.ProfilingOptions(show_FPS=False),
+        show_viewer=show_viewer,
+    )
+    scene.add_entity(gs.morphs.Plane())
+    box = scene.add_entity(
+        gs.morphs.Box(
+            size=(BOX_SIZE, BOX_SIZE, BOX_SIZE),
+            pos=(0.0, 0.0, BOX_Z_OFF),
+            fixed=False,
+        ),
+    )
+
+    common = dict(
+        entity_idx=box.idx,
+        probe_local_pos=((0.0, 0.0, PROBE_LOCAL_Z),),
+        probe_radius=PROBE_RADIUS,
+        draw_debug=show_viewer,
+    )
+    hyst_probe = scene.add_sensor(
+        gs.sensors.ContactProbe(
+            contact_threshold=ENTER,
+            release_threshold=RELEASE,
+            **common,
+        ),
+    )
+    plain_probe = scene.add_sensor(
+        gs.sensors.ContactProbe(
+            contact_threshold=ENTER,
+            **common,
+        ),
+    )
+
+    scene.build(n_envs=n_envs)
+
+    def step_at(box_z):
+        box.set_pos((0.0, 0.0, box_z))
+        scene.step()
+        h = hyst_probe.read_ground_truth()
+        p = plain_probe.read_ground_truth()
+        return h.reshape(-1), p.reshape(-1)
+
+    # 1. No contact.
+    h, p = step_at(BOX_Z_OFF)
+    assert not h.any() and not p.any()
+
+    # 2. Depth in band before any latch: both False (not latched).
+    h, p = step_at(BOX_Z_IN_BAND)
+    assert not h.any() and not p.any()
+
+    # 3. Depth above enter: both latch True.
+    h, p = step_at(BOX_Z_ABOVE_ENTER)
+    assert h.all() and p.all()
+
+    # 4. Lift to band: hyst stays latched, plain releases (depth < enter).
+    h, p = step_at(BOX_Z_IN_BAND)
+    assert h.all() and not p.any()
+
+    # 5. Lift to below release: hyst clears.
+    h, p = step_at(BOX_Z_BELOW_RELEASE)
+    assert not h.any() and not p.any()
+
+    # 6. Back into band: still False (not latched).
+    h, p = step_at(BOX_Z_IN_BAND)
+    assert not h.any() and not p.any()
+
+    # 7. Reset clears latch even if depth is in band.
+    step_at(BOX_Z_ABOVE_ENTER)
+    scene.reset()
+    h, p = step_at(BOX_Z_IN_BAND)
+    assert not h.any() and not p.any()
+
+
+@pytest.mark.required
+def test_contact_depth_probe_viscoelastic_hysteresis(show_viewer):
+    """ContactDepthProbe with hysteresis_strength > 0 overshoots GT after a step input and relaxes back to it.
+
+    Single-Maxwell model with equilibrium gain 1: ``output = x + strength * xi``, ``xi`` decays with time constant
+    ``tau``. After a step from 0 to D, measured value jumps to D*(1+strength) and decays back to D. GT is untouched.
+    """
+    n_envs = 0
+    BOX_SIZE = 0.2
+    PROBE_LOCAL_Z = -BOX_SIZE / 2 + 0.05
+    PROBE_RADIUS = 0.060
+    STRENGTH = 0.5
+    DT = 0.01
+    TAU = 0.05  # alpha = exp(-dt/tau) ≈ 0.819
+    ALPHA = np.exp(-DT / TAU)
+
+    BOX_Z_OFF = 1.0
+    BOX_Z_ON = 0.080  # p = 0.020, depth = 0.030 in steady state
+
+    scene = gs.Scene(
+        sim_options=gs.options.SimOptions(gravity=(0.0, 0.0, 0.0), dt=DT),
+        profiling_options=gs.options.ProfilingOptions(show_FPS=False),
+        show_viewer=show_viewer,
+    )
+    scene.add_entity(gs.morphs.Plane())
+    box = scene.add_entity(
+        gs.morphs.Box(
+            size=(BOX_SIZE, BOX_SIZE, BOX_SIZE),
+            pos=(0.0, 0.0, BOX_Z_OFF),
+            fixed=False,
+        ),
+    )
+    common = dict(
+        entity_idx=box.idx,
+        probe_local_pos=((0.0, 0.0, PROBE_LOCAL_Z),),
+        probe_radius=PROBE_RADIUS,
+        draw_debug=show_viewer,
+    )
+    hyst = scene.add_sensor(
+        gs.sensors.ContactDepthProbe(
+            hysteresis_strength=STRENGTH,
+            hysteresis_tau=TAU,
+            **common,
+        ),
+    )
+    plain = scene.add_sensor(
+        gs.sensors.ContactDepthProbe(
+            **common,
+        ),
+    )
+
+    scene.build(n_envs=n_envs)
+
+    def step_at(z):
+        box.set_pos((0.0, 0.0, z))
+        scene.step()
+        return (
+            hyst.read().reshape(-1),
+            hyst.read_ground_truth().reshape(-1),
+            plain.read().reshape(-1),
+        )
+
+    # Step 1: no contact. All zero.
+    h_m, h_gt, p_m = step_at(BOX_Z_OFF)
+    assert_allclose(h_m, 0.0, tol=gs.EPS)
+    assert_allclose(h_gt, 0.0, tol=gs.EPS)
+    assert_allclose(p_m, 0.0, tol=gs.EPS)
+
+    # Step 2: jump to BOX_Z_ON. GT should equal plain measured (both = D). Hyst measured = D*(1+strength).
+    h_m, h_gt, p_m = step_at(BOX_Z_ON)
+    D = float(h_gt[0].item())
+    assert D > 0.02  # sanity
+    assert_allclose(p_m, D, tol=1e-5)
+    assert_allclose(h_m, D * (1.0 + STRENGTH), tol=1e-4)
+
+    # Subsequent steps at the same depth: xi decays by ALPHA each step, measured = D + strength * D * ALPHA^k.
+    for k in range(1, 5):
+        h_m, h_gt, p_m = step_at(BOX_Z_ON)
+        assert_allclose(h_gt, D, tol=1e-5)  # GT untouched.
+        assert_allclose(p_m, D, tol=1e-5)  # Plain untouched.
+        expected = D * (1.0 + STRENGTH * (ALPHA**k))
+        assert_allclose(h_m, expected, tol=1e-4)
+
+    # Reset clears xi: a single step at depth D should overshoot exactly like step 2.
+    scene.reset()
+    box.set_pos((0.0, 0.0, BOX_Z_OFF))
+    scene.step()
+    h_m, h_gt, p_m = step_at(BOX_Z_ON)
+    assert_allclose(h_m, D * (1.0 + STRENGTH), tol=1e-4)
+
+
+@pytest.mark.required
+def test_probe_gain_and_dead_resample(show_viewer):
+    """probe_gain_resample_range and dead_taxel_probability redraw their per-(env, probe) randomness on every
+    scene.reset(): the gained sensor's measured depth carries a per-env gain in range, and the dead sensor's
+    measured value is overwritten by a per-env sample in range. GT is untouched, and both redraw on the next
+    reset."""
+    BOX_SIZE = 0.2
+    PROBE_LOCAL_Z = -BOX_SIZE / 2 + 0.05
+    PROBE_RADIUS = 0.060
+    GAIN_LOW, GAIN_HIGH = 0.5, 1.5
+    DEAD_LOW, DEAD_HIGH = 0.123, 0.456
+    BOX_Z = 0.080  # box descends into the plane so the real contact depth is non-zero
+    N_ENVS = 8
+
+    scene = gs.Scene(
+        sim_options=gs.options.SimOptions(gravity=(0.0, 0.0, 0.0)),
+        profiling_options=gs.options.ProfilingOptions(show_FPS=False),
+        show_viewer=show_viewer,
+    )
+    scene.add_entity(gs.morphs.Plane())
+    box = scene.add_entity(
+        gs.morphs.Box(
+            size=(BOX_SIZE, BOX_SIZE, BOX_SIZE),
+            pos=(0.0, 0.0, 1.0),
+            fixed=False,
+        ),
+    )
+    common = dict(
+        entity_idx=box.idx,
+        probe_local_pos=((0.0, 0.0, PROBE_LOCAL_Z),),
+        probe_radius=PROBE_RADIUS,
+        draw_debug=show_viewer,
+    )
+    gain_sensor = scene.add_sensor(
+        gs.sensors.ContactDepthProbe(
+            probe_gain_resample_range=(GAIN_LOW, GAIN_HIGH),
+            **common,
+        ),
+    )
+    dead_sensor = scene.add_sensor(
+        gs.sensors.ContactDepthProbe(
+            dead_taxel_probability=1.0,
+            dead_taxel_value_range=(DEAD_LOW, DEAD_HIGH),
+            **common,
+        ),
+    )
+
+    scene.build(n_envs=N_ENVS)
+
+    def reset_step_read():
+        scene.reset()  # triggers the per-(env, probe) resample of gain and dead state
+        box.set_pos([[0.0, 0.0, BOX_Z]] * N_ENVS)
+        scene.step()
+        gains = (gain_sensor.read() / gain_sensor.read_ground_truth()).reshape(-1).cpu()
+        dead = dead_sensor.read().reshape(-1).cpu()
+        return gains, dead
+
+    gains_a, dead_a = reset_step_read()
+    # Gain stays in range, dead values are overwritten in range, and both vary across the 8 envs.
+    assert torch.all((gains_a >= GAIN_LOW - 1e-5) & (gains_a <= GAIN_HIGH + 1e-5))
+    assert torch.all((dead_a >= DEAD_LOW - 1e-5) & (dead_a <= DEAD_HIGH + 1e-5))
+    assert gains_a.std().item() > 0.01 and dead_a.std().item() > 0.01
+    # The dead sensor's GT is untouched -- it still reports the real (non-zero) contact depth.
+    assert torch.all(dead_sensor.read_ground_truth().reshape(-1) > 0.0)
+
+    # A second reset redraws both.
+    gains_b, dead_b = reset_step_read()
+    assert not torch.allclose(gains_a, gains_b, atol=1e-3)
+    assert not torch.allclose(dead_a, dead_b, atol=1e-3)
+
+
+@pytest.mark.required
+def test_kinematic_taxel_crosstalk(show_viewer):
+    """Crosstalk smears the measured force field across grid neighbors while leaving GT unchanged; total normal
+    force is preserved (DC of the normalized kernel is 1). Also checks ``crosstalk_strength=0`` is the exact
+    no-crosstalk path and that a grid layout matches a flat layout at the identical probe positions."""
+    BOX_SIZE = 0.2
+    PROBE_RADIUS = 0.02
+    SPACING = 0.03
+    SPHERE_RADIUS = 0.025
+    BOX_BOTTOM_Z = 0.05
+    CROSSTALK_STRENGTH = 0.6
+    CROSSTALK_SIGMA = SPACING
+
+    ny, nx = 5, 5
+    grid_positions = np.zeros((ny, nx, 3), dtype=gs.np_float)
+    for iy in range(ny):
+        for ix in range(nx):
+            grid_positions[iy, ix] = ((ix - 2) * SPACING, (iy - 2) * SPACING, BOX_SIZE / 2)
+
+    scene = gs.Scene(
+        sim_options=gs.options.SimOptions(gravity=(0.0, 0.0, 0.0)),
+        profiling_options=gs.options.ProfilingOptions(show_FPS=False),
+        show_viewer=show_viewer,
+    )
+    box = scene.add_entity(
+        gs.morphs.Box(size=(BOX_SIZE, BOX_SIZE, BOX_SIZE), pos=(0.0, 0.0, BOX_BOTTOM_Z + BOX_SIZE / 2), fixed=True)
+    )
+    sphere = scene.add_entity(
+        gs.morphs.Sphere(
+            radius=SPHERE_RADIUS,
+            pos=(0.0, 0.0, BOX_BOTTOM_Z + BOX_SIZE + SPHERE_RADIUS - 0.010),
+            fixed=False,
+        )
+    )
+
+    common = dict(
+        entity_idx=box.idx,
+        probe_local_normal=(0.0, 0.0, 1.0),
+        probe_radius=PROBE_RADIUS,
+        normal_stiffness=100.0,
+        normal_damping=0.0,
+        shear_scalar=0.0,
+        twist_scalar=0.0,
+    )
+    plain = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            probe_local_pos=grid_positions.tolist(),
+            **common,
+        ),
+    )
+    crosstalk = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            probe_local_pos=grid_positions.tolist(),
+            crosstalk_strength=CROSSTALK_STRENGTH,
+            crosstalk_sigma=CROSSTALK_SIGMA,
+            **common,
+        )
+    )
+    # crosstalk_strength=0 must reproduce the no-crosstalk path exactly, even with a non-zero sigma.
+    crosstalk_off = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            probe_local_pos=grid_positions.tolist(), crosstalk_strength=0.0, crosstalk_sigma=0.05, **common
+        )
+    )
+    # Same probes laid out flat: per-probe GT must match the grid layout.
+    flat = scene.add_sensor(gs.sensors.KinematicTaxel(probe_local_pos=grid_positions.reshape(-1, 3).tolist(), **common))
+
+    scene.build(n_envs=0)
+    sphere.set_pos((0.0, 0.0, BOX_BOTTOM_Z + BOX_SIZE + SPHERE_RADIUS - 0.010))
+    scene.step()
+
+    plain_meas_force = plain.read().force
+    crosstalk_meas_force = crosstalk.read().force
+    plain_gt_force = plain.read_ground_truth().force
+    crosstalk_gt_force = crosstalk.read_ground_truth().force
+
+    # GT branch is untouched by crosstalk.
+    assert_allclose(crosstalk_gt_force, plain_gt_force, tol=gs.EPS)
+
+    # Plain measured equals GT (no transforms enabled on plain sensor).
+    assert_allclose(plain_meas_force, plain_gt_force, tol=gs.EPS)
+
+    plain_force_mag = torch.linalg.norm(plain_meas_force, dim=-1)
+    iy_c, ix_c = (plain_force_mag == plain_force_mag.max()).nonzero(as_tuple=False)[0].tolist()
+    assert (iy_c, ix_c) == (ny // 2, nx // 2)
+
+    crosstalk_force_mag = torch.linalg.norm(crosstalk_meas_force, dim=-1)
+    # Center magnitude on crosstalk sensor is reduced vs plain (energy redistributed).
+    assert crosstalk_force_mag[iy_c, ix_c] < plain_force_mag[iy_c, ix_c]
+    # A probe outside the contact patch (2 spacings from center) was ~zero on plain; crosstalk leaks force there.
+    plain_far = plain_force_mag[0, 0].item()
+    crosstalk_far = crosstalk_force_mag[0, 0].item()
+    assert plain_far < 1e-4, f"far probe should be ~zero on plain sensor (got {plain_far})"
+    assert crosstalk_far > 1e-4, f"far probe should pick up crosstalk leakage (got {crosstalk_far})"
+
+    # Total Fz across the grid is preserved up to Gaussian-tail leakage past the output slice boundary.
+    plain_total_fz = plain_meas_force[..., 2].sum().item()
+    crosstalk_total_fz = crosstalk_meas_force[..., 2].sum().item()
+    assert np.isclose(plain_total_fz, crosstalk_total_fz, rtol=5e-2, atol=1e-5), (
+        f"plain={plain_total_fz}, crosstalk={crosstalk_total_fz}"
+    )
+
+    # crosstalk_strength=0 is the exact no-crosstalk path (even with a non-zero sigma).
+    assert_allclose(crosstalk_off.read().force, plain_meas_force, tol=gs.EPS)
+    assert_allclose(crosstalk_off.read().torque, plain.read().torque, tol=gs.EPS)
+
+    # A grid layout produces the same per-probe GT as a flat layout at the identical positions.
+    flat_gt = flat.read_ground_truth()
+    assert_allclose(plain_gt_force.reshape(-1, 3), flat_gt.force, tol=gs.EPS)
+    assert_allclose(plain.read_ground_truth().torque.reshape(-1, 3), flat_gt.torque, tol=gs.EPS)
+
+
 @pytest.mark.required
 @pytest.mark.parametrize("n_envs", [0, 2])
 def test_elastomer_sensor_sphere_ground_dilate_shear(show_viewer, tol, n_envs):
@@ -1845,6 +2242,7 @@ def test_elastomer_sensor_sphere_ground_dilate_shear(show_viewer, tol, n_envs):
     N_RINGS = 3
     LATERAL_SHIFT = 0.01
     SHEAR_SCALE = 100.0
+    GAIN = 2.0
 
     scene = gs.Scene(
         sim_options=gs.options.SimOptions(
@@ -1912,26 +2310,40 @@ def test_elastomer_sensor_sphere_ground_dilate_shear(show_viewer, tol, n_envs):
             **sensor_kwargs,
         )
     )
+    # probe_gain variant: the measured marker displacement scales by the gain; GT is untouched.
+    gained_sensor = scene.add_sensor(
+        gs.sensors.ElastomerTaxel(
+            dilate_scale=1.0,
+            shear_scale=0.0,
+            probe_gain=GAIN,
+            **sensor_kwargs,
+        )
+    )
     assert not dilate_sensor._is_grid and not dilate_sensor._use_grid_fft
 
     scene.build(n_envs=n_envs)
     scene.step()
 
-    dilate_data = _as_env_batch(dilate_sensor.read_ground_truth(), n_envs)
-    shear_data = _as_env_batch(shear_sensor.read_ground_truth(), n_envs)
-    combined_data = _as_env_batch(combined_sensor.read_ground_truth(), n_envs)
+    dilate_data = dilate_sensor.read_ground_truth()
+    shear_data = shear_sensor.read_ground_truth()
+    combined_data = combined_sensor.read_ground_truth()
     normal_projection = (dilate_data * normals).sum(dim=-1)
     assert (normal_projection[..., 0] > tol).all(), "Bottom marker should dilate along its outward normal."
     assert torch.linalg.norm(dilate_data, dim=-1).max() > tol
     assert_allclose(shear_data, 0.0, tol=tol)
     assert_allclose(combined_data, dilate_data, tol=tol)
 
+    gained_meas = gained_sensor.read()
+    gained_gt = gained_sensor.read_ground_truth()
+    assert torch.linalg.norm(gained_gt, dim=-1).max() > tol  # sanity: in contact
+    assert_allclose(gained_meas, gained_gt * GAIN, tol=tol)
+
     sphere.set_pos((LATERAL_SHIFT, 0.0, sphere_init_pos[2]))
     scene.step()
 
-    dilate_data = _as_env_batch(dilate_sensor.read_ground_truth(), n_envs)
-    shear_data = _as_env_batch(shear_sensor.read_ground_truth(), n_envs)
-    combined_data = _as_env_batch(combined_sensor.read_ground_truth(), n_envs)
+    dilate_data = dilate_sensor.read_ground_truth()
+    shear_data = shear_sensor.read_ground_truth()
+    combined_data = combined_sensor.read_ground_truth()
     shear_normal_projection = (shear_data * normals).sum(dim=-1)
     shear_tangent = shear_data - shear_normal_projection.unsqueeze(-1) * normals
     assert torch.linalg.norm(shear_tangent, dim=-1).max() > tol
@@ -2025,6 +2437,25 @@ def test_elastomer_sensor_grid_box_sphere(show_viewer, tol, n_envs):
             **sensor_kwargs,
         )
     )
+    # A non-default normal_exponent (cubic instead of the default quadratic normal dilation), one per path.
+    cubic_grid_sensor = scene.add_sensor(
+        gs.sensors.ElastomerTaxel(
+            probe_local_pos=probe_local_pos,
+            dilate_scale=1.0,
+            shear_scale=0.0,
+            normal_exponent=3.0,
+            **sensor_kwargs,
+        )
+    )
+    cubic_flat_sensor = scene.add_sensor(
+        gs.sensors.ElastomerTaxel(
+            probe_local_pos=probe_local_pos.reshape(-1, 3),
+            dilate_scale=1.0,
+            shear_scale=0.0,
+            normal_exponent=3.0,
+            **sensor_kwargs,
+        )
+    )
     assert elastomer_grid_sensor._is_grid and elastomer_grid_sensor._use_grid_fft
     assert not elastomer_sensor._is_grid and not elastomer_sensor._use_grid_fft
     assert_allclose(elastomer_sensor.probe_local_pos, elastomer_grid_sensor.probe_local_pos, tol=gs.EPS)
@@ -2040,6 +2471,13 @@ def test_elastomer_sensor_grid_box_sphere(show_viewer, tol, n_envs):
     assert_allclose(shear_sensor.read_ground_truth(), 0.0, tol=tol)
     assert_allclose(combined_sensor.read_ground_truth(), flat_data, tol=tol)
 
+    # normal_exponent reshapes only the out-of-plane channel: the grid-FFT and direct paths still agree, and the
+    # cubic-normal response differs from the default quadratic one (sub-unit depths here, so depth**3 < depth**2).
+    cubic_data = cubic_grid_sensor.read_ground_truth()
+    assert_allclose(cubic_flat_sensor.read_ground_truth(), cubic_data, tol=tol)
+    cubic_diff = torch.as_tensor(cubic_data, device=gs.device) - torch.as_tensor(grid_data, device=gs.device)
+    assert torch.linalg.norm(cubic_diff, dim=-1).max() > tol, "normal_exponent=3 should change the dilation output"
+
     # Test combined displacement: dilate + shear contributions should add when the box slides laterally.
     box.set_pos((LATERAL_SHIFT, 0.0, SPHERE_RADIUS * 2 + BOX_SIZE / 2 - PENETRATION))
     scene.step()
@@ -2056,12 +2494,140 @@ def test_elastomer_sensor_grid_box_sphere(show_viewer, tol, n_envs):
     assert_equal(combined_sensor.read_ground_truth(), 0.0, err_msg="ElastomerTaxel should be zero in air.")
 
 
+@pytest.mark.required
+@pytest.mark.parametrize("n_envs", [0, 2])
+def test_tactile_filler_probes_radius_zero(show_viewer, tol, n_envs):
+    """probe_radius == 0 marks inactive filler probes on ElastomerTaxel / KinematicTaxel: they read 0 and are
+    excluded from dilation / force, letting an irregular taxel set be padded into a regular grid for FFT."""
+    SPHERE_RADIUS = 0.1
+    BOX_SIZE = 0.1
+    PENETRATION = 0.01
+    GRID = (8, 8)
+    RADIUS = 0.02
+
+    scene = gs.Scene(
+        sim_options=gs.options.SimOptions(
+            gravity=(0.0, 0.0, 0.0),
+        ),
+        profiling_options=gs.options.ProfilingOptions(
+            show_FPS=False,
+        ),
+        show_viewer=show_viewer,
+    )
+    sphere = scene.add_entity(
+        gs.morphs.Sphere(
+            radius=SPHERE_RADIUS,
+            pos=(0.0, 0.0, SPHERE_RADIUS),
+            fixed=True,
+        )
+    )
+    box = scene.add_entity(
+        gs.morphs.Box(
+            size=(BOX_SIZE, BOX_SIZE, BOX_SIZE),
+            pos=(0.0, 0.0, SPHERE_RADIUS * 2 + BOX_SIZE / 2 - PENETRATION),
+            fixed=False,
+        )
+    )
+    grid_pos = gu.generate_grid_points_on_plane(
+        lo=(-BOX_SIZE / 2, -BOX_SIZE / 2, -BOX_SIZE / 2),
+        hi=(BOX_SIZE / 2, BOX_SIZE / 2, -BOX_SIZE / 2),
+        normal=(0.0, 0.0, -1.0),
+        nx=GRID[0],
+        ny=GRID[1],
+    )
+    flat_pos = grid_pos.reshape(-1, 3)
+    # Mark a 2x2 corner block (flat indices iy*nx+ix) as inactive fillers; the rest sense normally.
+    filler_idx = [0, 1, GRID[0], GRID[0] + 1]
+    radii = np.full(flat_pos.shape[0], RADIUS)
+    radii[filler_idx] = 0.0
+    active_mask = radii > 0.0
+
+    elastomer_kwargs = dict(
+        entity_idx=box.idx,
+        probe_local_normal=(0.0, 0.0, -1.0),
+        track_link_idx=(sphere.base_link_idx,),
+        n_sample_points=600,
+        lambda_s=0.0,
+        shear_scale=0.0,
+        dilate_scale=1.0,
+        draw_debug=show_viewer,
+    )
+    elastomer_grid = scene.add_sensor(
+        gs.sensors.ElastomerTaxel(
+            probe_local_pos=grid_pos,
+            probe_radius=radii.tolist(),
+            **elastomer_kwargs,
+        )
+    )
+    elastomer_active = scene.add_sensor(
+        gs.sensors.ElastomerTaxel(
+            probe_local_pos=flat_pos[active_mask],
+            probe_radius=RADIUS,
+            **elastomer_kwargs,
+        )
+    )
+    kinematic_kwargs = dict(
+        entity_idx=box.idx,
+        probe_local_normal=(0.0, 0.0, -1.0),
+        normal_stiffness=500.0,
+        draw_debug=show_viewer,
+    )
+    kinematic_grid = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            probe_local_pos=grid_pos,
+            probe_radius=radii.tolist(),
+            **kinematic_kwargs,
+        )
+    )
+    kinematic_full = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            probe_local_pos=grid_pos,
+            probe_radius=RADIUS,
+            **kinematic_kwargs,
+        )
+    )
+    kinematic_crosstalk = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            probe_local_pos=grid_pos,
+            probe_radius=radii.tolist(),
+            crosstalk_strength=1.0,
+            crosstalk_sigma=BOX_SIZE / GRID[0],
+            **kinematic_kwargs,
+        )
+    )
+    assert elastomer_grid._use_grid_fft
+    scene.build(n_envs=n_envs)
+    scene.step()
+
+    # ElastomerTaxel (FFT dilation): filler probes read 0; active probes match a sensor built from only the
+    # active probes -- the fillers contribute no dilation, so the active readings are unchanged by their padding.
+    # The grid-input sensor reports (..., ny, nx, 3); flatten the grid axes for filler-index comparison.
+    grid_data = torch.as_tensor(elastomer_grid.read_ground_truth(), device=gs.device).flatten(-3, -2)
+    active_data = torch.as_tensor(elastomer_active.read_ground_truth(), device=gs.device)
+    assert torch.linalg.norm(grid_data, dim=-1).max() > tol, "active elastomer probes should detect contact"
+    assert_allclose(grid_data[..., filler_idx, :], 0.0, tol=gs.EPS)
+    assert_allclose(grid_data[..., active_mask, :], active_data, tol=tol)
+
+    # KinematicTaxel: filler probes read 0 force; active probes match the all-active grid (per-probe force).
+    # KinematicTaxel reports a grid-shaped (..., ny, nx, 3) reading; flatten the grid axes to the flat index.
+    kin_grid = torch.as_tensor(kinematic_grid.read().force, device=gs.device).flatten(-3, -2)
+    kin_full = torch.as_tensor(kinematic_full.read().force, device=gs.device).flatten(-3, -2)
+    assert torch.linalg.norm(kin_full, dim=-1).max() > tol, "active kinematic probes should detect contact"
+    assert_allclose(kin_grid[..., filler_idx, :], 0.0, tol=gs.EPS)
+    assert_allclose(kin_grid[..., active_mask, :], kin_full[..., active_mask, :], tol=tol)
+
+    # KinematicTaxel FFT crosstalk smears neighbour force, but filler probes are still masked back to 0.
+    kin_xt = torch.as_tensor(kinematic_crosstalk.read().force, device=gs.device).flatten(-3, -2)
+    assert_allclose(kin_xt[..., filler_idx, :], 0.0, tol=gs.EPS)
+
+
 @pytest.mark.required
 @pytest.mark.parametrize("n_envs", [0, 2])
 def test_proximity_sensor_box_on_box(show_viewer, tol, n_envs):
     """ProximityTaxel reports a nonzero point-cloud force in contact and near-zero force in air."""
     BOX_SIZE = 0.2
     PENETRATION = 0.01
+    GAIN = 1.5
 
     scene = gs.Scene(
         sim_options=gs.options.SimOptions(
@@ -2100,16 +2666,37 @@ def test_proximity_sensor_box_on_box(show_viewer, tol, n_envs):
             draw_debug=show_viewer,
         )
     )
+    # probe_gain variant (no radius noise so the measured branch is deterministic): force is linear in the summed
+    # penetration, so the measured force scales by the gain while GT is untouched.
+    gained_sensor = scene.add_sensor(
+        gs.sensors.ProximityTaxel(
+            entity_idx=taxel_box.idx,
+            probe_local_pos=((0.0, 0.0, -BOX_SIZE / 2), (BOX_SIZE / 4, 0.0, -BOX_SIZE / 2)),
+            probe_local_normal=(0.0, 0.0, -1.0),
+            probe_radius=0.06,
+            probe_gain=GAIN,
+            track_link_idx=(support.base_link_idx,),
+            n_sample_points=600,
+            stiffness=100.0,
+            shear_coupling=0.0,
+            draw_debug=show_viewer,
+        )
+    )
 
     scene.build(n_envs=n_envs)
     scene.step()
 
-    force_norm = torch.linalg.norm(_as_env_batch(sensor.read_ground_truth().force, n_envs), dim=-1)
+    force_norm = torch.linalg.norm(sensor.read_ground_truth().force, dim=-1)
     assert (force_norm > tol).all()
 
+    gained_meas = gained_sensor.read().force
+    gained_gt = gained_sensor.read_ground_truth().force
+    assert (torch.linalg.norm(gained_gt, dim=-1) > tol).all()  # sanity: in contact
+    assert_allclose(gained_meas, gained_gt * GAIN, tol=tol)
+
     taxel_box.set_pos((0.0, 0.0, BOX_SIZE + BOX_SIZE / 2 + 0.2))
     scene.step()
-    force_norm = torch.linalg.norm(_as_env_batch(sensor.read_ground_truth().force, n_envs), dim=-1)
+    force_norm = torch.linalg.norm(sensor.read_ground_truth().force, dim=-1)
     assert_allclose(force_norm, 0.0, tol=gs.EPS)
 
 

From 7b4765489aae6f40c5c4302a969b281585f205c5 Mon Sep 17 00:00:00 2001
From: Trinity Chung <trinityjchung@gmail.com>
Date: Sun, 24 May 2026 16:59:48 -0400
Subject: [PATCH 2/7] fix elastomertaxel is_grid

---
 genesis/engine/sensors/point_cloud_tactile.py | 4 ++--
 tests/test_sensors.py                         | 9 +++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/genesis/engine/sensors/point_cloud_tactile.py b/genesis/engine/sensors/point_cloud_tactile.py
index d7f66a51f5..00739408d5 100644
--- a/genesis/engine/sensors/point_cloud_tactile.py
+++ b/genesis/engine/sensors/point_cloud_tactile.py
@@ -1505,12 +1505,12 @@ class ElastomerTaxelSensor(
     def __init__(self, sensor_options: ElastomerTaxelSensorOptions, sensor_idx: int, sensor_manager: "SensorManager"):
         super().__init__(sensor_options, sensor_idx, sensor_manager)
         # FFT-grid eligibility check (flat pos/normals are already populated by the base mixins).
-        is_grid = len(self._probe_layout_shape) == 2
+        self._is_grid = len(self._probe_layout_shape) == 2
         _, _, self._use_grid_fft, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing = (
             normalize_grid_probe_layout(
                 np.asarray(sensor_options.probe_local_pos, dtype=gs.np_float),
                 np.asarray(sensor_options.probe_local_normal, dtype=gs.np_float),
-                is_grid,
+                self._is_grid,
             )
         )
         self._grid_normal = torch.tensor(grid_normal, dtype=gs.tc_float, device=gs.device)
diff --git a/tests/test_sensors.py b/tests/test_sensors.py
index 1614012a3e..bf4edf26a2 100644
--- a/tests/test_sensors.py
+++ b/tests/test_sensors.py
@@ -2464,18 +2464,19 @@ def test_elastomer_sensor_grid_box_sphere(show_viewer, tol, n_envs):
     scene.step()
 
     # Test dilate displacement: grid sensor should match the flat-layout sensor and detect contact magnitude.
-    grid_data = elastomer_grid_sensor.read_ground_truth()
+    # The grid-input sensor reports (..., ny, nx, 3); flatten the grid axes for comparison with the flat sensor.
+    grid_data = torch.as_tensor(elastomer_grid_sensor.read_ground_truth(), device=gs.device).flatten(-3, -2)
     flat_data = elastomer_sensor.read_ground_truth()
     assert_allclose(flat_data, grid_data, tol=tol)
-    assert torch.linalg.norm(torch.as_tensor(grid_data, device=gs.device), dim=-1).max() > tol
+    assert torch.linalg.norm(grid_data, dim=-1).max() > tol
     assert_allclose(shear_sensor.read_ground_truth(), 0.0, tol=tol)
     assert_allclose(combined_sensor.read_ground_truth(), flat_data, tol=tol)
 
     # normal_exponent reshapes only the out-of-plane channel: the grid-FFT and direct paths still agree, and the
     # cubic-normal response differs from the default quadratic one (sub-unit depths here, so depth**3 < depth**2).
-    cubic_data = cubic_grid_sensor.read_ground_truth()
+    cubic_data = torch.as_tensor(cubic_grid_sensor.read_ground_truth(), device=gs.device).flatten(-3, -2)
     assert_allclose(cubic_flat_sensor.read_ground_truth(), cubic_data, tol=tol)
-    cubic_diff = torch.as_tensor(cubic_data, device=gs.device) - torch.as_tensor(grid_data, device=gs.device)
+    cubic_diff = cubic_data - grid_data
     assert torch.linalg.norm(cubic_diff, dim=-1).max() > tol, "normal_exponent=3 should change the dilation output"
 
     # Test combined displacement: dilate + shear contributions should add when the box slides laterally.

From df0b9673d5a114ad16b634a3a452e54bf54149e6 Mon Sep 17 00:00:00 2001
From: Trinity Chung <trinityjchung@gmail.com>
Date: Mon, 25 May 2026 01:56:26 -0400
Subject: [PATCH 3/7] allow nonexact grid but with warning

---
 genesis/engine/sensors/kinematic_tactile.py   | 24 ++++---
 genesis/engine/sensors/point_cloud_tactile.py | 12 +++-
 genesis/engine/sensors/tactile_shared.py      | 62 ++++++++++++-------
 3 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/genesis/engine/sensors/kinematic_tactile.py b/genesis/engine/sensors/kinematic_tactile.py
index cc04228871..ade9304931 100644
--- a/genesis/engine/sensors/kinematic_tactile.py
+++ b/genesis/engine/sensors/kinematic_tactile.py
@@ -775,10 +775,11 @@ class KinematicTaxelSensor(
 
     def __init__(self, sensor_options: KinematicTaxelOptions, sensor_idx: int, sensor_manager: "SensorManager"):
         super().__init__(sensor_options, sensor_idx, sensor_manager)
-        # FFT-grid eligibility: validates that a 2D layout has uniform spacing/normals/orthogonal tangents.
+        # FFT-grid eligibility: requires a 2D probe layout with non-degenerate spacing. Strict regularity
+        # (uniform normals, orthogonal tangents, exact rectangle) is reported separately as a warning.
         # Flat pos/normals are already populated by ProbeSensorMixin / ProbesWithNormalSensorMixin.
         is_grid = len(self._probe_layout_shape) == 2
-        _, _, self._use_grid_fft, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing = (
+        _, _, self._use_grid_fft, is_grid_regular, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing = (
             normalize_grid_probe_layout(
                 np.asarray(sensor_options.probe_local_pos, dtype=gs.np_float),
                 np.asarray(sensor_options.probe_local_normal, dtype=gs.np_float),
@@ -790,11 +791,18 @@ def __init__(self, sensor_options: KinematicTaxelOptions, sensor_idx: int, senso
         self._grid_tangent_v = torch.tensor(grid_tangent_v, dtype=gs.tc_float, device=gs.device)
         self._grid_spacing = torch.tensor(grid_spacing, dtype=gs.tc_float, device=gs.device)
 
-        if self._options.crosstalk_strength > 0.0 and not self._use_grid_fft:
-            gs.raise_exception(
-                "KinematicTaxel crosstalk requires a validated grid layout (probe_local_pos shape (ny, nx, 3) with "
-                "uniform spacing, uniform normals, and orthogonal tangents)."
-            )
+        if self._options.crosstalk_strength > 0.0:
+            if not self._use_grid_fft:
+                gs.raise_exception(
+                    "KinematicTaxel crosstalk requires a 2D grid-shaped probe_local_pos (shape (ny, nx, 3) with "
+                    f"ny, nx >= 2 and non-degenerate spacing); got shape {tuple(self._probe_layout_shape)}."
+                )
+            if not is_grid_regular:
+                gs.logger.warning(
+                    "KinematicTaxel crosstalk grid is not strictly regular (uniform spacing, uniform normals, "
+                    "orthogonal tangents); FFT crosstalk will use averaged spacing and normal as a best-fit "
+                    "approximation."
+                )
 
     def build(self):
         super().build()
@@ -815,7 +823,7 @@ def build(self):
             self._shared_metadata.twist_scalar, float(self._options.twist_scalar), expand=(1,)
         )
 
-        if self._options.crosstalk_strength > 0.0:
+        if self._options.crosstalk_strength > 0.0 and self._use_grid_fft:
             self._register_crosstalk()
 
     def _get_return_format(self) -> tuple[tuple[int, ...], ...]:
diff --git a/genesis/engine/sensors/point_cloud_tactile.py b/genesis/engine/sensors/point_cloud_tactile.py
index 00739408d5..e20908d375 100644
--- a/genesis/engine/sensors/point_cloud_tactile.py
+++ b/genesis/engine/sensors/point_cloud_tactile.py
@@ -1504,9 +1504,11 @@ class ElastomerTaxelSensor(
 ):
     def __init__(self, sensor_options: ElastomerTaxelSensorOptions, sensor_idx: int, sensor_manager: "SensorManager"):
         super().__init__(sensor_options, sensor_idx, sensor_manager)
-        # FFT-grid eligibility check (flat pos/normals are already populated by the base mixins).
+        # FFT-grid eligibility check (flat pos/normals are already populated by the base mixins). 2D layouts with
+        # non-degenerate spacing use the FFT dilation path; strictly irregular grids still take that path with
+        # averaged metadata and only emit a warning.
         self._is_grid = len(self._probe_layout_shape) == 2
-        _, _, self._use_grid_fft, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing = (
+        _, _, self._use_grid_fft, is_grid_regular, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing = (
             normalize_grid_probe_layout(
                 np.asarray(sensor_options.probe_local_pos, dtype=gs.np_float),
                 np.asarray(sensor_options.probe_local_normal, dtype=gs.np_float),
@@ -1518,6 +1520,12 @@ def __init__(self, sensor_options: ElastomerTaxelSensorOptions, sensor_idx: int,
         self._grid_tangent_v = torch.tensor(grid_tangent_v, dtype=gs.tc_float, device=gs.device)
         self._grid_spacing = torch.tensor(grid_spacing, dtype=gs.tc_float, device=gs.device)
 
+        if self._use_grid_fft and not is_grid_regular:
+            gs.logger.warning(
+                "ElastomerTaxel grid is not strictly regular (uniform spacing, uniform normals, orthogonal "
+                "tangents); FFT dilation will use averaged spacing and normal as a best-fit approximation."
+            )
+
     def build(self):
         super().build()
 
diff --git a/genesis/engine/sensors/tactile_shared.py b/genesis/engine/sensors/tactile_shared.py
index 7613db64a1..df430a8ac4 100644
--- a/genesis/engine/sensors/tactile_shared.py
+++ b/genesis/engine/sensors/tactile_shared.py
@@ -124,17 +124,23 @@ def expand_probe_normals(normals: np.ndarray, n_probes: int, probe_shape: tuple[
 
 def normalize_grid_probe_layout(
     probe_pos: np.ndarray, probe_normals: np.ndarray, is_grid: bool
-) -> tuple[np.ndarray, np.ndarray, bool, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+) -> tuple[np.ndarray, np.ndarray, bool, bool, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
     """
     Validate a probe layout and extract grid-FFT metadata when the layout qualifies.
 
-    Returns ``(flat_positions, flat_normals, use_grid_fft, grid_normal, tangent_u, tangent_v, grid_spacing)``. When
-    the layout is flat (``is_grid=False``) or fails any grid-FFT precondition, ``use_grid_fft`` is False and the
-    tangent / spacing entries are zero.
+    Returns ``(flat_positions, flat_normals, use_grid_fft, is_grid_regular, grid_normal, tangent_u, tangent_v,
+    grid_spacing)``.
 
-    Grid-FFT preconditions: shape ``(ny, nx, 3)`` with ``ny, nx >= 2``, normals uniform within tolerance, tangents
-    orthogonal, both tangents in the plane perpendicular to the normal, and all interior probes laid out on a
-    regular ``(spacing_u, spacing_v)`` rectangle.
+    ``use_grid_fft`` is True when the layout has shape ``(ny, nx, 3)`` with ``ny, nx >= 2`` and non-degenerate
+    spacing along both axes -- the FFT path is usable and the grid metadata is populated as a best-fit
+    approximation (average step vectors over all adjacent pairs, average unit normal over all probes).
+
+    ``is_grid_regular`` is True when, in addition, the layout is strictly regular: normals uniform within
+    tolerance, tangents orthogonal, both tangents in the plane perpendicular to the normal, and all probes lie
+    on the regular ``(spacing_u, spacing_v)`` rectangle implied by the averaged steps. Callers that proceed
+    with FFT on an irregular layout (``use_grid_fft`` and not ``is_grid_regular``) should warn the user.
+
+    When ``use_grid_fft`` is False, the tangent / spacing / normal entries are zero.
     """
     probe_shape = probe_pos.shape[:-1]
     flat = probe_pos.reshape(-1, 3)
@@ -146,6 +152,7 @@ def normalize_grid_probe_layout(
     normals = normals / normal_norms[:, None]
 
     use_grid_fft = False
+    is_grid_regular = False
     grid_normal = np.zeros(3, dtype=gs.np_float)
     tangent_u = np.zeros(3, dtype=gs.np_float)
     tangent_v = np.zeros(3, dtype=gs.np_float)
@@ -157,14 +164,23 @@ def normalize_grid_probe_layout(
         ny, nx = int(probe_shape[0]), int(probe_shape[1])
         if nx >= 2 and ny >= 2:
             grid = probe_pos.reshape(ny, nx, 3)
-            step_u = grid[0, 1] - grid[0, 0]
-            step_v = grid[1, 0] - grid[0, 0]
-            spacing_u = float(np.linalg.norm(step_u))
-            spacing_v = float(np.linalg.norm(step_v))
+            # Averaged step vectors across all adjacent pairs along each axis -- robust to local jitter.
+            avg_step_u = (grid[:, 1:, :] - grid[:, :-1, :]).reshape(-1, 3).mean(axis=0)
+            avg_step_v = (grid[1:, :, :] - grid[:-1, :, :]).reshape(-1, 3).mean(axis=0)
+            spacing_u = float(np.linalg.norm(avg_step_u))
+            spacing_v = float(np.linalg.norm(avg_step_v))
             if spacing_u >= gs.EPS and spacing_v >= gs.EPS:
-                tangent_u_candidate = (step_u / spacing_u).astype(gs.np_float)
-                tangent_v_candidate = (step_v / spacing_v).astype(gs.np_float)
-                normal_candidate = normals[0].astype(gs.np_float, copy=False)
+                tangent_u_candidate = (avg_step_u / spacing_u).astype(gs.np_float)
+                tangent_v_candidate = (avg_step_v / spacing_v).astype(gs.np_float)
+                # Average unit normal across all probes. If they cancel out (e.g. opposing normals), fall back
+                # to the first probe's normal so downstream FFT still has a defined orientation.
+                avg_normal = normals.mean(axis=0)
+                normal_norm = float(np.linalg.norm(avg_normal))
+                if normal_norm < gs.EPS:
+                    normal_candidate = normals[0].astype(gs.np_float, copy=False)
+                else:
+                    normal_candidate = (avg_normal / normal_norm).astype(gs.np_float)
+
                 normals_are_uniform = bool(np.all(normals @ normal_candidate >= 1.0 - _GRID_TOL))
                 axes_are_orthogonal = abs(float(tangent_u_candidate @ tangent_v_candidate)) <= _GRID_TOL
                 axes_in_plane = (
@@ -173,21 +189,23 @@ def normalize_grid_probe_layout(
                 )
                 expected = (
                     grid[0, 0]
-                    + np.arange(nx, dtype=gs.np_float)[None, :, None] * step_u[None, None, :]
-                    + np.arange(ny, dtype=gs.np_float)[:, None, None] * step_v[None, None, :]
+                    + np.arange(nx, dtype=gs.np_float)[None, :, None] * avg_step_u[None, None, :]
+                    + np.arange(ny, dtype=gs.np_float)[:, None, None] * avg_step_v[None, None, :]
                 )
                 is_regular = bool(np.max(np.linalg.norm(grid - expected, axis=-1)) <= _GRID_TOL)
-                use_grid_fft = normals_are_uniform and axes_are_orthogonal and axes_in_plane and is_regular
-                if use_grid_fft:
-                    grid_normal = normal_candidate
-                    tangent_u = tangent_u_candidate
-                    tangent_v = tangent_v_candidate
-                    grid_spacing = np.array((spacing_u, spacing_v), dtype=gs.np_float)
+
+                use_grid_fft = True
+                is_grid_regular = normals_are_uniform and axes_are_orthogonal and axes_in_plane and is_regular
+                grid_normal = normal_candidate
+                tangent_u = tangent_u_candidate
+                tangent_v = tangent_v_candidate
+                grid_spacing = np.array((spacing_u, spacing_v), dtype=gs.np_float)
 
     return (
         flat.astype(gs.np_float, copy=False),
         normals.astype(gs.np_float, copy=False),
         use_grid_fft,
+        is_grid_regular,
         grid_normal.astype(gs.np_float, copy=False),
         tangent_u.astype(gs.np_float, copy=False),
         tangent_v.astype(gs.np_float, copy=False),

From 077c19c86cf99a6508a8e1ba3608a96fd1e07f6e Mon Sep 17 00:00:00 2001
From: Trinity Chung <trinityjchung@gmail.com>
Date: Mon, 25 May 2026 17:55:28 -0400
Subject: [PATCH 4/7] crosstalk reduce mem usage

---
 genesis/engine/sensors/kinematic_tactile.py | 227 ++++++++++----------
 genesis/options/sensors/tactile.py          |   6 +-
 2 files changed, 111 insertions(+), 122 deletions(-)

diff --git a/genesis/engine/sensors/kinematic_tactile.py b/genesis/engine/sensors/kinematic_tactile.py
index ade9304931..a61ae0aa11 100644
--- a/genesis/engine/sensors/kinematic_tactile.py
+++ b/genesis/engine/sensors/kinematic_tactile.py
@@ -1,10 +1,11 @@
 import math
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar
 
 import numpy as np
 import quadrants as qd
 import torch
+import torch.nn.functional as F
 
 import genesis as gs
 import genesis.utils.array_class as array_class
@@ -27,12 +28,9 @@
     get_measured_bufs,
 )
 from .tactile_shared import (
-    GridFFTConvMetadataMixin,
     ViscoelasticHysteresisMetadataMixin,
     ViscoelasticHysteresisMixin,
-    next_pow2,
     normalize_grid_probe_layout,
-    register_grid_fft_sensor,
 )
 
 if TYPE_CHECKING:
@@ -582,7 +580,6 @@ class KinematicTaxelData(NamedTuple):
 @dataclass
 class KinematicTaxelMetadata(
     ViscoelasticHysteresisMetadataMixin,
-    GridFFTConvMetadataMixin,
     ProbesWithNormalSensorMetadataMixin,
     RigidSensorMetadataMixin,
     SimpleSensorMetadata,
@@ -593,108 +590,104 @@ class KinematicTaxelMetadata(
     shear_scalar: torch.Tensor = make_tensor_field((0,))
     twist_scalar: torch.Tensor = make_tensor_field((0,))
 
-    # Spatial crosstalk reuses the shared ``GridFFTConvMetadataMixin`` state. ``grid_fft_meta`` tuples for this
-    # sensor are ``(sensor_idx, g_ny, g_nx, probe_start, cache_start, sigma, strength, spacing_u, spacing_v)``;
-    # the kernel is a combined ``(1 - strength) * identity + strength * Gaussian / sum(Gaussian)`` blur and the
-    # per-step buffer has 6 channels (force xyz + torque xyz).
+    # Per-sensor spatial crosstalk state. Each entry of ``crosstalk_meta`` is
+    # ``(g_ny, g_nx, probe_start, cache_start, strength, r_v, r_u)``; ``crosstalk_kernels_{v,u}`` hold the matching
+    # depthwise 1D Gaussian weights pre-shaped for ``F.conv2d`` with ``groups=6``. The 2D Gaussian is applied as two
+    # separable depthwise passes (kv along v-axis then ku along u-axis) on a per-sensor ``(B, 6, g_ny, g_nx)``
+    # temporary, with an identity blend ``(1 - strength) * x + strength * conv(x)``. No persistent buffer is held.
+    crosstalk_meta: list[tuple] = field(default_factory=list)
+    crosstalk_kernels_v: list[torch.Tensor] = field(default_factory=list)
+    crosstalk_kernels_u: list[torch.Tensor] = field(default_factory=list)
+    any_crosstalk: bool = False
 
 
-@torch.jit.script
-def _precompute_crosstalk_kernel_fft(
+def _build_separable_crosstalk_kernels(
     sigma: float,
-    strength: float,
-    grid_spacing: tuple[float, float],
-    fft_n: tuple[int, int],
+    spacing_u: float,
+    spacing_v: float,
     device: torch.device,
     dtype: torch.dtype,
-) -> torch.Tensor:
-    """Combined ``(1 - strength) * identity + strength * Gaussian/sum(Gaussian)`` kernel, real-FFT'd.
-
-    Kernel is centered on the FFT origin via ``ifftshift`` so circular convolution is equivalent to convolution with
-    a kernel anchored at the taxel itself. The Gaussian is L1-normalized so a uniform field passes through unchanged
-    (DC bin = 1); the identity-blend keeps the response peaked at the source taxel and the rest leaked into the
-    Gaussian skirt. The output is a complex ``(fft_n[0], fft_n[1] // 2 + 1)`` half-spectrum ready to multiply against
-    ``rfft2(field)``.
+) -> tuple[torch.Tensor, torch.Tensor, int, int]:
+    """Build two L1-normalized depthwise 1D Gaussian kernels for separable crosstalk convolution.
+
+    A 2D isotropic Gaussian is the outer product of two 1D Gaussians, so it is applied as two sequential 1D
+    convolutions (one per axis) on the unpadded active grid. Each 1D kernel has half-radius
+    ``r = ceil(3 * sigma / spacing)`` (3-sigma truncation; tail leakage below ~0.3%) and is L1-normalized after
+    truncation so a uniform field passes through unchanged.
+
+    Returns ``(kernel_v, kernel_u, r_v, r_u)``. The kernels are pre-shaped for ``F.conv2d`` with ``groups=6``:
+    ``kernel_v`` has shape ``(6, 1, 2*r_v + 1, 1)`` (axis 0 of the field, the v / ny axis) and ``kernel_u`` has
+    shape ``(6, 1, 1, 2*r_u + 1)`` (axis 1 of the field, the u / nx axis). The kernel is replicated across all 6
+    channels (force xyz + torque xyz) so a single ``groups=6`` call covers everything.
     """
-    i = torch.arange(fft_n[0], dtype=dtype, device=device)
-    j = torch.arange(fft_n[1], dtype=dtype, device=device)
-    yy, xx = torch.meshgrid((i - fft_n[0] // 2) * grid_spacing[0], (j - fft_n[1] // 2) * grid_spacing[1], indexing="ij")
-    sigma_t = torch.tensor(sigma, dtype=dtype, device=device)
-    g = torch.exp(-(xx * xx + yy * yy) / (2.0 * sigma_t * sigma_t))
-    g = g / g.sum()
-    # Identity in centered layout: 1 at the central cell. ``ifftshift`` then aligns it with FFT index 0.
-    identity = torch.zeros_like(g)
-    identity[fft_n[0] // 2, fft_n[1] // 2] = 1.0
-    combined = (1.0 - strength) * identity + strength * g
-    combined = torch.fft.ifftshift(combined, dim=(-2, -1))
-    return torch.fft.rfft2(combined)
-
-
-def _crosstalk_kernel_builder(meta_entry: tuple, fft_n: tuple[int, int]) -> torch.Tensor:
-    """``register_grid_fft_sensor`` kernel builder for spatial crosstalk: 1 plane (identity-blended Gaussian).
-
-    ``meta_entry`` is ``(sensor_idx, g_ny, g_nx, probe_start, cache_start, sigma, strength, spacing_u, spacing_v)``.
-    The crosstalk kernel's axis 0 spans ny / tangent_v and axis 1 spans nx / tangent_u, so spacing is passed as
-    ``(spacing_v, spacing_u)``.
-    """
-    _, _, _, _, _, sigma, strength, spacing_u, spacing_v = meta_entry
-    k = _precompute_crosstalk_kernel_fft(sigma, strength, (spacing_v, spacing_u), fft_n, gs.device, gs.tc_float)
-    return k.unsqueeze(0)  # (1, fft_ny, fft_nx) -- single kernel plane
-
-
-def _kinematic_taxel_grid_fft_crosstalk(
-    grid_fft_meta: list[tuple],
-    grid_fft_kernels_stacked: torch.Tensor,
+    r_v = max(1, int(math.ceil(3.0 * sigma / spacing_v)))
+    r_u = max(1, int(math.ceil(3.0 * sigma / spacing_u)))
+    iv = torch.arange(2 * r_v + 1, dtype=dtype, device=device) - r_v
+    iu = torch.arange(2 * r_u + 1, dtype=dtype, device=device) - r_u
+    gv = torch.exp(-((iv * spacing_v) ** 2) / (2.0 * sigma * sigma))
+    gu = torch.exp(-((iu * spacing_u) ** 2) / (2.0 * sigma * sigma))
+    gv = gv / gv.sum()
+    gu = gu / gu.sum()
+    kernel_v = gv.view(1, 1, -1, 1).repeat(6, 1, 1, 1)
+    kernel_u = gu.view(1, 1, 1, -1).repeat(6, 1, 1, 1)
+    return kernel_v, kernel_u, r_v, r_u
+
+
+def _kinematic_taxel_grid_separable_crosstalk(
+    crosstalk_meta: list[tuple],
+    crosstalk_kernels_v: list[torch.Tensor],
+    crosstalk_kernels_u: list[torch.Tensor],
     cache_data: torch.Tensor,
-    grid_fft_buffer: torch.Tensor,
     probe_radii: torch.Tensor,
 ) -> None:
     """
-    Apply per-sensor 2D-FFT spatial crosstalk to all 6 channels (force xyz + torque xyz) of every registered
-    grid-crosstalk KinematicTaxel sensor. Mutates ``cache_data`` in place.
-
-    ``cache_data`` is the per-class intermediate cache in ``(B, total_cols)`` layout. Each KinematicTaxel sensor's
-    slice spans ``2 * n_probes * 3`` columns: 3 force xyz cols per probe, then 3 torque xyz cols per probe.
+    Apply per-sensor 2D Gaussian spatial crosstalk to all 6 channels (force xyz + torque xyz) of every registered
+    grid-crosstalk KinematicTaxel sensor as two depthwise 1D convolutions (separable). Mutates ``cache_data`` in
+    place.
+
+    ``cache_data`` is the per-class intermediate cache in ``(B, total_cols)`` layout. Each sensor's slice spans
+    ``2 * n_probes * 3`` columns: 3 force xyz cols per probe, then 3 torque xyz cols per probe, both probe-major
+    (probe flat index ``iy * nx + ix``). Peak per-sensor working memory is ~3 * (B * 6 * g_ny * g_nx * sizeof(float)):
+    the input field, the intermediate after the first pass, and the blurred output. No persistent buffer is held.
     """
-    if not grid_fft_meta:
+    if not crosstalk_meta:
         return
     B = cache_data.shape[0]
-    fft_ny, fft_nx = grid_fft_buffer.shape[-2], grid_fft_buffer.shape[-1]
-
-    # 1) Fill the active region of the buffer. The zero-padding region is never written here and stays zero from
-    # allocation (``register_grid_fft_sensor`` allocates with ``torch.zeros``); the active ``[:g_ny, :g_nx]`` region
-    # is fully overwritten every step, so no per-step ``zero_()`` is needed.
-    for grid_pos, (_, g_ny, g_nx, _, cache_start, _, _, _, _) in enumerate(grid_fft_meta):
+    for (g_ny, g_nx, probe_start, cache_start, strength, r_v, r_u), kv, ku in zip(
+        crosstalk_meta, crosstalk_kernels_v, crosstalk_kernels_u
+    ):
         n_probes = g_ny * g_nx
-        # Layout in cache: force.xyz for all probes, then torque.xyz for all probes. Each block is ``n_probes * 3``
-        # cols, with probe-major ordering matching probe flat index ``iy * nx + ix``.
+        # Build a contiguous (B, 6, g_ny, g_nx) field: force xyz then torque xyz stacked on the channel axis. The
+        # cache holds probe-major ``iy * nx + ix`` ordering, so reshape (B, n_probes, 3) -> (B, ny, nx, 3) and
+        # permute -> (B, 3, ny, nx) per group, then concat along channels. ``.contiguous()`` materializes the
+        # permuted layout that ``F.conv2d`` needs.
         force_block = cache_data[:, cache_start : cache_start + n_probes * 3]
         torque_block = cache_data[:, cache_start + n_probes * 3 : cache_start + 2 * n_probes * 3]
-        # Reshape (B, ny, nx, 3) -> (B, 3, ny, nx); the slice-assignment accepts the non-contiguous permuted view.
-        grid_fft_buffer[:, grid_pos, 0:3, :g_ny, :g_nx] = force_block.view(B, g_ny, g_nx, 3).permute(0, 3, 1, 2)
-        grid_fft_buffer[:, grid_pos, 3:6, :g_ny, :g_nx] = torque_block.view(B, g_ny, g_nx, 3).permute(0, 3, 1, 2)
-
-    # 2) Batched real FFT over the last two dims; kernel is per-sensor, broadcast over B and 6 channels. Inputs are
-    # real so ``rfft2`` (half spectrum) is ~2x cheaper than the full complex ``fft2``.
-    H_fft = torch.fft.rfft2(grid_fft_buffer)  # (B, n_grid_xt, 6, fft_ny, fft_nx // 2 + 1) complex
-    # Stacked kernels: (n_grid_xt, 1, fft_ny, fft_nx // 2 + 1) -> (1, n_grid_xt, 1, ...) for broadcast.
-    K = grid_fft_kernels_stacked.unsqueeze(0)
-    smeared = torch.fft.irfft2(H_fft * K, s=(fft_ny, fft_nx))  # (B, n_grid_xt, 6, fft_ny, fft_nx)
-
-    # 3) Slice each sensor back to its (g_ny, g_nx) grid and write into the cache.
-    for grid_pos, (_, g_ny, g_nx, probe_start, cache_start, _, _, _, _) in enumerate(grid_fft_meta):
-        n_probes = g_ny * g_nx
-        # Zero inactive filler probes (probe_radius == 0): the blur smears neighbour force/torque into their cells.
-        active = (probe_radii[probe_start : probe_start + n_probes] > 0.0).to(smeared.dtype).view(1, 1, g_ny, g_nx)
-        force_smeared = smeared[:, grid_pos, 0:3, :g_ny, :g_nx] * active  # (B, 3, ny, nx)
-        torque_smeared = smeared[:, grid_pos, 3:6, :g_ny, :g_nx] * active
-        # Inverse of the permute used in step 1: (B, 3, ny, nx) -> (B, ny, nx, 3) -> flat (B, ny*nx*3).
-        cache_data[:, cache_start : cache_start + n_probes * 3] = force_smeared.permute(0, 2, 3, 1).reshape(
-            B, n_probes * 3
+        force = force_block.view(B, g_ny, g_nx, 3).permute(0, 3, 1, 2)
+        torque = torque_block.view(B, g_ny, g_nx, 3).permute(0, 3, 1, 2)
+        field_in = torch.cat((force, torque), dim=1).contiguous()  # (B, 6, g_ny, g_nx)
+
+        # Depthwise separable convolution: one Gaussian per channel via groups=6. ``padding='zeros'`` (the default)
+        # means out-of-grid taps contribute zero, so probes near the edge see no spurious mass from beyond the
+        # sensor footprint.
+        tmp = F.conv2d(field_in, kv, groups=6, padding=(r_v, 0))
+        blurred = F.conv2d(tmp, ku, groups=6, padding=(0, r_u))
+
+        # Identity blend: out = (1 - strength) * field_in + strength * blurred. Mathematically equivalent to
+        # convolving with ``(1 - strength) * delta + strength * Gaussian``.
+        out = field_in.mul_(1.0 - strength).add_(blurred, alpha=strength)
+
+        # Zero inactive filler probes (probe_radius == 0): the blur leaks neighbour force/torque into their cells.
+        active = (probe_radii[probe_start : probe_start + n_probes] > 0.0).to(out.dtype).view(1, 1, g_ny, g_nx)
+        out.mul_(active)
+
+        # Inverse of the build permute: (B, 3, ny, nx) -> (B, ny, nx, 3) -> flat (B, ny*nx*3).
+        cache_data[:, cache_start : cache_start + n_probes * 3] = (
+            out[:, 0:3].permute(0, 2, 3, 1).reshape(B, n_probes * 3)
+        )
+        cache_data[:, cache_start + n_probes * 3 : cache_start + 2 * n_probes * 3] = (
+            out[:, 3:6].permute(0, 2, 3, 1).reshape(B, n_probes * 3)
         )
-        cache_data[:, cache_start + n_probes * 3 : cache_start + 2 * n_probes * 3] = torque_smeared.permute(
-            0, 2, 3, 1
-        ).reshape(B, n_probes * 3)
 
 
 CrosstalkSharedMetadataT = TypeVar("CrosstalkSharedMetadataT", bound=KinematicTaxelMetadata)
@@ -702,19 +695,20 @@ def _kinematic_taxel_grid_fft_crosstalk(
 
 class KinematicTaxelCrosstalkMixin(Generic[CrosstalkSharedMetadataT]):
     """
-    Adds FFT-based spatial crosstalk (Gaussian blur, optionally mixed with identity) to KinematicTaxel on the
-    measured branch. Operates on all 6 channels (force xyz + torque xyz) of every grid-shaped sensor with
-    ``crosstalk_strength > 0``. Must come BEFORE ``SimpleSensor`` and AFTER ``ViscoelasticHysteresisMixin`` in MRO
-    so the data flow is: kernel output -> crosstalk -> hysteresis -> hardware imperfections.
+    Adds Gaussian spatial crosstalk (optionally mixed with identity) to KinematicTaxel on the measured branch.
+    Operates on all 6 channels (force xyz + torque xyz) of every grid-shaped sensor with ``crosstalk_strength > 0``.
+    Must come BEFORE ``SimpleSensor`` and AFTER ``ViscoelasticHysteresisMixin`` in MRO so the data flow is:
+    kernel output -> crosstalk -> hysteresis -> hardware imperfections.
     """
 
     _shared_metadata: CrosstalkSharedMetadataT
 
     def _register_crosstalk(self):
-        """Register this sensor for FFT crosstalk via the shared ``register_grid_fft_sensor`` scaffolding.
+        """Build this sensor's separable Gaussian crosstalk kernels and append them to the shared metadata lists.
 
-        Called only when this sensor has a validated grid layout AND ``crosstalk_strength > 0``. The FFT size is
-        ``(ny, nx)``, padded for the Gaussian tail so circular wrap stays below tolerance.
+        Called only when this sensor has a validated grid layout AND ``crosstalk_strength > 0``. Stores two
+        L1-normalized 1D Gaussians (truncated at 3 sigma on each axis, pre-shaped for depthwise ``F.conv2d`` with
+        ``groups=6``); no persistent per-step buffer is allocated.
         """
         sm = self._shared_metadata
         sensor_idx = sm.n_probes_per_sensor.shape[0] - 1  # this sensor was just registered
@@ -725,18 +719,13 @@ def _register_crosstalk(self):
         strength = float(self._options.crosstalk_strength)
         spacing_u = float(self._grid_spacing[0].item())
         spacing_v = float(self._grid_spacing[1].item())
-        # FFT size per axis: grid extent + the 3-sigma Gaussian tail on each side, rounded up to a power of 2.
-        # Truncating at 3 sigma leaves circular wraparound below ~0.3% (sub-tolerance for a well-localized blur).
-        fft_ny = next_pow2(g_ny + 2 * int(math.ceil(3.0 * sigma / spacing_v)))
-        fft_nx = next_pow2(g_nx + 2 * int(math.ceil(3.0 * sigma / spacing_u)))
-        register_grid_fft_sensor(
-            sm,
-            meta_entry=(sensor_idx, g_ny, g_nx, probe_start, cache_start, sigma, strength, spacing_u, spacing_v),
-            this_fft_n=(fft_ny, fft_nx),
-            kernel_builder=_crosstalk_kernel_builder,
-            n_buffer_channels=6,
-            batch_size=self._manager._sim._B,
+        kernel_v, kernel_u, r_v, r_u = _build_separable_crosstalk_kernels(
+            sigma, spacing_u, spacing_v, gs.device, gs.tc_float
         )
+        sm.crosstalk_meta.append((g_ny, g_nx, probe_start, cache_start, strength, r_v, r_u))
+        sm.crosstalk_kernels_v.append(kernel_v)
+        sm.crosstalk_kernels_u.append(kernel_u)
+        sm.any_crosstalk = True
 
     @classmethod
     def _apply_transform(
@@ -748,13 +737,13 @@ def _apply_transform(
         is_measured: bool,
     ):
         super()._apply_transform(shared_metadata, data, timeline, is_measured=is_measured)
-        if not is_measured or not shared_metadata.any_grid_fft:
+        if not is_measured or not shared_metadata.any_crosstalk:
             return
-        _kinematic_taxel_grid_fft_crosstalk(
-            shared_metadata.grid_fft_meta,
-            shared_metadata.grid_fft_kernels_stacked,
+        _kinematic_taxel_grid_separable_crosstalk(
+            shared_metadata.crosstalk_meta,
+            shared_metadata.crosstalk_kernels_v,
+            shared_metadata.crosstalk_kernels_u,
             data,
-            shared_metadata.grid_fft_buffer,
             shared_metadata.probe_radii,
         )
 
@@ -775,11 +764,11 @@ class KinematicTaxelSensor(
 
     def __init__(self, sensor_options: KinematicTaxelOptions, sensor_idx: int, sensor_manager: "SensorManager"):
         super().__init__(sensor_options, sensor_idx, sensor_manager)
-        # FFT-grid eligibility: requires a 2D probe layout with non-degenerate spacing. Strict regularity
-        # (uniform normals, orthogonal tangents, exact rectangle) is reported separately as a warning.
+        # Grid eligibility for spatial crosstalk: requires a 2D probe layout with non-degenerate spacing. Strict
+        # regularity (uniform normals, orthogonal tangents, exact rectangle) is reported separately as a warning.
         # Flat pos/normals are already populated by ProbeSensorMixin / ProbesWithNormalSensorMixin.
         is_grid = len(self._probe_layout_shape) == 2
-        _, _, self._use_grid_fft, is_grid_regular, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing = (
+        _, _, self._use_grid_crosstalk, is_grid_regular, grid_normal, grid_tangent_u, grid_tangent_v, grid_spacing = (
             normalize_grid_probe_layout(
                 np.asarray(sensor_options.probe_local_pos, dtype=gs.np_float),
                 np.asarray(sensor_options.probe_local_normal, dtype=gs.np_float),
@@ -792,7 +781,7 @@ def __init__(self, sensor_options: KinematicTaxelOptions, sensor_idx: int, senso
         self._grid_spacing = torch.tensor(grid_spacing, dtype=gs.tc_float, device=gs.device)
 
         if self._options.crosstalk_strength > 0.0:
-            if not self._use_grid_fft:
+            if not self._use_grid_crosstalk:
                 gs.raise_exception(
                     "KinematicTaxel crosstalk requires a 2D grid-shaped probe_local_pos (shape (ny, nx, 3) with "
                     f"ny, nx >= 2 and non-degenerate spacing); got shape {tuple(self._probe_layout_shape)}."
@@ -800,7 +789,7 @@ def __init__(self, sensor_options: KinematicTaxelOptions, sensor_idx: int, senso
             if not is_grid_regular:
                 gs.logger.warning(
                     "KinematicTaxel crosstalk grid is not strictly regular (uniform spacing, uniform normals, "
-                    "orthogonal tangents); FFT crosstalk will use averaged spacing and normal as a best-fit "
+                    "orthogonal tangents); crosstalk will use averaged spacing and normal as a best-fit "
                     "approximation."
                 )
 
@@ -823,7 +812,7 @@ def build(self):
             self._shared_metadata.twist_scalar, float(self._options.twist_scalar), expand=(1,)
         )
 
-        if self._options.crosstalk_strength > 0.0 and self._use_grid_fft:
+        if self._options.crosstalk_strength > 0.0 and self._use_grid_crosstalk:
             self._register_crosstalk()
 
     def _get_return_format(self) -> tuple[tuple[int, ...], ...]:
diff --git a/genesis/options/sensors/tactile.py b/genesis/options/sensors/tactile.py
index 337a98af6a..e335154bdb 100644
--- a/genesis/options/sensors/tactile.py
+++ b/genesis/options/sensors/tactile.py
@@ -229,9 +229,9 @@ class KinematicTaxel(
     If this sensor is attached to a fixed entity, it will not detect contacts with other fixed entities.
 
     ``probe_local_pos`` may be either an arbitrary set of probes with shape ``(N, 3)`` or a grid-shaped set with shape
-    ``(M, N, 3)``. Regular planar grids enable FFT-based spatial crosstalk on the measured branch (see
-    ``crosstalk_strength``). A probe whose ``probe_radius`` is 0 is treated as an inactive filler -- it reads 0
-    force/torque and is skipped -- so an irregular taxel set can be padded into a regular grid for FFT crosstalk.
+    ``(M, N, 3)``. Regular planar grids enable spatial crosstalk on the measured branch (see ``crosstalk_strength``).
+    A probe whose ``probe_radius`` is 0 is treated as an inactive filler -- it reads 0 force/torque and is skipped --
+    so an irregular taxel set can be padded into a regular grid for crosstalk.
 
     Parameters
     ----------

From 99c79afdcf342faad044cb9406fafc74648f0fa2 Mon Sep 17 00:00:00 2001
From: Trinity Chung <trinityjchung@gmail.com>
Date: Mon, 25 May 2026 19:30:23 -0400
Subject: [PATCH 5/7] optimizations

---
 genesis/engine/sensors/kinematic_tactile.py   | 150 +++++--
 genesis/engine/sensors/point_cloud_tactile.py | 419 ++++++++----------
 .../engine/sensors/surface_distance_probe.py  | 304 ++++++++++---
 genesis/engine/sensors/tactile_shared.py      | 222 +++++++++-
 4 files changed, 756 insertions(+), 339 deletions(-)

diff --git a/genesis/engine/sensors/kinematic_tactile.py b/genesis/engine/sensors/kinematic_tactile.py
index a61ae0aa11..41ea478b9b 100644
--- a/genesis/engine/sensors/kinematic_tactile.py
+++ b/genesis/engine/sensors/kinematic_tactile.py
@@ -28,6 +28,7 @@
     get_measured_bufs,
 )
 from .tactile_shared import (
+    ContactPrefilterMetadataMixin,
     ViscoelasticHysteresisMetadataMixin,
     ViscoelasticHysteresisMixin,
     normalize_grid_probe_layout,
@@ -44,6 +45,7 @@
 @qd.func
 def _func_query_contact_depth_penetration(
     i_b: int,
+    i_s: int,
     probe_pos: qd.types.vector(3),
     probe_radius_gt: float,
     probe_radius_m: float,
@@ -51,25 +53,23 @@ def _func_query_contact_depth_penetration(
     geoms_info: array_class.GeomsInfo,
     geoms_state: array_class.GeomsState,
     collider_state: array_class.ColliderState,
+    sensor_contacts_idx: qd.types.ndarray(),
+    sensor_n_contacts: qd.types.ndarray(),
     sdf_info: array_class.SDFInfo,
 ):
     """
     Max probe penetration from SDF for contacts involving the sensor link, dual-radius.
-
-    Returns ``(max_pen_gt, max_pen_m)`` from a single SDF pass: both penetrations come from the same ``sd`` per contact
-    via ``pen = radius - sd``. Callers that do not need the noised-radius branch pass ``probe_radius_m ==
-    probe_radius_gt`` and ignore the second return.
     """
     max_pen_gt = gs.qd_float(0.0)
     max_pen_m = gs.qd_float(0.0)
 
-    n_contacts = collider_state.n_contacts[i_b]
-    for i_c in range(n_contacts):
-        i_col = collider_state.contact_sort_idx[i_c, i_b]
-        c_link_a = collider_state.contact_data.link_a[i_col, i_b]
-        c_link_b = collider_state.contact_data.link_b[i_col, i_b]
-        c_geom_a = collider_state.contact_data.geom_a[i_col, i_b]
-        c_geom_b = collider_state.contact_data.geom_b[i_col, i_b]
+    n_c = sensor_n_contacts[i_b, i_s]
+    for k in range(n_c):
+        i_c = sensor_contacts_idx[i_b, i_s, k]
+        c_link_a = collider_state.contact_data.link_a[i_c, i_b]
+        c_link_b = collider_state.contact_data.link_b[i_c, i_b]
+        c_geom_a = collider_state.contact_data.geom_a[i_c, i_b]
+        c_geom_b = collider_state.contact_data.geom_b[i_c, i_b]
 
         for side in qd.static(range(2)):
             c_link = c_link_a if side == 0 else c_link_b
@@ -89,9 +89,50 @@ def _func_query_contact_depth_penetration(
     return max_pen_gt, max_pen_m
 
 
+# Per-(env, sensor) cap on the prefiltered contact list consumed by ``_func_query_contact_depth``
+# and ``_func_query_contact_depth_penetration``. Sensors track a single rigid link; even with multicontact
+# and many neighbouring geoms, the count of contacts touching one link rarely exceeds a few hundred.
+_MAX_CONTACTS_PER_SENSOR = 1024
+
+
+@qd.kernel
+def _kernel_build_sensor_contact_idx(
+    sensor_link_idx: qd.types.ndarray(),
+    collider_state: array_class.ColliderState,
+    sensor_contacts_idx: qd.types.ndarray(),
+    sensor_n_contacts: qd.types.ndarray(),
+):
+    """
+    Per-(env, sensor) compact contact index for the KinematicTaxel pre-pass.
+
+    Parallelizes over ``(n_batches, n_sensors)``: each thread scans the collider's contact list once and writes
+    the indices of contacts whose ``link_a`` or ``link_b`` equals the sensor's tracked link. Drops the main
+    kernel's per-probe contact-list scan from O(n_probes * n_contacts) to O(n_probes * sensor_n_contacts).
+    Cap-overflows (count >= last dim of ``sensor_contacts_idx``) silently truncate; see the module-level
+    ``_MAX_CONTACTS_PER_SENSOR`` comment.
+    """
+    n_sensors = sensor_link_idx.shape[0]
+    n_batches = sensor_n_contacts.shape[0]
+    max_per_sensor = sensor_contacts_idx.shape[2]
+    for i_b, i_s in qd.ndrange(n_batches, n_sensors):
+        link = sensor_link_idx[i_s]
+        count = gs.qd_int(0)
+        n_c = collider_state.n_contacts[i_b]
+        for i_c in range(n_c):
+            if count >= max_per_sensor:
+                break
+            la = collider_state.contact_data.link_a[i_c, i_b]
+            lb = collider_state.contact_data.link_b[i_c, i_b]
+            if la == link or lb == link:
+                sensor_contacts_idx[i_b, i_s, count] = i_c
+                count = count + 1
+        sensor_n_contacts[i_b, i_s] = count
+
+
 @qd.func
 def _func_query_contact_depth(
     i_b: int,
+    i_s: int,
     probe_pos: qd.types.vector(3),
     probe_radius_gt: float,
     probe_radius_m: float,
@@ -101,15 +142,18 @@ def _func_query_contact_depth(
     rigid_global_info: array_class.RigidGlobalInfo,
     collider_static_config: qd.template(),
     collider_state: array_class.ColliderState,
+    sensor_contacts_idx: qd.types.ndarray(),
+    sensor_n_contacts: qd.types.ndarray(),
     sdf_info: array_class.SDFInfo,
     eps: float,
 ):
     """
     Dual-radius probe query: single SDF + normal pass yielding both GT and noised-radius results.
 
-    Returns ``(max_pen_gt, contact_link_gt, contact_normal_gt, max_pen_m, contact_link_m, contact_normal_m)``. AABB
-    pre-filter expands by ``max(probe_radius_gt, probe_radius_m)`` so neither branch is silently skipped. Callers
-    without a noised radius pass ``probe_radius_m == probe_radius_gt``.
+    Iterates only the per-(env, sensor) prefiltered contact list built by ``_kernel_build_sensor_contact_idx``;
+    every contact in that list has ``link_a`` or ``link_b`` equal to ``sensor_link_idx``. AABB pre-filter
+    expands by ``max(probe_radius_gt, probe_radius_m)`` so neither branch is silently skipped. Callers without
+    a noised radius pass ``probe_radius_m == probe_radius_gt``.
     """
     max_pen_gt = gs.qd_float(0.0)
     contact_link_gt = gs.qd_int(-1)
@@ -119,16 +163,16 @@ def _func_query_contact_depth(
     contact_normal_m = qd.Vector.zero(gs.qd_float, 3)
 
     aabb_expansion = qd.max(probe_radius_gt, probe_radius_m)
-    # Iterate over contacts directly from collider state; each contact may have the sensor link on either side.
-    n_contacts = collider_state.n_contacts[i_b]
-    for i_c in range(n_contacts):
-        i_col = collider_state.contact_sort_idx[i_c, i_b]
-        c_link_a = collider_state.contact_data.link_a[i_col, i_b]
-        c_link_b = collider_state.contact_data.link_b[i_col, i_b]
-        c_geom_a = collider_state.contact_data.geom_a[i_col, i_b]
-        c_geom_b = collider_state.contact_data.geom_b[i_col, i_b]
-
-        # Check if either side of this contact involves the sensor link.
+    n_c = sensor_n_contacts[i_b, i_s]
+    for k in range(n_c):
+        i_c = sensor_contacts_idx[i_b, i_s, k]
+        c_link_a = collider_state.contact_data.link_a[i_c, i_b]
+        c_link_b = collider_state.contact_data.link_b[i_c, i_b]
+        c_geom_a = collider_state.contact_data.geom_a[i_c, i_b]
+        c_geom_b = collider_state.contact_data.geom_b[i_c, i_b]
+
+        # Each prefiltered contact touches the sensor link on at least one side; check both since the link
+        # may appear as link_a, link_b, or (degenerately) both.
         for side in qd.static(range(2)):
             c_link = c_link_a if side == 0 else c_link_b
             i_g = c_geom_b if side == 0 else c_geom_a
@@ -226,6 +270,8 @@ def _kernel_kinematic_taxel(
     sensor_cache_start: qd.types.ndarray(),
     sensor_probe_start: qd.types.ndarray(),
     n_probes_per_sensor: qd.types.ndarray(),
+    sensor_contacts_idx: qd.types.ndarray(),
+    sensor_n_contacts: qd.types.ndarray(),
     collider_state: array_class.ColliderState,
     collider_static_config: qd.template(),
     links_state: array_class.LinksState,
@@ -284,6 +330,7 @@ def _kernel_kinematic_taxel(
             contact_normal_m,
         ) = _func_query_contact_depth(
             i_b,
+            i_s,
             probe_pos,
             probe_radius,
             probe_radius_m,
@@ -293,6 +340,8 @@ def _kernel_kinematic_taxel(
             rigid_global_info,
             collider_static_config,
             collider_state,
+            sensor_contacts_idx,
+            sensor_n_contacts,
             sdf_info,
             eps,
         )
@@ -355,6 +404,8 @@ def _kernel_contact_depth_probe(
     links_idx: qd.types.ndarray(),
     sensor_cache_start: qd.types.ndarray(),
     sensor_probe_start: qd.types.ndarray(),
+    sensor_contacts_idx: qd.types.ndarray(),
+    sensor_n_contacts: qd.types.ndarray(),
     collider_state: array_class.ColliderState,
     links_state: array_class.LinksState,
     geoms_state: array_class.GeomsState,
@@ -394,6 +445,7 @@ def _kernel_contact_depth_probe(
 
         max_penetration_gt, max_penetration_m = _func_query_contact_depth_penetration(
             i_b,
+            i_s,
             probe_pos,
             probe_radius,
             probe_radius_m,
@@ -401,6 +453,8 @@ def _kernel_contact_depth_probe(
             geoms_info,
             geoms_state,
             collider_state,
+            sensor_contacts_idx,
+            sensor_n_contacts,
             sdf_info,
         )
         # Per-(env, probe) gain on the measured-branch depth only.
@@ -420,6 +474,7 @@ def build(self):
 class ContactDepthProbeMetadata(
     ViscoelasticHysteresisMetadataMixin,
     ProbeSensorMetadataMixin,
+    ContactPrefilterMetadataMixin,
     RigidSensorMetadataMixin,
     SimpleSensorMetadata,
 ):
@@ -432,7 +487,19 @@ class ContactDepthProbeSensor(
     RigidSensorMixin[ContactDepthProbeMetadata],
     SimpleSensor[ContactDepthProbeOptions, ContactDepthProbeMetadata, tuple],
 ):
-    """Returns contact depth in meters per probe."""
+    """
+    Returns contact depth in meters per probe.
+    """
+
+    def build(self):
+        super().build()
+        # Re-allocate the per-(env, sensor) contact prefilter buffers to absorb the newly-registered sensor.
+        B = self._manager._sim._B
+        n_sensors_built = self._shared_metadata.n_probes_per_sensor.shape[0]
+        self._shared_metadata.sensor_contacts_idx = torch.zeros(
+            (B, n_sensors_built, _MAX_CONTACTS_PER_SENSOR), dtype=gs.tc_int, device=gs.device
+        )
+        self._shared_metadata.sensor_n_contacts = torch.zeros((B, n_sensors_built), dtype=gs.tc_int, device=gs.device)
 
     def _get_return_format(self) -> tuple[int, ...]:
         return self._probe_layout_shape
@@ -453,6 +520,12 @@ def _update_current_timestep_data(
         measured, measured_cols_b = get_measured_bufs(
             shared_metadata, current_ground_truth_data_T, measured_data_timeline
         )
+        _kernel_build_sensor_contact_idx(
+            shared_metadata.links_idx,
+            solver.collider._collider_state,
+            shared_metadata.sensor_contacts_idx,
+            shared_metadata.sensor_n_contacts,
+        )
         _kernel_contact_depth_probe(
             shared_metadata.probe_positions,
             shared_metadata.probe_sensor_idx,
@@ -462,6 +535,8 @@ def _update_current_timestep_data(
             shared_metadata.links_idx,
             shared_metadata.sensor_cache_start,
             shared_metadata.sensor_probe_start,
+            shared_metadata.sensor_contacts_idx,
+            shared_metadata.sensor_n_contacts,
             solver.collider._collider_state,
             solver.links_state,
             solver.geoms_state,
@@ -581,6 +656,7 @@ class KinematicTaxelData(NamedTuple):
 class KinematicTaxelMetadata(
     ViscoelasticHysteresisMetadataMixin,
     ProbesWithNormalSensorMetadataMixin,
+    ContactPrefilterMetadataMixin,
     RigidSensorMetadataMixin,
     SimpleSensorMetadata,
 ):
@@ -590,11 +666,10 @@ class KinematicTaxelMetadata(
     shear_scalar: torch.Tensor = make_tensor_field((0,))
     twist_scalar: torch.Tensor = make_tensor_field((0,))
 
-    # Per-sensor spatial crosstalk state. Each entry of ``crosstalk_meta`` is
+    # Per-sensor spatial crosstalk state. ``crosstalk_meta[i]`` is
     # ``(g_ny, g_nx, probe_start, cache_start, strength, r_v, r_u)``; ``crosstalk_kernels_{v,u}`` hold the matching
-    # depthwise 1D Gaussian weights pre-shaped for ``F.conv2d`` with ``groups=6``. The 2D Gaussian is applied as two
-    # separable depthwise passes (kv along v-axis then ku along u-axis) on a per-sensor ``(B, 6, g_ny, g_nx)``
-    # temporary, with an identity blend ``(1 - strength) * x + strength * conv(x)``. No persistent buffer is held.
+    # depthwise 1D Gaussian weights pre-shaped for ``F.conv2d`` with ``groups=6``. Applied as two separable
+    # passes (kv then ku) with an identity blend ``(1 - strength) * x + strength * conv(x)``.
     crosstalk_meta: list[tuple] = field(default_factory=list)
     crosstalk_kernels_v: list[torch.Tensor] = field(default_factory=list)
     crosstalk_kernels_u: list[torch.Tensor] = field(default_factory=list)
@@ -815,6 +890,15 @@ def build(self):
         if self._options.crosstalk_strength > 0.0 and self._use_grid_crosstalk:
             self._register_crosstalk()
 
+        # Re-allocate the per-(env, sensor) contact prefilter buffers to absorb the newly-registered sensor.
+        # Sized at build time; the per-step kernel writes into the same buffers without further allocation.
+        B = self._manager._sim._B
+        n_sensors_built = self._shared_metadata.n_probes_per_sensor.shape[0]
+        self._shared_metadata.sensor_contacts_idx = torch.zeros(
+            (B, n_sensors_built, _MAX_CONTACTS_PER_SENSOR), dtype=gs.tc_int, device=gs.device
+        )
+        self._shared_metadata.sensor_n_contacts = torch.zeros((B, n_sensors_built), dtype=gs.tc_int, device=gs.device)
+
     def _get_return_format(self) -> tuple[tuple[int, ...], ...]:
         shape = (*self._probe_layout_shape, 3)
         return shape, shape
@@ -840,6 +924,12 @@ def _update_current_timestep_data(
         measured_equals_gt = int(
             not shared_metadata.has_any_probe_radius_noise and not shared_metadata.has_any_probe_gain
         )
+        _kernel_build_sensor_contact_idx(
+            shared_metadata.links_idx,
+            solver.collider._collider_state,
+            shared_metadata.sensor_contacts_idx,
+            shared_metadata.sensor_n_contacts,
+        )
         _kernel_kinematic_taxel(
             shared_metadata.probe_positions,
             shared_metadata.probe_sensor_idx,
@@ -855,6 +945,8 @@ def _update_current_timestep_data(
             shared_metadata.sensor_cache_start,
             shared_metadata.sensor_probe_start,
             shared_metadata.n_probes_per_sensor,
+            shared_metadata.sensor_contacts_idx,
+            shared_metadata.sensor_n_contacts,
             solver.collider._collider_state,
             solver.collider._collider_static_config,
             solver.links_state,
diff --git a/genesis/engine/sensors/point_cloud_tactile.py b/genesis/engine/sensors/point_cloud_tactile.py
index e20908d375..d5d6d34c9a 100644
--- a/genesis/engine/sensors/point_cloud_tactile.py
+++ b/genesis/engine/sensors/point_cloud_tactile.py
@@ -24,9 +24,17 @@
     get_measured_bufs,
 )
 from .tactile_shared import (
+    BVH_LEAF_SIZE,
+    BVH_STACK_SIZE,
+    BVHMetadata,
     GridFFTConvMetadataMixin,
     ViscoelasticHysteresisMetadataMixin,
     ViscoelasticHysteresisMixin,
+    build_static_chunk_bvh,
+    func_aabb_intersects_aabb,
+    func_sphere_intersects_aabb,
+    func_vec3_at,
+    get_mesh_geom_chunks,
     next_pow2,
     normalize_grid_probe_layout,
     register_grid_fft_sensor,
@@ -40,30 +48,6 @@
     from .sensor_manager import SensorManager
 
 
-def _get_mesh_geom_chunks(link, prefer_visual: bool) -> list[tuple[object, np.ndarray, np.ndarray]]:
-    """Return per-geom mesh chunks in link-local frame."""
-    if prefer_visual:
-        geoms = list(link.vgeoms) if link.vgeoms else list(link.geoms)
-        use_vverts = bool(link.vgeoms)
-    else:
-        geoms = list(link.geoms) if link.geoms else list(link.vgeoms)
-        use_vverts = not bool(link.geoms) and bool(link.vgeoms)
-
-    chunks: list[tuple[object, np.ndarray, np.ndarray]] = []
-    for geom in geoms:
-        if use_vverts:
-            verts = np.asarray(geom.init_vverts, dtype=np.float32)
-            faces = np.asarray(geom.init_vfaces, dtype=np.int32)
-        else:
-            verts = np.asarray(geom.init_verts, dtype=np.float32)
-            faces = np.asarray(geom.init_faces, dtype=np.int32)
-        if verts.size == 0 or faces.size == 0:
-            continue
-        verts_link = gu.transform_by_trans_quat(verts, geom.init_pos, geom.init_quat)
-        chunks.append((geom, verts_link.astype(np.float32, copy=False), np.asarray(faces, dtype=np.int32)))
-    return chunks
-
-
 def _n_sample_points_per_link(n_sample_points: int | list | tuple, n_links: int) -> list[int]:
     if n_links <= 0:
         return []
@@ -143,7 +127,7 @@ def _sample_track_links_point_cloud_tensors(
         n_pts = n_per_link[i_l]
         link_idx = int(track_link_idx[i_l])
         link = solver.links[link_idx]
-        geom_chunks = _get_mesh_geom_chunks(link, prefer_visual)
+        geom_chunks = get_mesh_geom_chunks(link, prefer_visual)
         if not geom_chunks:
             gs.raise_exception(f"No mesh geometry on tracked link index {link_idx}.")
         for n_geom_pts, (geom, verts, faces) in zip(_split_count_by_area(n_pts, geom_chunks), geom_chunks):
@@ -172,116 +156,27 @@ def _sample_track_links_point_cloud_tensors(
     )
 
 
-_POINT_CLOUD_BVH_LEAF_SIZE = 8
-_POINT_CLOUD_BVH_STACK_SIZE = 32
 _ELASTOMER_QUERY_AABB_MARGIN = 1e-3
 
 
-def _build_static_chunk_bvh(points: np.ndarray, global_rows: np.ndarray, leaf_size: int):
-    """Median-split AABB BVH over ``points`` (a single tracked link's local-frame point cloud).
-
-    Leaves carry the caller-provided ``global_rows`` (absolute rows into ``pc_pos_link``); the
-    kernel can therefore index directly into the per-class point-cloud tensors with no extra
-    indirection. Internal nodes use -1 for ``node_left`` / ``node_right``.
-    """
-    node_min: list[np.ndarray] = []
-    node_max: list[np.ndarray] = []
-    node_left: list[int] = []
-    node_right: list[int] = []
-    node_point_start: list[int] = []
-    node_point_n: list[int] = []
-    point_idx: list[int] = []
-
-    def _alloc() -> int:
-        i = len(node_min)
-        node_min.append(np.zeros(3, dtype=np.float32))
-        node_max.append(np.zeros(3, dtype=np.float32))
-        node_left.append(-1)
-        node_right.append(-1)
-        node_point_start.append(-1)
-        node_point_n.append(0)
-        return i
-
-    def _build(rows: np.ndarray, pts: np.ndarray) -> int:
-        nid = _alloc()
-        bmin = pts.min(axis=0).astype(np.float32)
-        bmax = pts.max(axis=0).astype(np.float32)
-        node_min[nid] = bmin
-        node_max[nid] = bmax
-        if rows.shape[0] <= leaf_size:
-            start = len(point_idx)
-            point_idx.extend(int(r) for r in rows)
-            node_point_start[nid] = start
-            node_point_n[nid] = int(rows.shape[0])
-            return nid
-        axis = int(np.argmax(bmax - bmin))
-        order = np.argsort(pts[:, axis], kind="stable")
-        mid = order.shape[0] // 2
-        node_left[nid] = _build(rows[order[:mid]], pts[order[:mid]])
-        node_right[nid] = _build(rows[order[mid:]], pts[order[mid:]])
-        return nid
-
-    if points.shape[0] == 0:
-        return (
-            np.zeros((0, 3), dtype=np.float32),
-            np.zeros((0, 3), dtype=np.float32),
-            np.zeros((0,), dtype=np.int32),
-            np.zeros((0,), dtype=np.int32),
-            np.zeros((0,), dtype=np.int32),
-            np.zeros((0,), dtype=np.int32),
-            np.zeros((0,), dtype=np.int32),
-        )
-
-    root = _build(global_rows.astype(np.int32, copy=False), points.astype(np.float32, copy=False))
-    assert root == 0
-    return (
-        np.stack(node_min, axis=0),
-        np.stack(node_max, axis=0),
-        np.asarray(node_left, dtype=np.int32),
-        np.asarray(node_right, dtype=np.int32),
-        np.asarray(node_point_start, dtype=np.int32),
-        np.asarray(node_point_n, dtype=np.int32),
-        np.asarray(point_idx, dtype=np.int32),
-    )
-
-
 @dataclass
-class PointCloudBVH:
-    """Static link-local BVH over the tracked point clouds of one sensor class.
-
-    One chunk per (sensor, tracked_link); each chunk is built once in that link's LOCAL frame at
-    scene-build time and never rebuilt. Queries must transform into a chunk's link-local frame at
-    query time. Chunk nodes are flat-packed across all chunks: chunk_node_start/n delimits each
-    chunk's contiguous run; node_left/right are ABSOLUTE indices into the flat node tensors (-1
-    for leaves). Leaves' point_idx values are absolute rows into pc_pos_link / pc_active_envs_mask
-    / pc_normal_link, so a leaf hit resolves to per-point data with one indirection.
+class PointCloudBVH(BVHMetadata):
+    """
+    BVH over the tracked point clouds of one sensor class. ``leaf_elem_idx`` entries are absolute
+    rows into ``pc_pos_link`` / ``pc_active_envs_mask`` / ``pc_normal_link`` so a leaf hit resolves
+    to per-point data with one indirection. See ``BVHMetadata`` for the shared scaffolding semantics.
     """
 
-    sensor_chunk_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-    sensor_chunk_n: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-
-    chunk_link_idx: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-    # Inverse of sensor_chunk_start/n: chunk_sensor_idx[i_c] is the owning sensor's index. Enables
-    # (env, chunk)-parallel kernels without redundant per-thread scans of sensor_chunk_start.
+    # Inverse of sensor_chunk_start/count: chunk_sensor_idx[i_c] is the owning sensor's index. Enables
+    # (env, chunk)-parallel kernels (e.g. ElastomerTaxel surface state) without rescanning sensor_chunk_start
+    # in every thread; ProximityTaxel parallelizes per-probe and does not consume this field.
     chunk_sensor_idx: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-    chunk_node_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-    chunk_node_n: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-
-    node_min: torch.Tensor = make_tensor_field((0, 3))
-    node_max: torch.Tensor = make_tensor_field((0, 3))
-    node_left: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-    node_right: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-    node_point_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-    node_point_n: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
-
-    point_idx: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
 
     def append_sensor(self, *, pc_start_row: int, idx_cat: torch.Tensor, pos_cat: torch.Tensor) -> None:
-        """Build per-tracked-link chunks for one sensor and append into the flat tensors.
-
-        Must be called immediately after extending ``pc_pos_link`` by ``pos_cat`` — each leaf's
-        ``point_idx`` entry is ``pc_start_row + local_row`` and must address the freshly-grown
-        rows.
+        """
+        Build per-tracked-link chunks for one sensor and append into the flat tensors. Must be called
+        immediately after extending ``pc_pos_link`` by ``pos_cat`` so each leaf's element index
+        (``pc_start_row + local_row``) addresses the freshly-grown rows.
         """
         n_local = int(pos_cat.shape[0])
         if n_local == 0:
@@ -293,40 +188,42 @@ def append_sensor(self, *, pc_start_row: int, idx_cat: torch.Tensor, pos_cat: to
 
         chunk_start_for_sensor = int(self.chunk_link_idx.shape[0])
         node_offset = int(self.node_min.shape[0])
-        point_offset = int(self.point_idx.shape[0])
+        point_offset = int(self.leaf_elem_idx.shape[0])
 
         new_chunk_link_idx: list[int] = []
         new_chunk_node_start: list[int] = []
-        new_chunk_node_n: list[int] = []
+        new_chunk_node_count: list[int] = []
         all_node_min: list[np.ndarray] = []
         all_node_max: list[np.ndarray] = []
         all_node_left: list[np.ndarray] = []
         all_node_right: list[np.ndarray] = []
-        all_node_point_start: list[np.ndarray] = []
-        all_node_point_n: list[np.ndarray] = []
-        all_point_idx: list[np.ndarray] = []
+        all_node_leaf_start: list[np.ndarray] = []
+        all_node_leaf_count: list[np.ndarray] = []
+        all_leaf_elem_idx: list[np.ndarray] = []
 
         for link_idx in unique_links:
             local_rows = np.nonzero(idx_np == int(link_idx))[0].astype(np.int32)
             global_rows = (int(pc_start_row) + local_rows).astype(np.int32)
             pts_link = pos_np[local_rows]
 
-            nmin, nmax, nleft, nright, npstart, npn, pidx = _build_static_chunk_bvh(
-                pts_link, global_rows, _POINT_CLOUD_BVH_LEAF_SIZE
+            # Point cloud: AABB per element is degenerate (the point itself), so pass the points as both
+            # centroids and the per-element min/max bounds.
+            nmin, nmax, nleft, nright, npstart, npn, pidx = build_static_chunk_bvh(
+                pts_link, pts_link, pts_link, global_rows, BVH_LEAF_SIZE
             )
 
             new_chunk_link_idx.append(int(link_idx))
             new_chunk_node_start.append(node_offset)
-            new_chunk_node_n.append(int(nmin.shape[0]))
+            new_chunk_node_count.append(int(nmin.shape[0]))
 
             all_node_min.append(nmin)
             all_node_max.append(nmax)
             # Rebase intra-chunk child / leaf-start indices into the flat tensors' absolute space.
             all_node_left.append(np.where(nleft >= 0, nleft + node_offset, nleft).astype(np.int32))
             all_node_right.append(np.where(nright >= 0, nright + node_offset, nright).astype(np.int32))
-            all_node_point_start.append(np.where(npn > 0, npstart + point_offset, npstart).astype(np.int32))
-            all_node_point_n.append(npn)
-            all_point_idx.append(pidx)
+            all_node_leaf_start.append(np.where(npn > 0, npstart + point_offset, npstart).astype(np.int32))
+            all_node_leaf_count.append(npn)
+            all_leaf_elem_idx.append(pidx)
 
             node_offset += int(nmin.shape[0])
             point_offset += int(pidx.shape[0])
@@ -335,12 +232,12 @@ def append_sensor(self, *, pc_start_row: int, idx_cat: torch.Tensor, pos_cat: to
         nx = torch.tensor(np.concatenate(all_node_max, axis=0), dtype=gs.tc_float, device=gs.device)
         nl = torch.tensor(np.concatenate(all_node_left, axis=0), dtype=gs.tc_int, device=gs.device)
         nr = torch.tensor(np.concatenate(all_node_right, axis=0), dtype=gs.tc_int, device=gs.device)
-        nps = torch.tensor(np.concatenate(all_node_point_start, axis=0), dtype=gs.tc_int, device=gs.device)
-        npn_t = torch.tensor(np.concatenate(all_node_point_n, axis=0), dtype=gs.tc_int, device=gs.device)
-        pidx_t = torch.tensor(np.concatenate(all_point_idx, axis=0), dtype=gs.tc_int, device=gs.device)
+        nps = torch.tensor(np.concatenate(all_node_leaf_start, axis=0), dtype=gs.tc_int, device=gs.device)
+        npn_t = torch.tensor(np.concatenate(all_node_leaf_count, axis=0), dtype=gs.tc_int, device=gs.device)
+        pidx_t = torch.tensor(np.concatenate(all_leaf_elem_idx, axis=0), dtype=gs.tc_int, device=gs.device)
         cli = torch.tensor(new_chunk_link_idx, dtype=gs.tc_int, device=gs.device)
         cns = torch.tensor(new_chunk_node_start, dtype=gs.tc_int, device=gs.device)
-        cnn = torch.tensor(new_chunk_node_n, dtype=gs.tc_int, device=gs.device)
+        cnn = torch.tensor(new_chunk_node_count, dtype=gs.tc_int, device=gs.device)
         # Sensor index for this batch of chunks = current sensor count (the entry we're about to add).
         sensor_idx_for_chunks = int(self.sensor_chunk_start.shape[0])
         csi = torch.full((len(unique_links),), sensor_idx_for_chunks, dtype=gs.tc_int, device=gs.device)
@@ -349,51 +246,15 @@ def append_sensor(self, *, pc_start_row: int, idx_cat: torch.Tensor, pos_cat: to
         self.node_max = concat_with_tensor(self.node_max, nx, expand=(nx.shape[0], 3))
         self.node_left = concat_with_tensor(self.node_left, nl, expand=(nl.shape[0],))
         self.node_right = concat_with_tensor(self.node_right, nr, expand=(nr.shape[0],))
-        self.node_point_start = concat_with_tensor(self.node_point_start, nps, expand=(nps.shape[0],))
-        self.node_point_n = concat_with_tensor(self.node_point_n, npn_t, expand=(npn_t.shape[0],))
-        self.point_idx = concat_with_tensor(self.point_idx, pidx_t, expand=(pidx_t.shape[0],))
+        self.node_leaf_start = concat_with_tensor(self.node_leaf_start, nps, expand=(nps.shape[0],))
+        self.node_leaf_count = concat_with_tensor(self.node_leaf_count, npn_t, expand=(npn_t.shape[0],))
+        self.leaf_elem_idx = concat_with_tensor(self.leaf_elem_idx, pidx_t, expand=(pidx_t.shape[0],))
         self.chunk_link_idx = concat_with_tensor(self.chunk_link_idx, cli, expand=(cli.shape[0],))
         self.chunk_sensor_idx = concat_with_tensor(self.chunk_sensor_idx, csi, expand=(csi.shape[0],))
         self.chunk_node_start = concat_with_tensor(self.chunk_node_start, cns, expand=(cns.shape[0],))
-        self.chunk_node_n = concat_with_tensor(self.chunk_node_n, cnn, expand=(cnn.shape[0],))
+        self.chunk_node_count = concat_with_tensor(self.chunk_node_count, cnn, expand=(cnn.shape[0],))
         self.sensor_chunk_start = concat_with_tensor(self.sensor_chunk_start, chunk_start_for_sensor, expand=(1,))
-        self.sensor_chunk_n = concat_with_tensor(self.sensor_chunk_n, len(unique_links), expand=(1,))
-
-
-@qd.func
-def _func_vec3_at(values: qd.types.ndarray(), i: int) -> qd.types.vector(3):
-    return qd.Vector([values[i, 0], values[i, 1], values[i, 2]], dt=float)
-
-
-@qd.func
-def _func_sphere_intersects_aabb(center, radius_sq, bmin, bmax):  # -> bool
-    """Squared-distance sphere-vs-AABB test: returns True iff the closest point of the AABB to
-    ``center`` is within ``radius_sq``. Used by ProximityTaxel BVH traversal."""
-    d_sq = gs.qd_float(0.0)
-    for k in qd.static(range(3)):
-        v = center[k]
-        lo = bmin[k]
-        hi = bmax[k]
-        if v < lo:
-            d = lo - v
-            d_sq = d_sq + d * d
-        elif v > hi:
-            d = v - hi
-            d_sq = d_sq + d * d
-    return d_sq <= radius_sq
-
-
-@qd.func
-def _func_aabb_intersects_aabb(amin, amax, bmin, bmax):  # -> bool
-    """Standard 6-axis AABB-vs-AABB overlap test. Used by ElastomerTaxel BVH traversal."""
-    return (
-        amin[0] <= bmax[0]
-        and amax[0] >= bmin[0]
-        and amin[1] <= bmax[1]
-        and amax[1] >= bmin[1]
-        and amin[2] <= bmax[2]
-        and amax[2] >= bmin[2]
-    )
+        self.sensor_chunk_count = concat_with_tensor(self.sensor_chunk_count, len(unique_links), expand=(1,))
 
 
 @qd.kernel
@@ -406,16 +267,16 @@ def _kernel_point_cloud_proximity_taxel_bvh(
     sensor_probe_start: qd.types.ndarray(),
     n_probes_per_sensor: qd.types.ndarray(),
     bvh_sensor_chunk_start: qd.types.ndarray(),
-    bvh_sensor_chunk_n: qd.types.ndarray(),
+    bvh_sensor_chunk_count: qd.types.ndarray(),
     bvh_chunk_link_idx: qd.types.ndarray(),
     bvh_chunk_node_start: qd.types.ndarray(),
     bvh_node_min: qd.types.ndarray(),
     bvh_node_max: qd.types.ndarray(),
     bvh_node_left: qd.types.ndarray(),
     bvh_node_right: qd.types.ndarray(),
-    bvh_node_point_start: qd.types.ndarray(),
-    bvh_node_point_n: qd.types.ndarray(),
-    bvh_point_idx: qd.types.ndarray(),
+    bvh_node_leaf_start: qd.types.ndarray(),
+    bvh_node_leaf_count: qd.types.ndarray(),
+    bvh_leaf_elem_idx: qd.types.ndarray(),
     pc_pos_link: qd.types.ndarray(),
     pc_active_envs_mask: qd.types.ndarray(),
     probe_radii: qd.types.ndarray(),
@@ -450,10 +311,10 @@ def _kernel_point_cloud_proximity_taxel_bvh(
         s_ang = links_state.cd_ang[sensor_link_idx, i_b]
         s_com = links_state.root_COM[sensor_link_idx, i_b]
 
-        probe_local = _func_vec3_at(probe_positions_local, i_p)
+        probe_local = func_vec3_at(probe_positions_local, i_p)
         probe_world = s_pos + gu.qd_transform_by_quat(probe_local, s_quat)
 
-        a_loc = _func_vec3_at(probe_local_normal, i_p)
+        a_loc = func_vec3_at(probe_local_normal, i_p)
         a_w = gu.qd_transform_by_quat(a_loc, s_quat)
         a_norm = qd.sqrt(a_w.dot(a_w)) + eps
         for j in qd.static(range(3)):
@@ -481,7 +342,7 @@ def _kernel_point_cloud_proximity_taxel_bvh(
         tau_w_m = qd.Vector.zero(gs.qd_float, 3)
 
         chunk_start = bvh_sensor_chunk_start[i_s]
-        n_chunks = bvh_sensor_chunk_n[i_s]
+        n_chunks = bvh_sensor_chunk_count[i_s]
         for c_off in range(n_chunks):
             i_c = chunk_start + c_off
             track_link_idx = bvh_chunk_link_idx[i_c]
@@ -493,26 +354,26 @@ def _kernel_point_cloud_proximity_taxel_bvh(
             # BVH nodes live in tracked-link local frame: bring the probe sphere center over.
             probe_link = gu.qd_inv_transform_by_trans_quat(probe_world, track_pos, track_quat)
 
-            stack = qd.Vector.zero(gs.qd_int, qd.static(_POINT_CLOUD_BVH_STACK_SIZE))
+            stack = qd.Vector.zero(gs.qd_int, qd.static(BVH_STACK_SIZE))
             stack[0] = bvh_chunk_node_start[i_c]
             stack_idx = 1
 
             while stack_idx > 0:
                 stack_idx -= 1
                 n = stack[stack_idx]
-                bmin = _func_vec3_at(bvh_node_min, n)
-                bmax = _func_vec3_at(bvh_node_max, n)
-                if not _func_sphere_intersects_aabb(probe_link, R_query_sq, bmin, bmax):
+                bmin = func_vec3_at(bvh_node_min, n)
+                bmax = func_vec3_at(bvh_node_max, n)
+                if not func_sphere_intersects_aabb(probe_link, R_query_sq, bmin, bmax):
                     continue
                 left = bvh_node_left[n]
                 if left == -1:
-                    pstart = bvh_node_point_start[n]
-                    pn = bvh_node_point_n[n]
+                    pstart = bvh_node_leaf_start[n]
+                    pn = bvh_node_leaf_count[n]
                     for j in range(pn):
-                        i_o = bvh_point_idx[pstart + j]
+                        i_o = bvh_leaf_elem_idx[pstart + j]
                         if not pc_active_envs_mask[i_o, i_b]:
                             continue
-                        pos_l = _func_vec3_at(pc_pos_link, i_o)
+                        pos_l = func_vec3_at(pc_pos_link, i_o)
                         d_link = pos_l - probe_link
                         dsq = d_link.dot(d_link)
                         dist = qd.sqrt(dsq)
@@ -672,7 +533,7 @@ def build(self):
             self._shared_metadata.sensor_pc_n, self._shared_metadata.pc_pos_link.shape[0] - pc_start_row, expand=(1,)
         )
 
-        # BVH growth follows pc_pos_link growth in lockstep: each leaf's point_idx is an absolute
+        # BVH growth follows pc_pos_link growth in lockstep: each leaf's leaf_elem_idx is an absolute
         # row into the just-grown pc_pos_link.
         self._shared_metadata.pc_bvh.append_sensor(
             pc_start_row=pc_start_row,
@@ -814,16 +675,16 @@ def _update_current_timestep_data(
             shared_metadata.sensor_probe_start,
             shared_metadata.n_probes_per_sensor,
             bvh.sensor_chunk_start,
-            bvh.sensor_chunk_n,
+            bvh.sensor_chunk_count,
             bvh.chunk_link_idx,
             bvh.chunk_node_start,
             bvh.node_min,
             bvh.node_max,
             bvh.node_left,
             bvh.node_right,
-            bvh.node_point_start,
-            bvh.node_point_n,
-            bvh.point_idx,
+            bvh.node_leaf_start,
+            bvh.node_leaf_count,
+            bvh.leaf_elem_idx,
             shared_metadata.pc_pos_link,
             shared_metadata.pc_active_envs_mask,
             shared_metadata.probe_radii,
@@ -846,8 +707,9 @@ def _draw_debug(self, context: "RasterizerContext"):
         self._draw_debug_probes(
             context,
             self._tactile_color_groups_fn(
-                lambda envs_idx: self._debug_probe_buffer_magnitudes(self._shared_metadata.taxel_signal_buf, envs_idx)
-                >= gs.EPS,
+                lambda envs_idx: (
+                    self._debug_probe_buffer_magnitudes(self._shared_metadata.taxel_signal_buf, envs_idx) >= gs.EPS
+                ),
             ),
         )
 
@@ -1035,7 +897,7 @@ def _kernel_elastomer_probe_depth(
         sensor_link_idx = links_idx[i_s]
         link_pos = links_state.pos[sensor_link_idx, i_b]
         link_quat = links_state.quat[sensor_link_idx, i_b]
-        probe_local = _func_vec3_at(probe_positions_local, i_p)
+        probe_local = func_vec3_at(probe_positions_local, i_p)
         probe_world = link_pos + gu.qd_transform_by_quat(probe_local, link_quat)
 
         min_sdf = _func_elastomer_min_sdf_over_active_geoms(
@@ -1095,8 +957,8 @@ def _kernel_elastomer_dilate_accumulate(
                 output[cache_start + _i_p * 3 + k, i_b] = gs.qd_float(0.0)
             continue
 
-        target_local = _func_vec3_at(probe_positions_local, i_p)
-        target_normal = _func_vec3_at(probe_local_normal, i_p)
+        target_local = func_vec3_at(probe_positions_local, i_p)
+        target_normal = func_vec3_at(probe_local_normal, i_p)
 
         acc = qd.Vector.zero(gs.qd_float, 3)
         for j in range(n_probes):
@@ -1105,7 +967,7 @@ def _kernel_elastomer_dilate_accumulate(
             if src_depth <= gs.qd_float(0.0):
                 continue
             contribution = _func_elastomer_direct_dilate_contribution(
-                _func_vec3_at(probe_positions_local, j_p),
+                func_vec3_at(probe_positions_local, j_p),
                 target_local,
                 target_normal,
                 src_depth,
@@ -1134,9 +996,9 @@ def _kernel_elastomer_surface_state_bvh(
     bvh_node_max: qd.types.ndarray(),
     bvh_node_left: qd.types.ndarray(),
     bvh_node_right: qd.types.ndarray(),
-    bvh_node_point_start: qd.types.ndarray(),
-    bvh_node_point_n: qd.types.ndarray(),
-    bvh_point_idx: qd.types.ndarray(),
+    bvh_node_leaf_start: qd.types.ndarray(),
+    bvh_node_leaf_count: qd.types.ndarray(),
+    bvh_leaf_elem_idx: qd.types.ndarray(),
     pc_pos_link: qd.types.ndarray(),
     pc_active_envs_mask: qd.types.ndarray(),
     sdf_enter: qd.types.ndarray(),
@@ -1221,28 +1083,28 @@ def _kernel_elastomer_surface_state_bvh(
         sensor_pos = links_state.pos[sensor_link_idx, i_b]
         sensor_quat = links_state.quat[sensor_link_idx, i_b]
 
-        stack = qd.Vector.zero(gs.qd_int, qd.static(_POINT_CLOUD_BVH_STACK_SIZE))
+        stack = qd.Vector.zero(gs.qd_int, qd.static(BVH_STACK_SIZE))
         stack[0] = bvh_chunk_node_start[i_c]
         stack_idx = 1
 
         while stack_idx > 0:
             stack_idx -= 1
             n = stack[stack_idx]
-            bmin = _func_vec3_at(bvh_node_min, n)
-            bmax = _func_vec3_at(bvh_node_max, n)
-            if not _func_aabb_intersects_aabb(bmin, bmax, qmin, qmax):
+            bmin = func_vec3_at(bvh_node_min, n)
+            bmax = func_vec3_at(bvh_node_max, n)
+            if not func_aabb_intersects_aabb(bmin, bmax, qmin, qmax):
                 continue
             left = bvh_node_left[n]
             if left == -1:
-                pstart = bvh_node_point_start[n]
-                pn = bvh_node_point_n[n]
+                pstart = bvh_node_leaf_start[n]
+                pn = bvh_node_leaf_count[n]
                 for j in range(pn):
-                    i_o = bvh_point_idx[pstart + j]
+                    i_o = bvh_leaf_elem_idx[pstart + j]
                     if not pc_active_envs_mask[i_o, i_b]:
                         continue
                     surface_candidate_buf[i_b, i_o] = True
 
-                    point_link = _func_vec3_at(pc_pos_link, i_o)
+                    point_link = func_vec3_at(pc_pos_link, i_o)
                     point_world = track_pos + gu.qd_transform_by_quat(point_link, track_quat)
                     point_sensor = gu.qd_inv_transform_by_trans_quat(point_world, sensor_pos, sensor_quat)
                     for k in qd.static(range(3)):
@@ -1289,23 +1151,24 @@ def _kernel_elastomer_shear_accumulate(
     sensor_cache_start: qd.types.ndarray(),
     sensor_probe_start: qd.types.ndarray(),
     sensor_pc_start: qd.types.ndarray(),
-    sensor_pc_n: qd.types.ndarray(),
     lambda_s: qd.types.ndarray(),
     shear_scale: qd.types.ndarray(),
     eps: float,
     surface_pos_sensor_buf: qd.types.ndarray(),
     surface_entry_pos_sensor_buf: qd.types.ndarray(),
     surface_depth_buf: qd.types.ndarray(),
-    surface_initialized_buf: qd.types.ndarray(),
+    shear_active_pc_idx: qd.types.ndarray(),
+    shear_active_pc_count: qd.types.ndarray(),
     output: qd.types.ndarray(),
 ):
-    """Target-major shear accumulator: per (env, target_probe), iterate over the sensor's pc rows
-    that are flagged ``surface_initialized`` and sum Gaussian contributions into a register, then
-    += the result into ``output``. No atomic_add (each (i_b, i_p) thread owns its output slot).
-
-    Must run after the surface-state kernel AND after the post-kernel ``surface_initialized_buf &= candidate``
-    cleanup that invalidates the flag for BVH-pruned points -- otherwise stale True flags from prior
-    steps would corrupt this step's accumulation.
+    """Target-major shear accumulator: per (env, target_probe), iterate over the sensor's compact
+    active surface-point index and sum Gaussian contributions into a register, then += the result
+    into ``output``. No atomic_add (each (i_b, i_p) thread owns its output slot).
+
+    Consumes the compact index produced by ``_build_shear_active_pc_index`` (must run after the
+    surface-state kernel AND after the post-kernel ``surface_initialized_buf &= candidate`` cleanup).
+    Inner-loop cost is O(active_count[i_b, i_s]) rather than O(sensor_pc_n[i_s]), so the kernel scales
+    with contact density rather than total point-cloud size.
     """
     total_n_probes = probe_positions_local.shape[0]
     n_batches = surface_pos_sensor_buf.shape[0]
@@ -1322,15 +1185,14 @@ def _kernel_elastomer_shear_accumulate(
         cache_start = sensor_cache_start[i_s]
         _i_p = i_p - sensor_probe_start[i_s]
         pc_start = sensor_pc_start[i_s]
-        pc_end = pc_start + sensor_pc_n[i_s]
+        n_active = shear_active_pc_count[i_b, i_s]
 
-        probe_local = _func_vec3_at(probe_positions_local, i_p)
-        probe_normal = _func_vec3_at(probe_local_normal, i_p)
+        probe_local = func_vec3_at(probe_positions_local, i_p)
+        probe_normal = func_vec3_at(probe_local_normal, i_p)
 
         acc = qd.Vector.zero(gs.qd_float, 3)
-        for i_o in range(pc_start, pc_end):
-            if not surface_initialized_buf[i_b, i_o]:
-                continue
+        for j in range(n_active):
+            i_o = shear_active_pc_idx[i_b, pc_start + j]
             depth = surface_depth_buf[i_b, i_o]
             if depth <= eps:
                 continue
@@ -1367,6 +1229,52 @@ def _kernel_elastomer_shear_accumulate(
             output[cache_start + _i_p * 3 + k, i_b] = output[cache_start + _i_p * 3 + k, i_b] + acc[k]
 
 
+def _build_shear_active_pc_index(
+    surface_initialized_buf: torch.Tensor,
+    sensor_pc_start: torch.Tensor,
+    sensor_pc_n: torch.Tensor,
+    shear_scale: torch.Tensor,
+    active_pc_idx: torch.Tensor,
+    active_pc_count: torch.Tensor,
+) -> None:
+    """Build the compact per-(env, sensor) active surface-point index consumed by
+    ``_kernel_elastomer_shear_accumulate``. Mutates ``active_pc_idx`` and ``active_pc_count`` in place.
+
+    For each sensor ``s`` with ``shear_scale[s] > 0``, gathers the indices of True entries in
+    ``surface_initialized_buf[:, pc_start[s] : pc_start[s] + pc_n[s]]`` into the per-sensor compact slice
+    ``active_pc_idx[:, pc_start[s] : pc_start[s] + active_count[:, s]]``; the per-(env, sensor) active
+    count is written to ``active_pc_count[:, s]``. Sensors with ``shear_scale == 0`` are skipped and
+    their count is left at zero so the kernel's outer early-exit handles them with no extra work.
+
+    Uses exclusive cumsum + ``torch.nonzero`` for the per-sensor scatter so per-env Python loops are
+    avoided; cost is ~O(B * total_n_surface) torch ops over the whole pass.
+    """
+    active_pc_count.zero_()
+    n_sensors = sensor_pc_start.shape[0]
+    if n_sensors == 0:
+        return
+    # Single host sync up front so the per-sensor loop is metadata-only on the Python side.
+    pc_starts = sensor_pc_start.tolist()
+    pc_ns = sensor_pc_n.tolist()
+    scales = shear_scale.tolist()
+    idx_dtype = active_pc_idx.dtype
+    for s in range(n_sensors):
+        if scales[s] <= 0.0:
+            continue
+        pc_start = int(pc_starts[s])
+        pc_n = int(pc_ns[s])
+        if pc_n == 0:
+            continue
+        mask = surface_initialized_buf[:, pc_start : pc_start + pc_n]  # (B, pc_n) bool
+        int_mask = mask.to(idx_dtype)
+        # Exclusive cumsum -> per-row write position within the sensor's compact slice.
+        write_pos = torch.cumsum(int_mask, dim=1) - int_mask
+        active_pc_count[:, s] = int_mask.sum(dim=1)
+        bs, js = torch.nonzero(mask, as_tuple=True)
+        if bs.numel() > 0:
+            active_pc_idx[bs, pc_start + write_pos[bs, js]] = (pc_start + js).to(idx_dtype)
+
+
 def _elastomer_taxel_grid_fft_dilate(
     grid_fft_meta: list[tuple],
     grid_fft_kernels_stacked: torch.Tensor,
@@ -1480,6 +1388,16 @@ class ElastomerTaxelSensorMetadata(
     # stale surface_initialized / surface_entry_pos for points the BVH skipped this step.
     surface_candidate_buf: torch.Tensor = make_tensor_field((0, 0), dtype_factory=lambda: gs.tc_bool)
 
+    # Compact per-(env, sensor) active surface-point index, rebuilt every step right after the
+    # ``surface_initialized_buf &= candidate`` cleanup and consumed by ``_kernel_elastomer_shear_accumulate``.
+    # For sensor ``s`` in env ``i_b``, the first ``shear_active_pc_count[i_b, s]`` entries of
+    # ``shear_active_pc_idx[i_b, sensor_pc_start[s]:]`` hold the global pc-row indices whose
+    # ``surface_initialized_buf`` is True. Drops the per-probe inner-loop cost from O(sensor_pc_n[s]) to
+    # O(active_count[i_b, s]), so shear accumulate now scales with contact density rather than total point-cloud
+    # size. Sensors with ``shear_scale == 0`` have count = 0 (the kernel skips them anyway).
+    shear_active_pc_idx: torch.Tensor = make_tensor_field((0, 0), dtype_factory=lambda: gs.tc_int)
+    shear_active_pc_count: torch.Tensor = make_tensor_field((0, 0), dtype_factory=lambda: gs.tc_int)
+
     # Per-sensor flag selecting the FFT dilation path vs the direct (non-grid) dilation kernel.
     use_grid_fft: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_bool)
     # Per-grid-FFT-sensor tangent basis, consumed by the dilation write-back. ``grid_fft_meta`` tuples for this
@@ -1619,6 +1537,16 @@ def build(self):
             (B, total_n_surface), dtype=gs.tc_bool, device=gs.device
         )
 
+        # Compact active-point index for the shear accumulator. Re-allocated on each ElastomerTaxel build so the
+        # ``(B, total_n_surface)`` idx buffer and ``(B, n_sensors)`` count buffer absorb the newly registered sensor.
+        # Both are allocated unconditionally (zero-init); the per-step build at ``_build_shear_active_pc_index``
+        # leaves entries for non-shear sensors at count == 0, so unread regions remain harmless zeros.
+        n_sensors_built = self._shared_metadata.n_probes_per_sensor.shape[0]
+        self._shared_metadata.shear_active_pc_idx = torch.zeros((B, total_n_surface), dtype=gs.tc_int, device=gs.device)
+        self._shared_metadata.shear_active_pc_count = torch.zeros(
+            (B, n_sensors_built), dtype=gs.tc_int, device=gs.device
+        )
+
         self._shared_metadata.use_grid_fft = concat_with_tensor(
             self._shared_metadata.use_grid_fft, self._use_grid_fft, expand=(1,)
         )
@@ -1786,9 +1714,9 @@ def _update_current_timestep_data(
                 bvh.node_max,
                 bvh.node_left,
                 bvh.node_right,
-                bvh.node_point_start,
-                bvh.node_point_n,
-                bvh.point_idx,
+                bvh.node_leaf_start,
+                bvh.node_leaf_count,
+                bvh.leaf_elem_idx,
                 shared_metadata.pc_pos_link,
                 shared_metadata.pc_active_envs_mask,
                 shared_metadata.elastomer_contact_sdf_enter,
@@ -1806,12 +1734,21 @@ def _update_current_timestep_data(
             )
             # Invalidate stale surface state for points the BVH did not visit. surface_initialized
             # and entry-pos survive across steps; depth/pos are gated by initialized downstream so
-            # they don't need clearing. The shear accumulator below gates on surface_initialized_buf
-            # -- without this step, stale True from a prior step would corrupt accumulation.
+            # they don't need clearing. The shear accumulator below reads from a compact index
+            # rebuilt from surface_initialized -- without this step, stale True from a prior step
+            # would inject phantom contributions.
             cand = shared_metadata.surface_candidate_buf
             shared_metadata.surface_initialized_buf &= cand
             # Implicit bool→float broadcast zeros entries where cand=False, no `~` allocation.
             shared_metadata.surface_entry_pos_sensor_buf.mul_(cand.unsqueeze(-1))
+            _build_shear_active_pc_index(
+                shared_metadata.surface_initialized_buf,
+                shared_metadata.sensor_pc_start,
+                shared_metadata.sensor_pc_n,
+                shared_metadata.shear_scale,
+                shared_metadata.shear_active_pc_idx,
+                shared_metadata.shear_active_pc_count,
+            )
             _kernel_elastomer_shear_accumulate(
                 shared_metadata.probe_positions,
                 shared_metadata.probe_local_normal,
@@ -1820,14 +1757,14 @@ def _update_current_timestep_data(
                 shared_metadata.sensor_cache_start,
                 shared_metadata.sensor_probe_start,
                 shared_metadata.sensor_pc_start,
-                shared_metadata.sensor_pc_n,
                 shared_metadata.lambda_s,
                 shared_metadata.shear_scale,
                 gs.EPS,
                 shared_metadata.surface_pos_sensor_buf,
                 shared_metadata.surface_entry_pos_sensor_buf,
                 shared_metadata.surface_depth_buf,
-                shared_metadata.surface_initialized_buf,
+                shared_metadata.shear_active_pc_idx,
+                shared_metadata.shear_active_pc_count,
                 current_ground_truth_data_T,
             )
 
diff --git a/genesis/engine/sensors/surface_distance_probe.py b/genesis/engine/sensors/surface_distance_probe.py
index 66eaa4ad5f..53abca82d1 100644
--- a/genesis/engine/sensors/surface_distance_probe.py
+++ b/genesis/engine/sensors/surface_distance_probe.py
@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING
 
 import numpy as np
@@ -8,10 +8,8 @@
 import genesis as gs
 import genesis.utils.array_class as array_class
 import genesis.utils.geom as gu
-from genesis.engine.solvers.rigid.abd.forward_kinematics import func_update_all_verts
 from genesis.options.sensors import SurfaceDistanceProbe as SurfaceDistanceProbeOptions
 from genesis.utils.misc import concat_with_tensor, make_tensor_field, tensor_to_array
-from genesis.utils.raycast_qd import get_triangle_vertices
 
 from .base_sensor import RigidSensorMetadataMixin, RigidSensorMixin, SimpleSensor, SimpleSensorMetadata
 from .probe import (
@@ -20,6 +18,15 @@
     func_noised_probe_radius,
     get_measured_bufs,
 )
+from .tactile_shared import (
+    BVH_LEAF_SIZE,
+    BVH_STACK_SIZE,
+    BVHMetadata,
+    build_static_chunk_bvh,
+    func_sphere_intersects_aabb,
+    func_vec3_at,
+    get_mesh_geom_chunks,
+)
 
 if TYPE_CHECKING:
     from genesis.utils.ring_buffer import TensorRingBuffer
@@ -88,8 +95,125 @@ def _func_closest_point_on_triangle(point: gs.qd_vec3, v0: gs.qd_vec3, v1: gs.qd
     return closest
 
 
+@dataclass
+class TriangleMeshBVH(BVHMetadata):
+    """
+    BVH over tracked mesh triangles for one sensor class. ``leaf_elem_idx`` entries are absolute rows
+    into ``tri_verts``, a flat per-class table of link-local triangle vertices (shape ``(total_n_tri,
+    3, 3)``: per triangle, three xyz vertex positions). See ``BVHMetadata`` for the shared scaffolding
+    semantics. Rigid-link assumption: built once at scene init, never rebuilt.
+    """
+
+    tri_verts: torch.Tensor = make_tensor_field((0, 3, 3))
+
+    def append_sensor(self, track_link_idx: np.ndarray, solver) -> None:
+        """
+        Build per-tracked-link chunks for one sensor (link-local triangle BVH) and append into the flat
+        tensors. Sensors with no tracked-link geometry register zero chunks; the kernel's per-sensor
+        chunk loop iterates ``[0, sensor_chunk_count[i_s])`` and is a no-op for those.
+        """
+        new_chunk_link_idx: list[int] = []
+        new_chunk_node_start: list[int] = []
+        new_chunk_node_count: list[int] = []
+        all_node_min: list[np.ndarray] = []
+        all_node_max: list[np.ndarray] = []
+        all_node_left: list[np.ndarray] = []
+        all_node_right: list[np.ndarray] = []
+        all_node_leaf_start: list[np.ndarray] = []
+        all_node_leaf_count: list[np.ndarray] = []
+        all_leaf_elem_idx: list[np.ndarray] = []
+        all_tri_verts: list[np.ndarray] = []
+
+        chunk_start_for_sensor = int(self.chunk_link_idx.shape[0])
+        node_offset = int(self.node_min.shape[0])
+        leaf_offset = int(self.leaf_elem_idx.shape[0])
+        tri_offset = int(self.tri_verts.shape[0])
+
+        for i_l in range(int(track_link_idx.shape[0])):
+            link_idx = int(track_link_idx[i_l])
+            link = solver.links[link_idx]
+            geom_chunks = get_mesh_geom_chunks(link, prefer_visual=False)
+            if not geom_chunks:
+                continue
+            # Concatenate triangles from all geoms of this link into one chunk.
+            tri_v0_list: list[np.ndarray] = []
+            tri_v1_list: list[np.ndarray] = []
+            tri_v2_list: list[np.ndarray] = []
+            for _geom, verts_link, faces in geom_chunks:
+                tri_v0_list.append(verts_link[faces[:, 0]])
+                tri_v1_list.append(verts_link[faces[:, 1]])
+                tri_v2_list.append(verts_link[faces[:, 2]])
+            v0 = np.concatenate(tri_v0_list, axis=0).astype(np.float32, copy=False)
+            v1 = np.concatenate(tri_v1_list, axis=0).astype(np.float32, copy=False)
+            v2 = np.concatenate(tri_v2_list, axis=0).astype(np.float32, copy=False)
+            n_tri = int(v0.shape[0])
+            if n_tri == 0:
+                continue
+
+            centroids = (v0 + v1 + v2) / 3.0
+            aabb_mins = np.minimum(np.minimum(v0, v1), v2)
+            aabb_maxs = np.maximum(np.maximum(v0, v1), v2)
+
+            tri_stack = np.stack((v0, v1, v2), axis=1)  # (n_tri, 3, 3)
+            global_rows = (tri_offset + np.arange(n_tri, dtype=np.int32)).astype(np.int32)
+
+            nmin, nmax, nleft, nright, lstart, lcount, eidx = build_static_chunk_bvh(
+                centroids, aabb_mins, aabb_maxs, global_rows, BVH_LEAF_SIZE
+            )
+
+            new_chunk_link_idx.append(link_idx)
+            new_chunk_node_start.append(node_offset)
+            new_chunk_node_count.append(int(nmin.shape[0]))
+
+            all_node_min.append(nmin)
+            all_node_max.append(nmax)
+            # Rebase intra-chunk child / leaf-start indices into the flat tensors' absolute space.
+            all_node_left.append(np.where(nleft >= 0, nleft + node_offset, nleft).astype(np.int32))
+            all_node_right.append(np.where(nright >= 0, nright + node_offset, nright).astype(np.int32))
+            all_node_leaf_start.append(np.where(lcount > 0, lstart + leaf_offset, lstart).astype(np.int32))
+            all_node_leaf_count.append(lcount)
+            all_leaf_elem_idx.append(eidx)
+            all_tri_verts.append(tri_stack.astype(np.float32, copy=False))
+
+            node_offset += int(nmin.shape[0])
+            leaf_offset += int(eidx.shape[0])
+            tri_offset += n_tri
+
+        if not new_chunk_link_idx:
+            # No tracked links contributed geometry; record zero chunks for this sensor.
+            self.sensor_chunk_start = concat_with_tensor(self.sensor_chunk_start, chunk_start_for_sensor, expand=(1,))
+            self.sensor_chunk_count = concat_with_tensor(self.sensor_chunk_count, 0, expand=(1,))
+            return
+
+        nm = torch.tensor(np.concatenate(all_node_min, axis=0), dtype=gs.tc_float, device=gs.device)
+        nx = torch.tensor(np.concatenate(all_node_max, axis=0), dtype=gs.tc_float, device=gs.device)
+        nl = torch.tensor(np.concatenate(all_node_left, axis=0), dtype=gs.tc_int, device=gs.device)
+        nr = torch.tensor(np.concatenate(all_node_right, axis=0), dtype=gs.tc_int, device=gs.device)
+        lst = torch.tensor(np.concatenate(all_node_leaf_start, axis=0), dtype=gs.tc_int, device=gs.device)
+        lct = torch.tensor(np.concatenate(all_node_leaf_count, axis=0), dtype=gs.tc_int, device=gs.device)
+        eidx_t = torch.tensor(np.concatenate(all_leaf_elem_idx, axis=0), dtype=gs.tc_int, device=gs.device)
+        tv = torch.tensor(np.concatenate(all_tri_verts, axis=0), dtype=gs.tc_float, device=gs.device)
+        cli = torch.tensor(new_chunk_link_idx, dtype=gs.tc_int, device=gs.device)
+        cns = torch.tensor(new_chunk_node_start, dtype=gs.tc_int, device=gs.device)
+        cnc = torch.tensor(new_chunk_node_count, dtype=gs.tc_int, device=gs.device)
+
+        self.node_min = concat_with_tensor(self.node_min, nm, expand=(nm.shape[0], 3))
+        self.node_max = concat_with_tensor(self.node_max, nx, expand=(nx.shape[0], 3))
+        self.node_left = concat_with_tensor(self.node_left, nl, expand=(nl.shape[0],))
+        self.node_right = concat_with_tensor(self.node_right, nr, expand=(nr.shape[0],))
+        self.node_leaf_start = concat_with_tensor(self.node_leaf_start, lst, expand=(lst.shape[0],))
+        self.node_leaf_count = concat_with_tensor(self.node_leaf_count, lct, expand=(lct.shape[0],))
+        self.leaf_elem_idx = concat_with_tensor(self.leaf_elem_idx, eidx_t, expand=(eidx_t.shape[0],))
+        self.tri_verts = concat_with_tensor(self.tri_verts, tv, expand=(tv.shape[0], 3, 3))
+        self.chunk_link_idx = concat_with_tensor(self.chunk_link_idx, cli, expand=(cli.shape[0],))
+        self.chunk_node_start = concat_with_tensor(self.chunk_node_start, cns, expand=(cns.shape[0],))
+        self.chunk_node_count = concat_with_tensor(self.chunk_node_count, cnc, expand=(cnc.shape[0],))
+        self.sensor_chunk_start = concat_with_tensor(self.sensor_chunk_start, chunk_start_for_sensor, expand=(1,))
+        self.sensor_chunk_count = concat_with_tensor(self.sensor_chunk_count, len(new_chunk_link_idx), expand=(1,))
+
+
 @qd.kernel
-def _kernel_surface_distance_probe(
+def _kernel_surface_distance_probe_bvh(
     probe_positions_local: qd.types.ndarray(),
     probe_radii: qd.types.ndarray(),
     probe_radii_noise: qd.types.ndarray(),
@@ -97,44 +221,48 @@ def _kernel_surface_distance_probe(
     links_idx: qd.types.ndarray(),
     sensor_cache_start: qd.types.ndarray(),
     sensor_probe_start: qd.types.ndarray(),
-    track_link_start: qd.types.ndarray(),
-    track_link_end: qd.types.ndarray(),
-    track_link_flat: qd.types.ndarray(),
-    static_rigid_sim_config: qd.template(),
+    bvh_sensor_chunk_start: qd.types.ndarray(),
+    bvh_sensor_chunk_count: qd.types.ndarray(),
+    bvh_chunk_link_idx: qd.types.ndarray(),
+    bvh_chunk_node_start: qd.types.ndarray(),
+    bvh_node_min: qd.types.ndarray(),
+    bvh_node_max: qd.types.ndarray(),
+    bvh_node_left: qd.types.ndarray(),
+    bvh_node_right: qd.types.ndarray(),
+    bvh_node_leaf_start: qd.types.ndarray(),
+    bvh_node_leaf_count: qd.types.ndarray(),
+    bvh_leaf_elem_idx: qd.types.ndarray(),
+    bvh_tri_verts: qd.types.ndarray(),
     links_state: array_class.LinksState,
-    links_info: array_class.LinksInfo,
-    geoms_info: array_class.GeomsInfo,
-    geoms_state: array_class.GeomsState,
-    faces_info: array_class.FacesInfo,
-    verts_info: array_class.VertsInfo,
-    fixed_verts_state: array_class.VertsState,
-    free_verts_state: array_class.VertsState,
     positions_gt: qd.types.ndarray(),
     positions_measured: qd.types.ndarray(),
     output_gt: qd.types.ndarray(),
     output_measured: qd.types.ndarray(),
 ):
+    """
+    BVH-accelerated surface-distance query.
+
+    Per ``(probe, env)``: transform the probe into each tracked-link's local frame, traverse the
+    per-(sensor, tracked-link) static BVH with a fixed-depth stack, cull nodes via sphere-vs-AABB with
+    radius squared = current best (the larger of GT / measured branch), and on leaf nodes call
+    closest-point-on-triangle against the stored link-local vertices. The closest world-frame point is
+    written to ``positions_*`` and the distance to ``output_*``.
+    """
     total_n_probes = probe_positions_local.shape[0]
     n_batches = output_gt.shape[-1]
 
-    func_update_all_verts(
-        geoms_state, geoms_info, verts_info, free_verts_state, fixed_verts_state, static_rigid_sim_config
-    )
-
     for i_p, i_b in qd.ndrange(total_n_probes, n_batches):
         i_s = probe_sensor_idx[i_p]
         sensor_link_idx = links_idx[i_s]
         link_pos = links_state.pos[sensor_link_idx, i_b]
         link_quat = links_state.quat[sensor_link_idx, i_b]
 
-        probe_pos_local = qd.Vector(
-            [probe_positions_local[i_p, 0], probe_positions_local[i_p, 1], probe_positions_local[i_p, 2]]
-        )
-        probe_pos = link_pos + gu.qd_transform_by_quat(probe_pos_local, link_quat)
+        probe_local = func_vec3_at(probe_positions_local, i_p)
+        probe_world = link_pos + gu.qd_transform_by_quat(probe_local, link_quat)
 
         max_r_gt = probe_radii[i_p]
         best_dist_sq_gt = max_r_gt * max_r_gt
-        best_point_gt = probe_pos
+        best_point_gt = probe_world
 
         probe_radius_noise = probe_radii_noise[i_p]
         use_noised_radius = probe_radius_noise > gs.EPS
@@ -142,37 +270,67 @@ def _kernel_surface_distance_probe(
         if use_noised_radius:
             max_r_m = func_noised_probe_radius(max_r_gt, probe_radius_noise)
         best_dist_sq_m = max_r_m * max_r_m
-        best_point_m = probe_pos
-
-        start = track_link_start[i_s]
-        end = track_link_end[i_s]
-
-        for k in range(start, end):
-            i_l = track_link_flat[k]
-            I_l = [i_l, i_b] if qd.static(static_rigid_sim_config.batch_links_info) else i_l
-            geom_start = links_info.geom_start[I_l]
-            geom_end = links_info.geom_end[I_l]
-
-            for i_g in range(geom_start, geom_end):
-                face_start = geoms_info.face_start[i_g]
-                face_end = geoms_info.face_end[i_g]
-
-                for i_f in range(face_start, face_end):
-                    tri_verts = get_triangle_vertices(
-                        i_f, i_b, faces_info, verts_info, fixed_verts_state, free_verts_state
-                    )
-                    v0 = tri_verts[:, 0]
-                    v1 = tri_verts[:, 1]
-                    v2 = tri_verts[:, 2]
-                    closest = _func_closest_point_on_triangle(probe_pos, v0, v1, v2)
-                    diff = closest - probe_pos
-                    dist_sq = diff.dot(diff)
-                    if dist_sq < best_dist_sq_gt:
-                        best_dist_sq_gt = dist_sq
-                        best_point_gt = closest
-                    if use_noised_radius and dist_sq < best_dist_sq_m:
-                        best_dist_sq_m = dist_sq
-                        best_point_m = closest
+        best_point_m = probe_world
+
+        chunk_start = bvh_sensor_chunk_start[i_s]
+        n_chunks = bvh_sensor_chunk_count[i_s]
+        for c_off in range(n_chunks):
+            i_c = chunk_start + c_off
+            track_link_idx = bvh_chunk_link_idx[i_c]
+            track_pos = links_state.pos[track_link_idx, i_b]
+            track_quat = links_state.quat[track_link_idx, i_b]
+            # BVH lives in the tracked link's local frame; bring the probe over.
+            probe_link = gu.qd_inv_transform_by_trans_quat(probe_world, track_pos, track_quat)
+
+            stack = qd.Vector.zero(gs.qd_int, qd.static(BVH_STACK_SIZE))
+            stack[0] = bvh_chunk_node_start[i_c]
+            stack_idx = 1
+
+            while stack_idx > 0:
+                stack_idx -= 1
+                n = stack[stack_idx]
+                bmin = func_vec3_at(bvh_node_min, n)
+                bmax = func_vec3_at(bvh_node_max, n)
+                # Cull when min distance from probe to AABB exceeds the conservative current best.
+                cull_radius_sq = qd.max(best_dist_sq_gt, best_dist_sq_m)
+                if not func_sphere_intersects_aabb(probe_link, cull_radius_sq, bmin, bmax):
+                    continue
+                left = bvh_node_left[n]
+                if left == -1:
+                    fstart = bvh_node_leaf_start[n]
+                    fn = bvh_node_leaf_count[n]
+                    for j in range(fn):
+                        i_f = bvh_leaf_elem_idx[fstart + j]
+                        v0 = qd.Vector(
+                            [bvh_tri_verts[i_f, 0, 0], bvh_tri_verts[i_f, 0, 1], bvh_tri_verts[i_f, 0, 2]],
+                            dt=gs.qd_float,
+                        )
+                        v1 = qd.Vector(
+                            [bvh_tri_verts[i_f, 1, 0], bvh_tri_verts[i_f, 1, 1], bvh_tri_verts[i_f, 1, 2]],
+                            dt=gs.qd_float,
+                        )
+                        v2 = qd.Vector(
+                            [bvh_tri_verts[i_f, 2, 0], bvh_tri_verts[i_f, 2, 1], bvh_tri_verts[i_f, 2, 2]],
+                            dt=gs.qd_float,
+                        )
+                        closest_link = _func_closest_point_on_triangle(probe_link, v0, v1, v2)
+                        diff = closest_link - probe_link
+                        dist_sq = diff.dot(diff)
+                        if dist_sq < best_dist_sq_gt or (use_noised_radius and dist_sq < best_dist_sq_m):
+                            # Transform the hit back to world frame and record on whichever branch tightened.
+                            closest_world = track_pos + gu.qd_transform_by_quat(closest_link, track_quat)
+                            if dist_sq < best_dist_sq_gt:
+                                best_dist_sq_gt = dist_sq
+                                best_point_gt = closest_world
+                            if use_noised_radius and dist_sq < best_dist_sq_m:
+                                best_dist_sq_m = dist_sq
+                                best_point_m = closest_world
+                else:
+                    right = bvh_node_right[n]
+                    stack[stack_idx] = left
+                    stack_idx += 1
+                    stack[stack_idx] = right
+                    stack_idx += 1
 
         best_dist_gt = qd.sqrt(best_dist_sq_gt)
         best_dist_m = best_dist_gt
@@ -194,13 +352,17 @@ def _kernel_surface_distance_probe(
 
 @dataclass
 class SurfaceDistanceProbeSensorMetadataMixin(ProbeSensorMetadataMixin):
-    """Shared metadata for surface distance probe sensors: tracked links and nearest-point buffer."""
+    """
+    Shared metadata for surface distance probe sensors: tracked-link bookkeeping, nearest-point buffer,
+    and the per-class static triangle-mesh BVH consumed by ``_kernel_surface_distance_probe_bvh``.
+    """
 
     track_link_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
     track_link_end: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
     track_link_flat: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
     nearest_positions: torch.Tensor = make_tensor_field((0, 0, 3))
     nearest_positions_measured: torch.Tensor = make_tensor_field((0, 0, 3))
+    bvh: TriangleMeshBVH = field(default_factory=TriangleMeshBVH)
 
 
 @dataclass
@@ -258,6 +420,10 @@ def build(self):
         slice_start = self._shared_metadata.sensor_probe_start[self._idx]
         self._nearest_points_slice = slice(slice_start, slice_start + self._n_probes)
 
+        # Build the per-(sensor, tracked-link) triangle BVH in link-local frame. Rigid links don't deform,
+        # so this is a one-shot scene-build cost; per-step queries traverse the static structure.
+        self._shared_metadata.bvh.append_sensor(track_link_idx, self._shared_metadata.solver)
+
     @classmethod
     def reset(cls, shared_metadata: SurfaceDistanceProbeMetadata, shared_ground_truth_cache: torch.Tensor, envs_idx):
         super().reset(shared_metadata, shared_ground_truth_cache, envs_idx)
@@ -278,7 +444,8 @@ def _update_current_timestep_data(
         measured, measured_cols_b = get_measured_bufs(
             shared_metadata, current_ground_truth_data_T, measured_data_timeline
         )
-        _kernel_surface_distance_probe(
+        bvh = shared_metadata.bvh
+        _kernel_surface_distance_probe_bvh(
             shared_metadata.probe_positions,
             shared_metadata.probe_radii,
             shared_metadata.probe_radii_noise,
@@ -286,18 +453,19 @@ def _update_current_timestep_data(
             shared_metadata.links_idx,
             shared_metadata.sensor_cache_start,
             shared_metadata.sensor_probe_start,
-            shared_metadata.track_link_start,
-            shared_metadata.track_link_end,
-            shared_metadata.track_link_flat,
-            solver._static_rigid_sim_config,
+            bvh.sensor_chunk_start,
+            bvh.sensor_chunk_count,
+            bvh.chunk_link_idx,
+            bvh.chunk_node_start,
+            bvh.node_min,
+            bvh.node_max,
+            bvh.node_left,
+            bvh.node_right,
+            bvh.node_leaf_start,
+            bvh.node_leaf_count,
+            bvh.leaf_elem_idx,
+            bvh.tri_verts,
             solver.links_state,
-            solver.links_info,
-            solver.geoms_info,
-            solver.geoms_state,
-            solver.faces_info,
-            solver.verts_info,
-            solver.fixed_verts_state,
-            solver.free_verts_state,
             shared_metadata.nearest_positions,
             shared_metadata.nearest_positions_measured,
             current_ground_truth_data_T,
diff --git a/genesis/engine/sensors/tactile_shared.py b/genesis/engine/sensors/tactile_shared.py
index df430a8ac4..3c8866fa50 100644
--- a/genesis/engine/sensors/tactile_shared.py
+++ b/genesis/engine/sensors/tactile_shared.py
@@ -3,9 +3,11 @@
 from typing import TYPE_CHECKING, Callable, Generic, TypeVar
 
 import numpy as np
+import quadrants as qd
 import torch
 
 import genesis as gs
+import genesis.utils.geom as gu
 from genesis.utils.misc import concat_with_tensor, make_tensor_field
 
 if TYPE_CHECKING:
@@ -16,7 +18,9 @@
 
 
 def next_pow2(n: int) -> int:
-    """Smallest power of 2 >= ``n`` (1 if ``n == 0``)."""
+    """
+    Smallest power of 2 >= ``n`` (1 if ``n == 0``).
+    """
     if n <= 1:
         return 1
     p = 1
@@ -25,6 +29,202 @@ def next_pow2(n: int) -> int:
     return p
 
 
+# ============================ BVH helpers (shared by point-cloud and triangle-mesh sensors) ============================
+
+
+BVH_LEAF_SIZE = 8
+BVH_STACK_SIZE = 32
+
+
+def get_mesh_geom_chunks(link, prefer_visual: bool) -> list[tuple[object, np.ndarray, np.ndarray]]:
+    """
+    Return per-geom mesh chunks ``(geom, verts_link, faces)`` in link-local frame.
+
+    ``prefer_visual`` picks vgeoms over geoms when both exist; falls back to the other type when the
+    preferred one is absent. Empty meshes are dropped from the list.
+    """
+    if prefer_visual:
+        geoms = list(link.vgeoms) if link.vgeoms else list(link.geoms)
+        use_vverts = bool(link.vgeoms)
+    else:
+        geoms = list(link.geoms) if link.geoms else list(link.vgeoms)
+        use_vverts = not bool(link.geoms) and bool(link.vgeoms)
+
+    chunks: list[tuple[object, np.ndarray, np.ndarray]] = []
+    for geom in geoms:
+        if use_vverts:
+            verts = np.asarray(geom.init_vverts, dtype=np.float32)
+            faces = np.asarray(geom.init_vfaces, dtype=np.int32)
+        else:
+            verts = np.asarray(geom.init_verts, dtype=np.float32)
+            faces = np.asarray(geom.init_faces, dtype=np.int32)
+        if verts.size == 0 or faces.size == 0:
+            continue
+        verts_link = gu.transform_by_trans_quat(verts, geom.init_pos, geom.init_quat)
+        chunks.append((geom, verts_link.astype(np.float32, copy=False), np.asarray(faces, dtype=np.int32)))
+    return chunks
+
+
+def build_static_chunk_bvh(
+    centroids: np.ndarray,
+    aabb_mins: np.ndarray,
+    aabb_maxs: np.ndarray,
+    global_rows: np.ndarray,
+    leaf_size: int,
+) -> tuple[np.ndarray, ...]:
+    """
+    Median-split AABB BVH over a static set of elements (points, triangles, etc.) in link-local frame.
+
+    Split decisions use ``centroids`` along the longest-spread axis; node AABBs union the per-element
+    ``aabb_mins``/``aabb_maxs``. For point-cloud BVHs, callers pass ``centroids == aabb_mins == aabb_maxs``
+    (the points themselves); for triangle BVHs, callers pass per-triangle centroid + min/max bounds.
+
+    Leaves carry the caller-provided ``global_rows`` (absolute rows into the sensor-class element table);
+    the kernel indexes directly into that table with no extra indirection. Internal nodes use -1 for
+    ``node_left`` / ``node_right``. Returns ``(node_min, node_max, node_left, node_right, node_elem_start,
+    node_elem_n, elem_idx)``.
+    """
+    node_min: list[np.ndarray] = []
+    node_max: list[np.ndarray] = []
+    node_left: list[int] = []
+    node_right: list[int] = []
+    node_elem_start: list[int] = []
+    node_elem_n: list[int] = []
+    elem_idx: list[int] = []
+
+    def _alloc() -> int:
+        i = len(node_min)
+        node_min.append(np.zeros(3, dtype=np.float32))
+        node_max.append(np.zeros(3, dtype=np.float32))
+        node_left.append(-1)
+        node_right.append(-1)
+        node_elem_start.append(-1)
+        node_elem_n.append(0)
+        return i
+
+    def _build(rows: np.ndarray, cents: np.ndarray, a_mins: np.ndarray, a_maxs: np.ndarray) -> int:
+        nid = _alloc()
+        bmin = a_mins.min(axis=0).astype(np.float32)
+        bmax = a_maxs.max(axis=0).astype(np.float32)
+        node_min[nid] = bmin
+        node_max[nid] = bmax
+        if rows.shape[0] <= leaf_size:
+            start = len(elem_idx)
+            elem_idx.extend(int(r) for r in rows)
+            node_elem_start[nid] = start
+            node_elem_n[nid] = int(rows.shape[0])
+            return nid
+        axis = int(np.argmax(bmax - bmin))
+        order = np.argsort(cents[:, axis], kind="stable")
+        mid = order.shape[0] // 2
+        node_left[nid] = _build(rows[order[:mid]], cents[order[:mid]], a_mins[order[:mid]], a_maxs[order[:mid]])
+        node_right[nid] = _build(rows[order[mid:]], cents[order[mid:]], a_mins[order[mid:]], a_maxs[order[mid:]])
+        return nid
+
+    if centroids.shape[0] == 0:
+        return (
+            np.zeros((0, 3), dtype=np.float32),
+            np.zeros((0, 3), dtype=np.float32),
+            np.zeros((0,), dtype=np.int32),
+            np.zeros((0,), dtype=np.int32),
+            np.zeros((0,), dtype=np.int32),
+            np.zeros((0,), dtype=np.int32),
+            np.zeros((0,), dtype=np.int32),
+        )
+
+    root = _build(
+        global_rows.astype(np.int32, copy=False),
+        centroids.astype(np.float32, copy=False),
+        aabb_mins.astype(np.float32, copy=False),
+        aabb_maxs.astype(np.float32, copy=False),
+    )
+    assert root == 0
+    return (
+        np.stack(node_min, axis=0),
+        np.stack(node_max, axis=0),
+        np.asarray(node_left, dtype=np.int32),
+        np.asarray(node_right, dtype=np.int32),
+        np.asarray(node_elem_start, dtype=np.int32),
+        np.asarray(node_elem_n, dtype=np.int32),
+        np.asarray(elem_idx, dtype=np.int32),
+    )
+
+
+@qd.func
+def func_vec3_at(values: qd.types.ndarray(), i: int) -> qd.types.vector(3):
+    return qd.Vector([values[i, 0], values[i, 1], values[i, 2]], dt=float)
+
+
+@qd.func
+def func_sphere_intersects_aabb(center, radius_sq, bmin, bmax):  # -> bool
+    """
+    Squared-distance sphere-vs-AABB test: True iff the closest AABB point to ``center`` is within
+    ``radius_sq``. Reused as a closest-point cull by passing ``radius_sq = current_best_dist_sq``.
+    """
+    d_sq = gs.qd_float(0.0)
+    for k in qd.static(range(3)):
+        v = center[k]
+        lo = bmin[k]
+        hi = bmax[k]
+        if v < lo:
+            d = lo - v
+            d_sq = d_sq + d * d
+        elif v > hi:
+            d = v - hi
+            d_sq = d_sq + d * d
+    return d_sq <= radius_sq
+
+
+@qd.func
+def func_aabb_intersects_aabb(amin, amax, bmin, bmax):  # -> bool
+    """
+    Standard 6-axis AABB-vs-AABB overlap test.
+    """
+    return (
+        amin[0] <= bmax[0]
+        and amax[0] >= bmin[0]
+        and amin[1] <= bmax[1]
+        and amax[1] >= bmin[1]
+        and amin[2] <= bmax[2]
+        and amax[2] >= bmin[2]
+    )
+
+
+@dataclass
+class BVHMetadata:
+    """
+    Element-agnostic scaffolding for a static, link-local, chunked AABB BVH shared across one sensor class.
+
+    One *chunk* per (sensor, tracked_link): each chunk is a small subtree built once at scene init in the
+    tracked link's local frame and never rebuilt. Subclasses (PointCloudBVH, TriangleMeshBVH) layer on
+    element-specific payload tables; ``leaf_elem_idx`` entries are absolute rows into those tables.
+
+    Per-sensor slice into the chunk arrays:
+        ``chunks[sensor_chunk_start[s] : sensor_chunk_start[s] + sensor_chunk_count[s]]``
+    Per-chunk slice into the flat node arrays:
+        ``nodes[chunk_node_start[c] : chunk_node_start[c] + chunk_node_count[c]]``
+    Per-leaf slice into ``leaf_elem_idx``:
+        ``leaf_elem_idx[node_leaf_start[n] : node_leaf_start[n] + node_leaf_count[n]]``
+    ``node_left == -1`` marks a leaf; otherwise ``node_left``/``node_right`` are absolute child indices.
+    """
+
+    sensor_chunk_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+    sensor_chunk_count: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+
+    chunk_link_idx: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+    chunk_node_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+    chunk_node_count: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+
+    node_min: torch.Tensor = make_tensor_field((0, 3))
+    node_max: torch.Tensor = make_tensor_field((0, 3))
+    node_left: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+    node_right: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+
+    node_leaf_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+    node_leaf_count: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+    leaf_elem_idx: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
+
+
 # ============================ FFT helpers ============================
 
 
@@ -213,6 +413,26 @@ def normalize_grid_probe_layout(
     )
 
 
+# ============================ Contact prefilter ============================
+
+
+@dataclass
+class ContactPrefilterMetadataMixin:
+    """
+    Per-(env, sensor) compact contact list shared by tactile sensors whose kernels iterate the collider's
+    contact list per probe (KinematicTaxel, ContactDepthProbe). Populated each step by
+    ``_kernel_build_sensor_contact_idx`` in ``kinematic_tactile.py``; the main kernel's per-probe loop
+    then touches only contacts whose ``link_a`` or ``link_b`` matches the sensor's tracked link.
+
+    Shape: ``sensor_contacts_idx (B, n_sensors, max_per_sensor)``, ``sensor_n_contacts (B, n_sensors)``.
+    The per-sensor cap is held by the consuming class (see ``_MAX_CONTACTS_PER_SENSOR``); the kernel
+    reads it off ``sensor_contacts_idx.shape[2]``.
+    """
+
+    sensor_contacts_idx: torch.Tensor = make_tensor_field((0, 0, 0), dtype_factory=lambda: gs.tc_int)
+    sensor_n_contacts: torch.Tensor = make_tensor_field((0, 0), dtype_factory=lambda: gs.tc_int)
+
+
 # ============================ ViscoelasticHysteresis ============================
 
 

From 106f72009376f00cf6c85c64b891b5bd382ad005 Mon Sep 17 00:00:00 2001
From: Trinity Chung <trinityjchung@gmail.com>
Date: Wed, 27 May 2026 23:45:33 -0400
Subject: [PATCH 6/7] add option to switch raycast or sdf for contact depth
 query

---
 genesis/engine/sensors/kinematic_tactile.py   | 655 ++++++++++++++++--
 genesis/engine/sensors/point_cloud_tactile.py | 549 +++++++++++++--
 genesis/engine/sensors/raycaster.py           | 120 ++--
 genesis/engine/sensors/tactile_shared.py      |  25 +
 genesis/options/sensors/options.py            |   8 +
 genesis/options/sensors/tactile.py            |  24 +-
 genesis/utils/raycast_qd.py                   |  74 +-
 tests/test_sensors.py                         |  88 +++
 8 files changed, 1379 insertions(+), 164 deletions(-)

diff --git a/genesis/engine/sensors/kinematic_tactile.py b/genesis/engine/sensors/kinematic_tactile.py
index 41ea478b9b..456573eeb7 100644
--- a/genesis/engine/sensors/kinematic_tactile.py
+++ b/genesis/engine/sensors/kinematic_tactile.py
@@ -11,11 +11,18 @@
 import genesis.utils.array_class as array_class
 import genesis.utils.geom as gu
 import genesis.utils.sdf as sdf
+from genesis.engine.bvh import STACK_SIZE as _BVH_STACK_SIZE
 from genesis.engine.solvers.rigid.collider.utils import func_point_in_geom_aabb
 from genesis.options.sensors import ContactDepthProbe as ContactDepthProbeOptions
 from genesis.options.sensors import ContactProbe as ContactProbeOptions
 from genesis.options.sensors import KinematicTaxel as KinematicTaxelOptions
 from genesis.utils.misc import concat_with_tensor, make_tensor_field, tensor_to_array
+from genesis.utils.raycast_qd import (
+    closest_point_on_triangle,
+    get_triangle_vertices,
+    ray_triangle_intersection,
+    triangle_face_normal,
+)
 
 from .base_sensor import RigidSensorMetadataMixin, RigidSensorMixin, SimpleSensor, SimpleSensorMetadata
 from .probe import (
@@ -28,9 +35,11 @@
     get_measured_bufs,
 )
 from .tactile_shared import (
+    ContactDepthQueryMetadataMixin,
     ContactPrefilterMetadataMixin,
     ViscoelasticHysteresisMetadataMixin,
     ViscoelasticHysteresisMixin,
+    func_sphere_intersects_aabb,
     normalize_grid_probe_layout,
 )
 
@@ -464,17 +473,483 @@ def _kernel_contact_depth_probe(
         output_measured[cache_idx, i_b] = max_penetration_m
 
 
+# ============================ Raycast / BVH contact-depth path ============================
+
+
+@qd.kernel
+def _kernel_build_sensor_candidate_geom_mask(
+    sensor_contacts_idx: qd.types.ndarray(),
+    sensor_n_contacts: qd.types.ndarray(),
+    collider_state: array_class.ColliderState,
+    sensor_candidate_geom_mask: qd.types.ndarray(),
+):
+    """
+    Scatter the per-(env, sensor) candidate-geom bitmask from the prefiltered contact list. Run only when the
+    sensor class is in ``contact_depth_query="raycast"`` mode; the BVH leaf loop consults this mask to skip
+    triangles whose owning geom isn't in the sensor's current contact list.
+    """
+    n_batches = sensor_n_contacts.shape[0]
+    n_sensors = sensor_n_contacts.shape[1]
+    n_geoms = sensor_candidate_geom_mask.shape[2]
+    for i_b, i_s in qd.ndrange(n_batches, n_sensors):
+        for i_g in range(n_geoms):
+            sensor_candidate_geom_mask[i_b, i_s, i_g] = False
+        n_c = sensor_n_contacts[i_b, i_s]
+        for k in range(n_c):
+            i_c = sensor_contacts_idx[i_b, i_s, k]
+            sensor_candidate_geom_mask[i_b, i_s, collider_state.contact_data.geom_a[i_c, i_b]] = True
+            sensor_candidate_geom_mask[i_b, i_s, collider_state.contact_data.geom_b[i_c, i_b]] = True
+
+
+@qd.func
+def _func_query_contact_depth_penetration_bvh(
+    i_b: int,
+    i_s: int,
+    probe_pos: qd.types.vector(3),
+    probe_normal: qd.types.vector(3),
+    probe_radius_gt: float,
+    probe_radius_m: float,
+    bvh_nodes: qd.template(),
+    bvh_morton_codes: qd.template(),
+    faces_info: array_class.FacesInfo,
+    verts_info: array_class.VertsInfo,
+    fixed_verts_state: array_class.VertsState,
+    free_verts_state: array_class.VertsState,
+    sensor_candidate_geom_mask: qd.types.ndarray(),
+    eps: float,
+):
+    """
+    BVH-based dual-radius probe penetration: at each candidate leaf, combines a sphere/triangle closest-point
+    test (``pen = R - dist``) with a raycast along ``-probe_normal`` (``pen = R - hit_distance``). Max wins per
+    triangle and across all visited triangles. Mirrors ``_func_query_contact_depth_penetration``'s return shape
+    so callers can dispatch without further branching.
+    """
+    n_triangles = faces_info.verts_idx.shape[0]
+    R_query = qd.max(probe_radius_gt, probe_radius_m)
+    R_query_sq = R_query * R_query
+    neg_normal = -probe_normal
+
+    max_pen_gt = gs.qd_float(0.0)
+    max_pen_m = gs.qd_float(0.0)
+
+    node_stack = qd.Vector.zero(gs.qd_int, qd.static(_BVH_STACK_SIZE))
+    node_stack[0] = 0
+    stack_idx = 1
+
+    while stack_idx > 0:
+        stack_idx -= 1
+        node_idx = node_stack[stack_idx]
+        node = bvh_nodes[i_b, node_idx]
+
+        if not func_sphere_intersects_aabb(probe_pos, R_query_sq, node.bound.min, node.bound.max):
+            continue
+
+        if node.left == -1:
+            sorted_leaf_idx = node_idx - (n_triangles - 1)
+            i_f = qd.cast(bvh_morton_codes[i_b, sorted_leaf_idx][1], gs.qd_int)
+            i_g = faces_info.geom_idx[i_f]
+            if not sensor_candidate_geom_mask[i_b, i_s, i_g]:
+                continue
+
+            tri = get_triangle_vertices(i_f, i_b, faces_info, verts_info, fixed_verts_state, free_verts_state)
+            v0 = tri[:, 0]
+            v1 = tri[:, 1]
+            v2 = tri[:, 2]
+
+            closest = closest_point_on_triangle(probe_pos, v0, v1, v2)
+            diff = closest - probe_pos
+            dist = qd.sqrt(diff.dot(diff))
+            pen_gt = probe_radius_gt - dist
+            if pen_gt > max_pen_gt:
+                max_pen_gt = pen_gt
+            pen_m = probe_radius_m - dist
+            if pen_m > max_pen_m:
+                max_pen_m = pen_m
+
+            hit = ray_triangle_intersection(probe_pos, neg_normal, v0, v1, v2, eps)
+            if hit.w > 0.5:
+                if hit.x <= probe_radius_gt:
+                    pen_ray_gt = probe_radius_gt - hit.x
+                    if pen_ray_gt > max_pen_gt:
+                        max_pen_gt = pen_ray_gt
+                if hit.x <= probe_radius_m:
+                    pen_ray_m = probe_radius_m - hit.x
+                    if pen_ray_m > max_pen_m:
+                        max_pen_m = pen_ray_m
+        else:
+            if stack_idx < qd.static(_BVH_STACK_SIZE - 2):
+                node_stack[stack_idx] = node.left
+                node_stack[stack_idx + 1] = node.right
+                stack_idx += 2
+
+    return max_pen_gt, max_pen_m
+
+
+@qd.func
+def _func_query_contact_depth_bvh(
+    i_b: int,
+    i_s: int,
+    probe_pos: qd.types.vector(3),
+    probe_normal: qd.types.vector(3),
+    probe_radius_gt: float,
+    probe_radius_m: float,
+    bvh_nodes: qd.template(),
+    bvh_morton_codes: qd.template(),
+    faces_info: array_class.FacesInfo,
+    verts_info: array_class.VertsInfo,
+    fixed_verts_state: array_class.VertsState,
+    free_verts_state: array_class.VertsState,
+    geoms_info: array_class.GeomsInfo,
+    sensor_candidate_geom_mask: qd.types.ndarray(),
+    eps: float,
+):
+    """
+    BVH-based dual-radius probe query with contact normal and link, mirroring ``_func_query_contact_depth``'s return.
+    The normal returned is the triangle face normal of the deepest-penetrating triangle (raycast or sphere subtest).
+    """
+    n_triangles = faces_info.verts_idx.shape[0]
+    R_query = qd.max(probe_radius_gt, probe_radius_m)
+    R_query_sq = R_query * R_query
+    radius_gt_sq = probe_radius_gt * probe_radius_gt
+    radius_m_sq = probe_radius_m * probe_radius_m
+    neg_normal = -probe_normal
+
+    max_pen_gt = gs.qd_float(0.0)
+    contact_link_gt = gs.qd_int(-1)
+    contact_normal_gt = qd.Vector.zero(gs.qd_float, 3)
+    max_pen_m = gs.qd_float(0.0)
+    contact_link_m = gs.qd_int(-1)
+    contact_normal_m = qd.Vector.zero(gs.qd_float, 3)
+
+    node_stack = qd.Vector.zero(gs.qd_int, qd.static(_BVH_STACK_SIZE))
+    node_stack[0] = 0
+    stack_idx = 1
+
+    while stack_idx > 0:
+        stack_idx -= 1
+        node_idx = node_stack[stack_idx]
+        node = bvh_nodes[i_b, node_idx]
+
+        if not func_sphere_intersects_aabb(probe_pos, R_query_sq, node.bound.min, node.bound.max):
+            continue
+
+        if node.left == -1:
+            sorted_leaf_idx = node_idx - (n_triangles - 1)
+            i_f = qd.cast(bvh_morton_codes[i_b, sorted_leaf_idx][1], gs.qd_int)
+            i_g = faces_info.geom_idx[i_f]
+            if not sensor_candidate_geom_mask[i_b, i_s, i_g]:
+                continue
+
+            link_for_geom = geoms_info.link_idx[i_g]
+
+            tri = get_triangle_vertices(i_f, i_b, faces_info, verts_info, fixed_verts_state, free_verts_state)
+            v0 = tri[:, 0]
+            v1 = tri[:, 1]
+            v2 = tri[:, 2]
+
+            face_normal = triangle_face_normal(v0, v1, v2)
+
+            closest = closest_point_on_triangle(probe_pos, v0, v1, v2)
+            diff = closest - probe_pos
+            d_sq = diff.dot(diff)
+            if d_sq <= R_query_sq:
+                pen_along_normal = diff.dot(neg_normal)
+                if pen_along_normal > 0.0:
+                    if d_sq <= radius_gt_sq and pen_along_normal > max_pen_gt:
+                        max_pen_gt = pen_along_normal
+                        contact_link_gt = link_for_geom
+                        contact_normal_gt = face_normal
+                    if d_sq <= radius_m_sq and pen_along_normal > max_pen_m:
+                        max_pen_m = pen_along_normal
+                        contact_link_m = link_for_geom
+                        contact_normal_m = face_normal
+
+            hit = ray_triangle_intersection(probe_pos, neg_normal, v0, v1, v2, eps)
+            if hit.w > 0.5:
+                if hit.x <= probe_radius_gt:
+                    pen_ray_gt = probe_radius_gt - hit.x
+                    if pen_ray_gt > max_pen_gt:
+                        max_pen_gt = pen_ray_gt
+                        contact_link_gt = link_for_geom
+                        contact_normal_gt = face_normal
+                if hit.x <= probe_radius_m:
+                    pen_ray_m = probe_radius_m - hit.x
+                    if pen_ray_m > max_pen_m:
+                        max_pen_m = pen_ray_m
+                        contact_link_m = link_for_geom
+                        contact_normal_m = face_normal
+        else:
+            if stack_idx < qd.static(_BVH_STACK_SIZE - 2):
+                node_stack[stack_idx] = node.left
+                node_stack[stack_idx + 1] = node.right
+                stack_idx += 2
+
+    return max_pen_gt, contact_link_gt, contact_normal_gt, max_pen_m, contact_link_m, contact_normal_m
+
+
+@qd.kernel(fastcache=False)
+def _kernel_contact_depth_probe_bvh(
+    probe_positions_local: qd.types.ndarray(),
+    probe_local_normal: qd.types.ndarray(),
+    probe_sensor_idx: qd.types.ndarray(),
+    probe_radii: qd.types.ndarray(),
+    probe_radii_noise: qd.types.ndarray(),
+    probe_gains: qd.types.ndarray(),
+    links_idx: qd.types.ndarray(),
+    sensor_cache_start: qd.types.ndarray(),
+    sensor_probe_start: qd.types.ndarray(),
+    sensor_candidate_geom_mask: qd.types.ndarray(),
+    bvh_nodes: qd.template(),
+    bvh_morton_codes: qd.template(),
+    links_state: array_class.LinksState,
+    faces_info: array_class.FacesInfo,
+    verts_info: array_class.VertsInfo,
+    fixed_verts_state: array_class.VertsState,
+    free_verts_state: array_class.VertsState,
+    output_gt: qd.types.ndarray(),
+    output_measured: qd.types.ndarray(),
+):
+    total_n_probes = probe_positions_local.shape[0]
+    n_batches = output_gt.shape[-1]
+
+    for i_p, i_b in qd.ndrange(total_n_probes, n_batches):
+        i_s = probe_sensor_idx[i_p]
+
+        if probe_radii[i_p] <= gs.qd_float(0.0):
+            cache_idx = sensor_cache_start[i_s] + i_p - sensor_probe_start[i_s]
+            output_gt[cache_idx, i_b] = gs.qd_float(0.0)
+            output_measured[cache_idx, i_b] = gs.qd_float(0.0)
+            continue
+
+        probe_pos_local = qd.Vector(
+            [probe_positions_local[i_p, 0], probe_positions_local[i_p, 1], probe_positions_local[i_p, 2]]
+        )
+        probe_normal_local = qd.Vector(
+            [probe_local_normal[i_p, 0], probe_local_normal[i_p, 1], probe_local_normal[i_p, 2]]
+        )
+
+        sensor_link_idx = links_idx[i_s]
+        link_pos = links_state.pos[sensor_link_idx, i_b]
+        link_quat = links_state.quat[sensor_link_idx, i_b]
+
+        probe_pos = link_pos + gu.qd_transform_by_quat(probe_pos_local, link_quat)
+        probe_normal = gu.qd_transform_by_quat(probe_normal_local, link_quat)
+
+        probe_radius = probe_radii[i_p]
+        probe_radius_noise = probe_radii_noise[i_p]
+        probe_radius_m = (
+            func_noised_probe_radius(probe_radius, probe_radius_noise) if probe_radius_noise > gs.EPS else probe_radius
+        )
+
+        max_penetration_gt, max_penetration_m = _func_query_contact_depth_penetration_bvh(
+            i_b,
+            i_s,
+            probe_pos,
+            probe_normal,
+            probe_radius,
+            probe_radius_m,
+            bvh_nodes,
+            bvh_morton_codes,
+            faces_info,
+            verts_info,
+            fixed_verts_state,
+            free_verts_state,
+            sensor_candidate_geom_mask,
+            gs.EPS,
+        )
+        max_penetration_m = max_penetration_m * probe_gains[i_b, i_p]
+        cache_idx = sensor_cache_start[i_s] + i_p - sensor_probe_start[i_s]
+        output_gt[cache_idx, i_b] = max_penetration_gt
+        output_measured[cache_idx, i_b] = max_penetration_m
+
+
+@qd.kernel(fastcache=False)
+def _kernel_kinematic_taxel_bvh(
+    probe_positions_local: qd.types.ndarray(),
+    probe_local_normal: qd.types.ndarray(),
+    probe_sensor_idx: qd.types.ndarray(),
+    probe_radii: qd.types.ndarray(),
+    probe_radii_noise: qd.types.ndarray(),
+    probe_gains: qd.types.ndarray(),
+    normal_stiffness: qd.types.ndarray(),
+    normal_damping: qd.types.ndarray(),
+    normal_exponent: qd.types.ndarray(),
+    shear_scalar: qd.types.ndarray(),
+    twist_scalar: qd.types.ndarray(),
+    links_idx: qd.types.ndarray(),
+    sensor_cache_start: qd.types.ndarray(),
+    sensor_probe_start: qd.types.ndarray(),
+    n_probes_per_sensor: qd.types.ndarray(),
+    sensor_candidate_geom_mask: qd.types.ndarray(),
+    bvh_nodes: qd.template(),
+    bvh_morton_codes: qd.template(),
+    links_state: array_class.LinksState,
+    faces_info: array_class.FacesInfo,
+    verts_info: array_class.VertsInfo,
+    fixed_verts_state: array_class.VertsState,
+    free_verts_state: array_class.VertsState,
+    geoms_info: array_class.GeomsInfo,
+    measured_equals_gt: int,
+    output_gt: qd.types.ndarray(),
+    output_measured: qd.types.ndarray(),
+):
+    total_n_probes = probe_positions_local.shape[0]
+    n_batches = output_gt.shape[-1]
+
+    for i_p, i_b in qd.ndrange(total_n_probes, n_batches):
+        i_s = probe_sensor_idx[i_p]
+        probe_idx_in_sensor = i_p - sensor_probe_start[i_s]
+        cache_start = sensor_cache_start[i_s]
+        n_probes = n_probes_per_sensor[i_s]
+        force_start = cache_start + probe_idx_in_sensor * 3
+        torque_start = cache_start + n_probes * 3 + probe_idx_in_sensor * 3
+
+        if probe_radii[i_p] <= gs.qd_float(0.0):
+            for j in qd.static(range(3)):
+                output_gt[force_start + j, i_b] = gs.qd_float(0.0)
+                output_gt[torque_start + j, i_b] = gs.qd_float(0.0)
+                output_measured[force_start + j, i_b] = gs.qd_float(0.0)
+                output_measured[torque_start + j, i_b] = gs.qd_float(0.0)
+            continue
+
+        probe_pos_local = qd.Vector(
+            [probe_positions_local[i_p, 0], probe_positions_local[i_p, 1], probe_positions_local[i_p, 2]]
+        )
+        probe_normal_local = qd.Vector(
+            [probe_local_normal[i_p, 0], probe_local_normal[i_p, 1], probe_local_normal[i_p, 2]]
+        )
+
+        sensor_link_idx = links_idx[i_s]
+        link_pos = links_state.pos[sensor_link_idx, i_b]
+        link_quat = links_state.quat[sensor_link_idx, i_b]
+
+        probe_pos = link_pos + gu.qd_transform_by_quat(probe_pos_local, link_quat)
+        probe_normal = gu.qd_transform_by_quat(probe_normal_local, link_quat)
+
+        probe_radius = probe_radii[i_p]
+        probe_radius_noise = probe_radii_noise[i_p]
+        use_noised_radius = probe_radius_noise > gs.EPS
+        probe_radius_m = (
+            func_noised_probe_radius(probe_radius, probe_radius_noise) if use_noised_radius else probe_radius
+        )
+
+        (
+            max_penetration_gt,
+            contact_link_gt,
+            contact_normal_gt,
+            max_penetration_m,
+            contact_link_m,
+            contact_normal_m,
+        ) = _func_query_contact_depth_bvh(
+            i_b,
+            i_s,
+            probe_pos,
+            probe_normal,
+            probe_radius,
+            probe_radius_m,
+            bvh_nodes,
+            bvh_morton_codes,
+            faces_info,
+            verts_info,
+            fixed_verts_state,
+            free_verts_state,
+            geoms_info,
+            sensor_candidate_geom_mask,
+            gs.EPS,
+        )
+
+        gained_pen_m = max_penetration_m * probe_gains[i_b, i_p]
+
+        force_gt, torque_gt = _func_kinematic_spring_damper(
+            i_b,
+            max_penetration_gt,
+            contact_link_gt,
+            contact_normal_gt,
+            sensor_link_idx,
+            probe_pos,
+            probe_pos_local,
+            link_quat,
+            normal_stiffness[i_s],
+            normal_damping[i_s],
+            normal_exponent[i_s],
+            shear_scalar[i_s],
+            twist_scalar[i_s],
+            links_state,
+        )
+        for j in qd.static(range(3)):
+            output_gt[force_start + j, i_b] = force_gt[j]
+            output_gt[torque_start + j, i_b] = torque_gt[j]
+
+        if measured_equals_gt == 1:
+            for j in qd.static(range(3)):
+                output_measured[force_start + j, i_b] = force_gt[j]
+                output_measured[torque_start + j, i_b] = torque_gt[j]
+        else:
+            force_m, torque_m = _func_kinematic_spring_damper(
+                i_b,
+                gained_pen_m,
+                contact_link_m,
+                contact_normal_m,
+                sensor_link_idx,
+                probe_pos,
+                probe_pos_local,
+                link_quat,
+                normal_stiffness[i_s],
+                normal_damping[i_s],
+                normal_exponent[i_s],
+                shear_scalar[i_s],
+                twist_scalar[i_s],
+                links_state,
+            )
+            for j in qd.static(range(3)):
+                output_measured[force_start + j, i_b] = force_m[j]
+                output_measured[torque_start + j, i_b] = torque_m[j]
+
+
+def _resolve_query_mode(shared_metadata) -> str:
+    """Resolve ``shared_metadata.contact_depth_query`` to ``"sdf"`` or ``"raycast"``. ``None`` defaults to ``"sdf"``
+    and is latched in so subsequent calls short-circuit."""
+    mode = shared_metadata.contact_depth_query
+    if mode is None:
+        mode = "sdf"
+        shared_metadata.contact_depth_query = mode
+    return mode
+
+
+def _ensure_candidate_geom_mask(shared_metadata, B: int, n_sensors: int, n_geoms: int) -> None:
+    """(Re)allocate ``sensor_candidate_geom_mask`` to ``(B, n_sensors, n_geoms)`` if its current shape doesn't match.
+    Sized lazily because the resolved mode may flip to ``"raycast"`` only after all sensors of this class have built."""
+    target = (B, n_sensors, n_geoms)
+    current = tuple(shared_metadata.sensor_candidate_geom_mask.shape)
+    if current != target:
+        shared_metadata.sensor_candidate_geom_mask = torch.zeros(target, dtype=gs.tc_bool, device=gs.device)
+
+
 class KinematicTactileSensorMixin(ProbeSensorMixin[ProbesWithNormalSensorSharedMetadataT]):
     def build(self):
         super().build()
+        # SDF activation is the default fallback; raycast mode does not require it but the call is idempotent and
+        # cheap so we keep it unconditional to avoid needing a build-finalize hook.
         self._shared_metadata.solver.collider.activate_sdf()
 
+        # Last-value-wins propagation of the contact_depth_query option onto the per-sensor-type shared metadata. If
+        # any sensor of this class opts into "raycast", ensure the per-sim shared collision BVH list exists so it's
+        # ready by the time _update_current_timestep_data runs.
+        mode = self._options.contact_depth_query
+        if mode is not None:
+            self._shared_metadata.contact_depth_query = mode
+        if self._shared_metadata.contact_depth_query == "raycast":
+            from genesis.engine.sensors.raycaster import ensure_solver_bvhs
+
+            self._shared_metadata.collision_bvh = ensure_solver_bvhs(self._manager._sim)
+
 
 @dataclass
 class ContactDepthProbeMetadata(
     ViscoelasticHysteresisMetadataMixin,
-    ProbeSensorMetadataMixin,
+    ProbesWithNormalSensorMetadataMixin,
     ContactPrefilterMetadataMixin,
+    ContactDepthQueryMetadataMixin,
     RigidSensorMetadataMixin,
     SimpleSensorMetadata,
 ):
@@ -484,6 +959,7 @@ class ContactDepthProbeMetadata(
 class ContactDepthProbeSensor(
     ViscoelasticHysteresisMixin[ContactDepthProbeMetadata],
     KinematicTactileSensorMixin[ContactDepthProbeMetadata],
+    ProbesWithNormalSensorMixin[ContactDepthProbeMetadata],
     RigidSensorMixin[ContactDepthProbeMetadata],
     SimpleSensor[ContactDepthProbeOptions, ContactDepthProbeMetadata, tuple],
 ):
@@ -526,25 +1002,61 @@ def _update_current_timestep_data(
             shared_metadata.sensor_contacts_idx,
             shared_metadata.sensor_n_contacts,
         )
-        _kernel_contact_depth_probe(
-            shared_metadata.probe_positions,
-            shared_metadata.probe_sensor_idx,
-            shared_metadata.probe_radii,
-            shared_metadata.probe_radii_noise,
-            shared_metadata.probe_gains,
-            shared_metadata.links_idx,
-            shared_metadata.sensor_cache_start,
-            shared_metadata.sensor_probe_start,
-            shared_metadata.sensor_contacts_idx,
-            shared_metadata.sensor_n_contacts,
-            solver.collider._collider_state,
-            solver.links_state,
-            solver.geoms_state,
-            solver.geoms_info,
-            solver.collider._sdf._sdf_info,
-            current_ground_truth_data_T,
-            measured_cols_b,
-        )
+
+        if _resolve_query_mode(shared_metadata) == "sdf":
+            _kernel_contact_depth_probe(
+                shared_metadata.probe_positions,
+                shared_metadata.probe_sensor_idx,
+                shared_metadata.probe_radii,
+                shared_metadata.probe_radii_noise,
+                shared_metadata.probe_gains,
+                shared_metadata.links_idx,
+                shared_metadata.sensor_cache_start,
+                shared_metadata.sensor_probe_start,
+                shared_metadata.sensor_contacts_idx,
+                shared_metadata.sensor_n_contacts,
+                solver.collider._collider_state,
+                solver.links_state,
+                solver.geoms_state,
+                solver.geoms_info,
+                solver.collider._sdf._sdf_info,
+                current_ground_truth_data_T,
+                measured_cols_b,
+            )
+        else:
+            from genesis.engine.sensors.raycaster import update_solver_bvhs
+
+            B, n_sensors = shared_metadata.sensor_n_contacts.shape
+            _ensure_candidate_geom_mask(shared_metadata, B, n_sensors, solver.n_geoms)
+            _kernel_build_sensor_candidate_geom_mask(
+                shared_metadata.sensor_contacts_idx,
+                shared_metadata.sensor_n_contacts,
+                solver.collider._collider_state,
+                shared_metadata.sensor_candidate_geom_mask,
+            )
+            update_solver_bvhs(shared_metadata.collision_bvh)
+            rigid_entry = next(e for e in shared_metadata.collision_bvh if e.raycast_mask is None)
+            _kernel_contact_depth_probe_bvh(
+                shared_metadata.probe_positions,
+                shared_metadata.probe_local_normal,
+                shared_metadata.probe_sensor_idx,
+                shared_metadata.probe_radii,
+                shared_metadata.probe_radii_noise,
+                shared_metadata.probe_gains,
+                shared_metadata.links_idx,
+                shared_metadata.sensor_cache_start,
+                shared_metadata.sensor_probe_start,
+                shared_metadata.sensor_candidate_geom_mask,
+                rigid_entry.bvh.nodes,
+                rigid_entry.bvh.morton_codes,
+                solver.links_state,
+                solver.faces_info,
+                solver.verts_info,
+                solver.fixed_verts_state,
+                solver.free_verts_state,
+                current_ground_truth_data_T,
+                measured_cols_b,
+            )
         if ground_truth_data_timeline is not None:
             ground_truth_data_timeline.at(0, copy=False).copy_(current_ground_truth_data_T.T)
         measured.copy_(measured_cols_b.T)
@@ -657,6 +1169,7 @@ class KinematicTaxelMetadata(
     ViscoelasticHysteresisMetadataMixin,
     ProbesWithNormalSensorMetadataMixin,
     ContactPrefilterMetadataMixin,
+    ContactDepthQueryMetadataMixin,
     RigidSensorMetadataMixin,
     SimpleSensorMetadata,
 ):
@@ -930,35 +1443,79 @@ def _update_current_timestep_data(
             shared_metadata.sensor_contacts_idx,
             shared_metadata.sensor_n_contacts,
         )
-        _kernel_kinematic_taxel(
-            shared_metadata.probe_positions,
-            shared_metadata.probe_sensor_idx,
-            shared_metadata.probe_radii,
-            shared_metadata.probe_radii_noise,
-            shared_metadata.probe_gains,
-            shared_metadata.normal_stiffness,
-            shared_metadata.normal_damping,
-            shared_metadata.normal_exponent,
-            shared_metadata.shear_scalar,
-            shared_metadata.twist_scalar,
-            shared_metadata.links_idx,
-            shared_metadata.sensor_cache_start,
-            shared_metadata.sensor_probe_start,
-            shared_metadata.n_probes_per_sensor,
-            shared_metadata.sensor_contacts_idx,
-            shared_metadata.sensor_n_contacts,
-            solver.collider._collider_state,
-            solver.collider._collider_static_config,
-            solver.links_state,
-            solver.geoms_state,
-            solver.geoms_info,
-            solver._rigid_global_info,
-            solver.collider._sdf._sdf_info,
-            gs.EPS,
-            measured_equals_gt,
-            current_ground_truth_data_T,
-            measured_cols_b,
-        )
+
+        if _resolve_query_mode(shared_metadata) == "sdf":
+            _kernel_kinematic_taxel(
+                shared_metadata.probe_positions,
+                shared_metadata.probe_sensor_idx,
+                shared_metadata.probe_radii,
+                shared_metadata.probe_radii_noise,
+                shared_metadata.probe_gains,
+                shared_metadata.normal_stiffness,
+                shared_metadata.normal_damping,
+                shared_metadata.normal_exponent,
+                shared_metadata.shear_scalar,
+                shared_metadata.twist_scalar,
+                shared_metadata.links_idx,
+                shared_metadata.sensor_cache_start,
+                shared_metadata.sensor_probe_start,
+                shared_metadata.n_probes_per_sensor,
+                shared_metadata.sensor_contacts_idx,
+                shared_metadata.sensor_n_contacts,
+                solver.collider._collider_state,
+                solver.collider._collider_static_config,
+                solver.links_state,
+                solver.geoms_state,
+                solver.geoms_info,
+                solver._rigid_global_info,
+                solver.collider._sdf._sdf_info,
+                gs.EPS,
+                measured_equals_gt,
+                current_ground_truth_data_T,
+                measured_cols_b,
+            )
+        else:
+            from genesis.engine.sensors.raycaster import update_solver_bvhs
+
+            B, n_sensors = shared_metadata.sensor_n_contacts.shape
+            _ensure_candidate_geom_mask(shared_metadata, B, n_sensors, solver.n_geoms)
+            _kernel_build_sensor_candidate_geom_mask(
+                shared_metadata.sensor_contacts_idx,
+                shared_metadata.sensor_n_contacts,
+                solver.collider._collider_state,
+                shared_metadata.sensor_candidate_geom_mask,
+            )
+            update_solver_bvhs(shared_metadata.collision_bvh)
+            rigid_entry = next(e for e in shared_metadata.collision_bvh if e.raycast_mask is None)
+            _kernel_kinematic_taxel_bvh(
+                shared_metadata.probe_positions,
+                shared_metadata.probe_local_normal,
+                shared_metadata.probe_sensor_idx,
+                shared_metadata.probe_radii,
+                shared_metadata.probe_radii_noise,
+                shared_metadata.probe_gains,
+                shared_metadata.normal_stiffness,
+                shared_metadata.normal_damping,
+                shared_metadata.normal_exponent,
+                shared_metadata.shear_scalar,
+                shared_metadata.twist_scalar,
+                shared_metadata.links_idx,
+                shared_metadata.sensor_cache_start,
+                shared_metadata.sensor_probe_start,
+                shared_metadata.n_probes_per_sensor,
+                shared_metadata.sensor_candidate_geom_mask,
+                rigid_entry.bvh.nodes,
+                rigid_entry.bvh.morton_codes,
+                solver.links_state,
+                solver.faces_info,
+                solver.verts_info,
+                solver.fixed_verts_state,
+                solver.free_verts_state,
+                solver.geoms_info,
+                measured_equals_gt,
+                current_ground_truth_data_T,
+                measured_cols_b,
+            )
         if ground_truth_data_timeline is not None:
             ground_truth_data_timeline.at(0, copy=False).copy_(current_ground_truth_data_T.T)
         measured.copy_(measured_cols_b.T)
diff --git a/genesis/engine/sensors/point_cloud_tactile.py b/genesis/engine/sensors/point_cloud_tactile.py
index d5d6d34c9a..455a533cd3 100644
--- a/genesis/engine/sensors/point_cloud_tactile.py
+++ b/genesis/engine/sensors/point_cloud_tactile.py
@@ -9,10 +9,16 @@
 import genesis.utils.array_class as array_class
 import genesis.utils.geom as gu
 import genesis.utils.sdf as sdf
+from genesis.engine.bvh import STACK_SIZE as _BVH_STACK_SIZE
 from genesis.options.sensors import ElastomerTaxel as ElastomerTaxelSensorOptions
 from genesis.options.sensors import ProximityTaxel as ProximityTaxelOptions
 from genesis.utils.misc import concat_with_tensor, make_tensor_field, tensor_to_array
 from genesis.utils.point_cloud import sample_mesh_point_cloud
+from genesis.utils.raycast_qd import (
+    closest_point_on_triangle,
+    get_triangle_vertices,
+    triangle_face_normal,
+)
 
 from .base_sensor import RigidSensorMetadataMixin, RigidSensorMixin, SimpleSensor, SimpleSensorMetadata
 from .probe import (
@@ -27,6 +33,7 @@
     BVH_LEAF_SIZE,
     BVH_STACK_SIZE,
     BVHMetadata,
+    ContactDepthQueryMetadataMixin,
     GridFFTConvMetadataMixin,
     ViscoelasticHysteresisMetadataMixin,
     ViscoelasticHysteresisMixin,
@@ -103,12 +110,36 @@ def _active_envs_mask_tensor(geom, batch_size: int) -> torch.Tensor:
     return geom.active_envs_mask.to(device=gs.device, dtype=gs.tc_bool)
 
 
+def _group_geoms_by_variant(
+    geom_chunks: list[tuple[object, np.ndarray, np.ndarray]], batch_size: int
+) -> list[tuple[torch.Tensor, list[tuple[object, np.ndarray, np.ndarray]]]]:
+    """
+    Partition a link's geoms into heterogeneous-variant groups by ``active_envs_mask``. Geoms sharing
+    a mask are one variant; ``None`` masks (homogeneous) collapse into a single all-True group.
+    Returns ``[(mask, geom_chunks_for_variant), ...]`` preserving the original geom order within each group.
+    """
+    groups: dict[bytes, tuple[torch.Tensor, list[tuple[object, np.ndarray, np.ndarray]]]] = {}
+    for chunk in geom_chunks:
+        geom = chunk[0]
+        mask = _active_envs_mask_tensor(geom, batch_size)
+        key = tensor_to_array(mask).astype(np.bool_).tobytes()
+        if key not in groups:
+            groups[key] = (mask, [])
+        groups[key][1].append(chunk)
+    return list(groups.values())
+
+
 def _sample_track_links_point_cloud_tensors(
     solver, track_link_idx: np.ndarray, n_sample_points: int | list | tuple, prefer_visual: bool
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     FPS-sample meshes on ``track_link_idx`` into concatenated link-local positions and normals.
 
+    The per-link budget from ``n_sample_points`` is allocated to every heterogeneous variant on a link
+    (geoms grouped by ``active_envs_mask``), so each parallel environment sees the full requested point
+    count regardless of which variant is active. Within a variant, the budget is split across geoms by
+    surface area.
+
     Returns
     -------
     idx_cat, pos_cat, nrm_cat, active_cat
@@ -130,20 +161,22 @@ def _sample_track_links_point_cloud_tensors(
         geom_chunks = get_mesh_geom_chunks(link, prefer_visual)
         if not geom_chunks:
             gs.raise_exception(f"No mesh geometry on tracked link index {link_idx}.")
-        for n_geom_pts, (geom, verts, faces) in zip(_split_count_by_area(n_pts, geom_chunks), geom_chunks):
-            if n_geom_pts <= 0:
-                continue
-            # Fixed seed: the cache key already discriminates between meshes (vertices+faces hashed), so the same mesh
-            # always resolves to the same sample, which keeps tactile readings reproducible across build/reset cycles.
-            pts_np, nrm_np = sample_mesh_point_cloud(
-                verts, faces, n_geom_pts, seed=0, use_cache=True, return_normals=True
-            )
+        for variant_mask, variant_chunks in _group_geoms_by_variant(geom_chunks, solver._B):
+            for n_geom_pts, (geom, verts, faces) in zip(_split_count_by_area(n_pts, variant_chunks), variant_chunks):
+                if n_geom_pts <= 0:
+                    continue
+                # Fixed seed: the cache key already discriminates between meshes (vertices+faces hashed), so the same
+                # mesh always resolves to the same sample, which keeps tactile readings reproducible across
+                # build/reset cycles.
+                pts_np, nrm_np = sample_mesh_point_cloud(
+                    verts, faces, n_geom_pts, seed=0, use_cache=True, return_normals=True
+                )
 
-            li = torch.full((pts_np.shape[0],), link_idx, dtype=gs.tc_int, device=gs.device)
-            link_idx_chunks.append(li)
-            pos_chunks.append(torch.tensor(pts_np, dtype=gs.tc_float, device=gs.device))
-            nrm_chunks.append(torch.tensor(nrm_np, dtype=gs.tc_float, device=gs.device))
-            active_chunks.append(_active_envs_mask_tensor(geom, solver._B).expand(pts_np.shape[0], solver._B))
+                li = torch.full((pts_np.shape[0],), link_idx, dtype=gs.tc_int, device=gs.device)
+                link_idx_chunks.append(li)
+                pos_chunks.append(torch.tensor(pts_np, dtype=gs.tc_float, device=gs.device))
+                nrm_chunks.append(torch.tensor(nrm_np, dtype=gs.tc_float, device=gs.device))
+                active_chunks.append(variant_mask.expand(pts_np.shape[0], solver._B))
 
     if not pos_chunks:
         gs.raise_exception("PointCloudTactile sensor produced an empty object point cloud.")
@@ -866,6 +899,129 @@ def _dilate_kernel_builder(meta_entry: tuple, fft_n: tuple[int, int]) -> torch.T
     return _precompute_hydroshear_dilate_kernel_fft(lambda_d, (spacing_u, spacing_v), fft_n, gs.device, gs.tc_float)
 
 
+@qd.func
+def _func_elastomer_min_signed_dist_bvh(
+    i_b: int,
+    i_s: int,
+    probe_world: qd.types.vector(3),
+    max_query_dist: float,
+    bvh_nodes: qd.template(),
+    bvh_morton_codes: qd.template(),
+    faces_info: array_class.FacesInfo,
+    verts_info: array_class.VertsInfo,
+    fixed_verts_state: array_class.VertsState,
+    free_verts_state: array_class.VertsState,
+    track_geom_mask: qd.types.ndarray(),
+) -> float:
+    """
+    BVH-based signed distance from ``probe_world`` to the nearest triangle of any geom flagged for this sensor in
+    ``track_geom_mask`` (shape ``(B, n_sensors, n_geoms)``). Sign is positive when the probe is outside the surface
+    (closest-triangle face-normal points away from probe), negative when inside. Mirrors the return contract of
+    ``_func_elastomer_min_sdf_over_active_geoms`` so callers consume ``max(0, -signed)`` identically.
+
+    Uses ``max_query_dist`` as the BVH cull radius: probes farther than that from every candidate triangle are
+    treated as fully outside (returns ``+max_query_dist``), which downstream maps to depth = 0.
+    """
+    n_triangles = faces_info.verts_idx.shape[0]
+    best_dist = max_query_dist
+    best_dist_sq = best_dist * best_dist
+    best_signed = max_query_dist
+
+    node_stack = qd.Vector.zero(gs.qd_int, qd.static(_BVH_STACK_SIZE))
+    node_stack[0] = 0
+    stack_idx = 1
+
+    while stack_idx > 0:
+        stack_idx -= 1
+        node_idx = node_stack[stack_idx]
+        node = bvh_nodes[i_b, node_idx]
+
+        if not func_sphere_intersects_aabb(probe_world, best_dist_sq, node.bound.min, node.bound.max):
+            continue
+
+        if node.left == -1:
+            sorted_leaf_idx = node_idx - (n_triangles - 1)
+            i_f = qd.cast(bvh_morton_codes[i_b, sorted_leaf_idx][1], gs.qd_int)
+            i_g = faces_info.geom_idx[i_f]
+            if not track_geom_mask[i_b, i_s, i_g]:
+                continue
+
+            tri = get_triangle_vertices(i_f, i_b, faces_info, verts_info, fixed_verts_state, free_verts_state)
+            v0 = tri[:, 0]
+            v1 = tri[:, 1]
+            v2 = tri[:, 2]
+            closest = closest_point_on_triangle(probe_world, v0, v1, v2)
+            diff = probe_world - closest
+            d_sq = diff.dot(diff)
+            if d_sq < best_dist_sq:
+                d = qd.sqrt(d_sq)
+                fn = triangle_face_normal(v0, v1, v2)
+                # Sign: probe outside if (probe - closest) aligns with outward face normal.
+                sign_v = qd.select(diff.dot(fn) >= gs.qd_float(0.0), gs.qd_float(1.0), gs.qd_float(-1.0))
+                best_signed = d * sign_v
+                best_dist = d
+                best_dist_sq = d_sq
+        else:
+            if stack_idx < qd.static(_BVH_STACK_SIZE - 2):
+                node_stack[stack_idx] = node.left
+                node_stack[stack_idx + 1] = node.right
+                stack_idx += 2
+
+    return best_signed
+
+
+@qd.kernel(fastcache=False)
+def _kernel_elastomer_probe_depth_bvh(
+    probe_positions_local: qd.types.ndarray(),
+    probe_sensor_idx: qd.types.ndarray(),
+    probe_radii: qd.types.ndarray(),
+    links_idx: qd.types.ndarray(),
+    track_geom_mask: qd.types.ndarray(),
+    max_query_dist: float,
+    bvh_nodes: qd.template(),
+    bvh_morton_codes: qd.template(),
+    links_state: array_class.LinksState,
+    faces_info: array_class.FacesInfo,
+    verts_info: array_class.VertsInfo,
+    fixed_verts_state: array_class.VertsState,
+    free_verts_state: array_class.VertsState,
+    probe_depth_buf: qd.types.ndarray(),
+):
+    """
+    Per-probe contact depth from the rigid solver's global collision BVH, gated by ``track_geom_mask``. Mirrors
+    ``_kernel_elastomer_probe_depth``'s output contract (write into ``probe_depth_buf``); the dilate accumulator
+    consumes the same buffer downstream.
+    """
+    total_n_probes = probe_positions_local.shape[0]
+    n_batches = probe_depth_buf.shape[0]
+
+    for i_b, i_p in qd.ndrange(n_batches, total_n_probes):
+        if probe_radii[i_p] <= gs.qd_float(0.0):
+            probe_depth_buf[i_b, i_p] = gs.qd_float(0.0)
+            continue
+        i_s = probe_sensor_idx[i_p]
+        sensor_link_idx = links_idx[i_s]
+        link_pos = links_state.pos[sensor_link_idx, i_b]
+        link_quat = links_state.quat[sensor_link_idx, i_b]
+        probe_local = func_vec3_at(probe_positions_local, i_p)
+        probe_world = link_pos + gu.qd_transform_by_quat(probe_local, link_quat)
+
+        signed = _func_elastomer_min_signed_dist_bvh(
+            i_b,
+            i_s,
+            probe_world,
+            max_query_dist,
+            bvh_nodes,
+            bvh_morton_codes,
+            faces_info,
+            verts_info,
+            fixed_verts_state,
+            free_verts_state,
+            track_geom_mask,
+        )
+        probe_depth_buf[i_b, i_p] = qd.max(gs.qd_float(0.0), -signed)
+
+
 @qd.kernel(fastcache=True)
 def _kernel_elastomer_probe_depth(
     probe_positions_local: qd.types.ndarray(),
@@ -1142,6 +1298,166 @@ def _kernel_elastomer_surface_state_bvh(
                 stack_idx += 1
 
 
+@qd.kernel(fastcache=False)
+def _kernel_elastomer_surface_state_via_global_bvh(
+    links_idx: qd.types.ndarray(),
+    sensor_elastomer_geom_start: qd.types.ndarray(),
+    sensor_elastomer_geom_n: qd.types.ndarray(),
+    elastomer_geom_idx: qd.types.ndarray(),
+    elastomer_geom_active_envs_mask: qd.types.ndarray(),
+    elastomer_candidate_geom_mask: qd.types.ndarray(),
+    bvh_chunk_sensor_idx: qd.types.ndarray(),
+    bvh_chunk_link_idx: qd.types.ndarray(),
+    bvh_chunk_node_start: qd.types.ndarray(),
+    bvh_node_min: qd.types.ndarray(),
+    bvh_node_max: qd.types.ndarray(),
+    bvh_node_left: qd.types.ndarray(),
+    bvh_node_right: qd.types.ndarray(),
+    bvh_node_leaf_start: qd.types.ndarray(),
+    bvh_node_leaf_count: qd.types.ndarray(),
+    bvh_leaf_elem_idx: qd.types.ndarray(),
+    pc_pos_link: qd.types.ndarray(),
+    pc_active_envs_mask: qd.types.ndarray(),
+    sdf_enter: qd.types.ndarray(),
+    sdf_exit: qd.types.ndarray(),
+    aabb_margin: float,
+    max_query_dist: float,
+    global_bvh_nodes: qd.template(),
+    global_bvh_morton_codes: qd.template(),
+    links_state: array_class.LinksState,
+    geoms_state: array_class.GeomsState,
+    faces_info: array_class.FacesInfo,
+    verts_info: array_class.VertsInfo,
+    fixed_verts_state: array_class.VertsState,
+    free_verts_state: array_class.VertsState,
+    surface_pos_sensor_buf: qd.types.ndarray(),
+    surface_entry_pos_sensor_buf: qd.types.ndarray(),
+    surface_depth_buf: qd.types.ndarray(),
+    surface_initialized_buf: qd.types.ndarray(),
+    surface_candidate_buf: qd.types.ndarray(),
+):
+    """
+    Raycast variant of ``_kernel_elastomer_surface_state_bvh``: same outer (env, chunk) traversal over the point-cloud
+    BVH per tracked link, but the inner signed-distance query at each PC point uses ``_func_elastomer_min_signed_dist_bvh``
+    over the rigid solver's global collision BVH (gated by ``elastomer_candidate_geom_mask``) instead of the analytic
+    SDF. Output contract matches the SDF variant so the dilate / shear pipeline downstream is unchanged.
+    """
+    n_batches = surface_pos_sensor_buf.shape[0]
+    n_chunks = bvh_chunk_sensor_idx.shape[0]
+
+    for i_b, i_c in qd.ndrange(n_batches, n_chunks):
+        i_s = bvh_chunk_sensor_idx[i_c]
+
+        wmin = qd.Vector([gs.qd_float(1e30), gs.qd_float(1e30), gs.qd_float(1e30)], dt=gs.qd_float)
+        wmax = qd.Vector([gs.qd_float(-1e30), gs.qd_float(-1e30), gs.qd_float(-1e30)], dt=gs.qd_float)
+        any_active = False
+        gm_start = sensor_elastomer_geom_start[i_s]
+        gm_n = sensor_elastomer_geom_n[i_s]
+        for i_gm in range(gm_start, gm_start + gm_n):
+            if not elastomer_geom_active_envs_mask[i_gm, i_b]:
+                continue
+            i_g = elastomer_geom_idx[i_gm]
+            gmin = geoms_state.aabb_min[i_g, i_b]
+            gmax = geoms_state.aabb_max[i_g, i_b]
+            for k in qd.static(range(3)):
+                if gmin[k] < wmin[k]:
+                    wmin[k] = gmin[k]
+                if gmax[k] > wmax[k]:
+                    wmax[k] = gmax[k]
+            any_active = True
+
+        if not any_active:
+            continue
+
+        expand = sdf_exit[i_s] + gs.qd_float(aabb_margin)
+        for k in qd.static(range(3)):
+            wmin[k] = wmin[k] - expand
+            wmax[k] = wmax[k] + expand
+
+        track_link_idx = bvh_chunk_link_idx[i_c]
+        track_pos = links_state.pos[track_link_idx, i_b]
+        track_quat = links_state.quat[track_link_idx, i_b]
+        qmin = qd.Vector([gs.qd_float(1e30), gs.qd_float(1e30), gs.qd_float(1e30)], dt=gs.qd_float)
+        qmax = qd.Vector([gs.qd_float(-1e30), gs.qd_float(-1e30), gs.qd_float(-1e30)], dt=gs.qd_float)
+        for cx in qd.static(range(2)):
+            for cy in qd.static(range(2)):
+                for cz in qd.static(range(2)):
+                    cw_x = wmax[0] if cx == 1 else wmin[0]
+                    cw_y = wmax[1] if cy == 1 else wmin[1]
+                    cw_z = wmax[2] if cz == 1 else wmin[2]
+                    corner_world = qd.Vector([cw_x, cw_y, cw_z], dt=gs.qd_float)
+                    corner_link = gu.qd_inv_transform_by_trans_quat(corner_world, track_pos, track_quat)
+                    for k in qd.static(range(3)):
+                        if corner_link[k] < qmin[k]:
+                            qmin[k] = corner_link[k]
+                        if corner_link[k] > qmax[k]:
+                            qmax[k] = corner_link[k]
+
+        sensor_link_idx = links_idx[i_s]
+        sensor_pos = links_state.pos[sensor_link_idx, i_b]
+        sensor_quat = links_state.quat[sensor_link_idx, i_b]
+
+        stack = qd.Vector.zero(gs.qd_int, qd.static(BVH_STACK_SIZE))
+        stack[0] = bvh_chunk_node_start[i_c]
+        stack_idx = 1
+
+        while stack_idx > 0:
+            stack_idx -= 1
+            n = stack[stack_idx]
+            bmin = func_vec3_at(bvh_node_min, n)
+            bmax = func_vec3_at(bvh_node_max, n)
+            if not func_aabb_intersects_aabb(bmin, bmax, qmin, qmax):
+                continue
+            left = bvh_node_left[n]
+            if left == -1:
+                pstart = bvh_node_leaf_start[n]
+                pn = bvh_node_leaf_count[n]
+                for j in range(pn):
+                    i_o = bvh_leaf_elem_idx[pstart + j]
+                    if not pc_active_envs_mask[i_o, i_b]:
+                        continue
+                    surface_candidate_buf[i_b, i_o] = True
+
+                    point_link = func_vec3_at(pc_pos_link, i_o)
+                    point_world = track_pos + gu.qd_transform_by_quat(point_link, track_quat)
+                    point_sensor = gu.qd_inv_transform_by_trans_quat(point_world, sensor_pos, sensor_quat)
+                    for k in qd.static(range(3)):
+                        surface_pos_sensor_buf[i_b, i_o, k] = point_sensor[k]
+
+                    min_sdf = _func_elastomer_min_signed_dist_bvh(
+                        i_b,
+                        i_s,
+                        point_world,
+                        max_query_dist,
+                        global_bvh_nodes,
+                        global_bvh_morton_codes,
+                        faces_info,
+                        verts_info,
+                        fixed_verts_state,
+                        free_verts_state,
+                        elastomer_candidate_geom_mask,
+                    )
+
+                    surface_depth_buf[i_b, i_o] = qd.max(gs.qd_float(0.0), -min_sdf)
+
+                    _func_elastomer_update_surface_anchor(
+                        i_b,
+                        i_o,
+                        min_sdf,
+                        point_sensor,
+                        sdf_enter[i_s],
+                        sdf_exit[i_s],
+                        surface_entry_pos_sensor_buf,
+                        surface_initialized_buf,
+                    )
+            else:
+                right = bvh_node_right[n]
+                stack[stack_idx] = left
+                stack_idx += 1
+                stack[stack_idx] = right
+                stack_idx += 1
+
+
 @qd.kernel(fastcache=True)
 def _kernel_elastomer_shear_accumulate(
     probe_positions_local: qd.types.ndarray(),
@@ -1356,6 +1672,7 @@ def _elastomer_taxel_grid_fft_dilate(
 class ElastomerTaxelSensorMetadata(
     ViscoelasticHysteresisMetadataMixin,
     GridFFTConvMetadataMixin,
+    ContactDepthQueryMetadataMixin,
     PointCloudTactileSharedMetadata,
     ProbesWithNormalSensorMetadataMixin,
 ):
@@ -1369,6 +1686,11 @@ class ElastomerTaxelSensorMetadata(
     sensor_elastomer_geom_start: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
     sensor_elastomer_geom_n: torch.Tensor = make_tensor_field((0,), dtype_factory=lambda: gs.tc_int)
 
+    # Per-(B, sensor, geom) bitmask of elastomer (sensor-own) geoms, used by the global-BVH surface-state kernel
+    # to gate triangles back to the sensor's elastomer surface. Separate from ``sensor_candidate_geom_mask`` which
+    # gates by tracked-object geoms for the probe-depth kernel.
+    elastomer_candidate_geom_mask: torch.Tensor = make_tensor_field((0, 0, 0), dtype_factory=lambda: gs.tc_bool)
+
     lambda_d: torch.Tensor = make_tensor_field((0,))
     lambda_s: torch.Tensor = make_tensor_field((0,))
     dilate_scale: torch.Tensor = make_tensor_field((0,))
@@ -1448,11 +1770,23 @@ def build(self):
         super().build()
 
         solver = self._shared_metadata.solver
-        solver.collider.activate_sdf()
         B = self._manager._sim._B
         if self._link is None:
             gs.raise_exception("ElastomerTaxel must be attached to a rigid link with collision geometry.")
 
+        # Last-value-wins propagation of contact_depth_query. SDF activation is skipped only when this sensor's own
+        # option is explicitly "raycast" -- with last-value-wins semantics this can leave SDF activated in mixed-mode
+        # configurations, but never broken (an unused activated SDF is harmless).
+        mode = self._options.contact_depth_query
+        if mode is not None:
+            self._shared_metadata.contact_depth_query = mode
+        if mode != "raycast":
+            solver.collider.activate_sdf()
+        if self._shared_metadata.contact_depth_query == "raycast":
+            from genesis.engine.sensors.raycaster import ensure_solver_bvhs
+
+            self._shared_metadata.collision_bvh = ensure_solver_bvhs(self._manager._sim)
+
         elastomer_geom_start_row = self._shared_metadata.elastomer_geom_idx.shape[0]
         elastomer_geom_idx, elastomer_geom_active_envs_mask = _collect_collision_geom_idx(
             solver, np.asarray((self._link.idx,), dtype=gs.np_int)
@@ -1547,6 +1881,35 @@ def build(self):
             (B, n_sensors_built), dtype=gs.tc_int, device=gs.device
         )
 
+        # Build the (B, n_sensors, n_geoms) candidate-geom masks scattered from track_geom_idx (probe-depth) and
+        # elastomer_geom_idx (surface-anchor). Only needed in raycast mode but allocated cheaply (bool, total
+        # scene-geom count) so we tolerate the small idle cost in sdf mode.
+        if self._shared_metadata.contact_depth_query == "raycast":
+            n_geoms = solver.n_geoms
+
+            def _build_mask(geom_starts_tensor, geom_ns_tensor, geom_idx_tensor):
+                mask = torch.zeros((B, n_sensors_built, n_geoms), dtype=gs.tc_bool, device=gs.device)
+                geom_starts = geom_starts_tensor.cpu().numpy()
+                geom_ns = geom_ns_tensor.cpu().numpy()
+                idx_np = geom_idx_tensor.cpu().numpy()
+                for s in range(n_sensors_built):
+                    lo = int(geom_starts[s])
+                    hi = lo + int(geom_ns[s])
+                    if hi > lo:
+                        mask[:, s, idx_np[lo:hi]] = True
+                return mask
+
+            self._shared_metadata.sensor_candidate_geom_mask = _build_mask(
+                self._shared_metadata.sensor_track_geom_start,
+                self._shared_metadata.sensor_track_geom_n,
+                self._shared_metadata.track_geom_idx,
+            )
+            self._shared_metadata.elastomer_candidate_geom_mask = _build_mask(
+                self._shared_metadata.sensor_elastomer_geom_start,
+                self._shared_metadata.sensor_elastomer_geom_n,
+                self._shared_metadata.elastomer_geom_idx,
+            )
+
         self._shared_metadata.use_grid_fft = concat_with_tensor(
             self._shared_metadata.use_grid_fft, self._use_grid_fft, expand=(1,)
         )
@@ -1652,21 +2015,47 @@ def _update_current_timestep_data(
         # which is set in lockstep with that same depth write; measured is .copy_'d at the end.
         measured = measured_data_timeline.at(0, copy=False)
 
-        _kernel_elastomer_probe_depth(
-            shared_metadata.probe_positions,
-            shared_metadata.probe_sensor_idx,
-            shared_metadata.probe_radii,
-            shared_metadata.links_idx,
-            shared_metadata.sensor_track_geom_start,
-            shared_metadata.sensor_track_geom_n,
-            shared_metadata.track_geom_idx,
-            shared_metadata.track_geom_active_envs_mask,
-            solver.links_state,
-            solver.geoms_state,
-            solver.geoms_info,
-            solver.collider._sdf._sdf_info,
-            shared_metadata.probe_depth_buf,
-        )
+        if (shared_metadata.contact_depth_query or "sdf") == "sdf":
+            _kernel_elastomer_probe_depth(
+                shared_metadata.probe_positions,
+                shared_metadata.probe_sensor_idx,
+                shared_metadata.probe_radii,
+                shared_metadata.links_idx,
+                shared_metadata.sensor_track_geom_start,
+                shared_metadata.sensor_track_geom_n,
+                shared_metadata.track_geom_idx,
+                shared_metadata.track_geom_active_envs_mask,
+                solver.links_state,
+                solver.geoms_state,
+                solver.geoms_info,
+                solver.collider._sdf._sdf_info,
+                shared_metadata.probe_depth_buf,
+            )
+        else:
+            from genesis.engine.sensors.raycaster import update_solver_bvhs
+
+            shared_metadata.contact_depth_query = "raycast"  # latch
+            update_solver_bvhs(shared_metadata.collision_bvh)
+            rigid_entry = next(e for e in shared_metadata.collision_bvh if e.raycast_mask is None)
+            # Conservative BVH walk cap. Far enough to cover any realistic elastomer penetration; probes outside
+            # this radius from every candidate triangle map to depth = 0.
+            max_query_dist = float(0.1)
+            _kernel_elastomer_probe_depth_bvh(
+                shared_metadata.probe_positions,
+                shared_metadata.probe_sensor_idx,
+                shared_metadata.probe_radii,
+                shared_metadata.links_idx,
+                shared_metadata.sensor_candidate_geom_mask,
+                max_query_dist,
+                rigid_entry.bvh.nodes,
+                rigid_entry.bvh.morton_codes,
+                solver.links_state,
+                solver.faces_info,
+                solver.verts_info,
+                solver.fixed_verts_state,
+                solver.free_verts_state,
+                shared_metadata.probe_depth_buf,
+            )
         _kernel_elastomer_dilate_accumulate(
             shared_metadata.use_grid_fft,
             shared_metadata.probe_positions,
@@ -1701,37 +2090,77 @@ def _update_current_timestep_data(
         if shared_metadata.any_shear:
             bvh = shared_metadata.pc_bvh
             shared_metadata.surface_candidate_buf.zero_()
-            _kernel_elastomer_surface_state_bvh(
-                shared_metadata.links_idx,
-                shared_metadata.sensor_elastomer_geom_start,
-                shared_metadata.sensor_elastomer_geom_n,
-                shared_metadata.elastomer_geom_idx,
-                shared_metadata.elastomer_geom_active_envs_mask,
-                bvh.chunk_sensor_idx,
-                bvh.chunk_link_idx,
-                bvh.chunk_node_start,
-                bvh.node_min,
-                bvh.node_max,
-                bvh.node_left,
-                bvh.node_right,
-                bvh.node_leaf_start,
-                bvh.node_leaf_count,
-                bvh.leaf_elem_idx,
-                shared_metadata.pc_pos_link,
-                shared_metadata.pc_active_envs_mask,
-                shared_metadata.elastomer_contact_sdf_enter,
-                shared_metadata.elastomer_contact_sdf_exit,
-                _ELASTOMER_QUERY_AABB_MARGIN,
-                solver.links_state,
-                solver.geoms_state,
-                solver.geoms_info,
-                solver.collider._sdf._sdf_info,
-                shared_metadata.surface_pos_sensor_buf,
-                shared_metadata.surface_entry_pos_sensor_buf,
-                shared_metadata.surface_depth_buf,
-                shared_metadata.surface_initialized_buf,
-                shared_metadata.surface_candidate_buf,
-            )
+            if (shared_metadata.contact_depth_query or "sdf") == "sdf":
+                _kernel_elastomer_surface_state_bvh(
+                    shared_metadata.links_idx,
+                    shared_metadata.sensor_elastomer_geom_start,
+                    shared_metadata.sensor_elastomer_geom_n,
+                    shared_metadata.elastomer_geom_idx,
+                    shared_metadata.elastomer_geom_active_envs_mask,
+                    bvh.chunk_sensor_idx,
+                    bvh.chunk_link_idx,
+                    bvh.chunk_node_start,
+                    bvh.node_min,
+                    bvh.node_max,
+                    bvh.node_left,
+                    bvh.node_right,
+                    bvh.node_leaf_start,
+                    bvh.node_leaf_count,
+                    bvh.leaf_elem_idx,
+                    shared_metadata.pc_pos_link,
+                    shared_metadata.pc_active_envs_mask,
+                    shared_metadata.elastomer_contact_sdf_enter,
+                    shared_metadata.elastomer_contact_sdf_exit,
+                    _ELASTOMER_QUERY_AABB_MARGIN,
+                    solver.links_state,
+                    solver.geoms_state,
+                    solver.geoms_info,
+                    solver.collider._sdf._sdf_info,
+                    shared_metadata.surface_pos_sensor_buf,
+                    shared_metadata.surface_entry_pos_sensor_buf,
+                    shared_metadata.surface_depth_buf,
+                    shared_metadata.surface_initialized_buf,
+                    shared_metadata.surface_candidate_buf,
+                )
+            else:
+                rigid_entry = next(e for e in shared_metadata.collision_bvh if e.raycast_mask is None)
+                _kernel_elastomer_surface_state_via_global_bvh(
+                    shared_metadata.links_idx,
+                    shared_metadata.sensor_elastomer_geom_start,
+                    shared_metadata.sensor_elastomer_geom_n,
+                    shared_metadata.elastomer_geom_idx,
+                    shared_metadata.elastomer_geom_active_envs_mask,
+                    shared_metadata.elastomer_candidate_geom_mask,
+                    bvh.chunk_sensor_idx,
+                    bvh.chunk_link_idx,
+                    bvh.chunk_node_start,
+                    bvh.node_min,
+                    bvh.node_max,
+                    bvh.node_left,
+                    bvh.node_right,
+                    bvh.node_leaf_start,
+                    bvh.node_leaf_count,
+                    bvh.leaf_elem_idx,
+                    shared_metadata.pc_pos_link,
+                    shared_metadata.pc_active_envs_mask,
+                    shared_metadata.elastomer_contact_sdf_enter,
+                    shared_metadata.elastomer_contact_sdf_exit,
+                    _ELASTOMER_QUERY_AABB_MARGIN,
+                    float(0.1),  # max_query_dist for the global-BVH closest-point search
+                    rigid_entry.bvh.nodes,
+                    rigid_entry.bvh.morton_codes,
+                    solver.links_state,
+                    solver.geoms_state,
+                    solver.faces_info,
+                    solver.verts_info,
+                    solver.fixed_verts_state,
+                    solver.free_verts_state,
+                    shared_metadata.surface_pos_sensor_buf,
+                    shared_metadata.surface_entry_pos_sensor_buf,
+                    shared_metadata.surface_depth_buf,
+                    shared_metadata.surface_initialized_buf,
+                    shared_metadata.surface_candidate_buf,
+                )
             # Invalidate stale surface state for points the BVH did not visit. surface_initialized
             # and entry-pos survive across steps; depth/pos are gated by initialized downstream so
             # they don't need clearing. The shear accumulator below reads from a compact index
diff --git a/genesis/engine/sensors/raycaster.py b/genesis/engine/sensors/raycaster.py
index c0834147af..623c761beb 100644
--- a/genesis/engine/sensors/raycaster.py
+++ b/genesis/engine/sensors/raycaster.py
@@ -44,6 +44,76 @@ class _SolverBVH(NamedTuple):
     raycast_mask: np.ndarray | None
 
 
+# Per-process cache of collision/visual BVH lists, keyed by ``id(sim)`` so the raycaster and any tactile-probe
+# sensors in ``contact_depth_query="raycast"`` mode share one BVH object per simulator without mutating sim state.
+_SIM_BVHS: dict[int, list[_SolverBVH]] = {}
+
+
+def ensure_solver_bvhs(sim) -> list[_SolverBVH]:
+    """
+    Idempotently build the per-(solver, mesh-type) collision/visual BVHs for ``sim`` and return the shared list.
+
+    Both ``RaycasterSensor`` and tactile-probe sensors in ``contact_depth_query="raycast"`` mode reuse the same list
+    so they share the BVH state for a given sim.
+    """
+    sim_key = id(sim)
+    existing = _SIM_BVHS.get(sim_key)
+    if existing is not None:
+        return existing
+
+    solver_bvhs: list[_SolverBVH] = []
+    for solver in (sim.rigid_solver, sim.kinematic_solver):
+        if not solver.is_active:
+            continue
+        n_envs = solver._B
+        if isinstance(solver, RigidSolver):
+            n_faces = solver.faces_info.geom_idx.shape[0]
+            aabb = AABB(n_batches=n_envs, n_aabbs=n_faces)
+            bvh = LBVH(aabb, max_n_query_result_per_aabb=0, n_radix_sort_groups=64)
+            solver_bvhs.append(_SolverBVH(solver, bvh, aabb, None))
+        n_vfaces = solver.vfaces_info.vgeom_idx.shape[0]
+        if n_vfaces > 0:
+            mask = RaycasterSensor._compute_visual_raycast_mask(solver)
+            if mask.any():
+                aabb = AABB(n_batches=n_envs, n_aabbs=n_vfaces)
+                bvh = LBVH(aabb, max_n_query_result_per_aabb=0, n_radix_sort_groups=64)
+                solver_bvhs.append(_SolverBVH(solver, bvh, aabb, mask))
+
+    _SIM_BVHS[sim_key] = solver_bvhs
+    return solver_bvhs
+
+
+def update_solver_bvhs(solver_bvhs: list[_SolverBVH]) -> None:
+    """Rebuild ``solver_bvhs`` from current scene state. Cheap dedup is intentionally NOT applied here: callers can
+    invoke this between scene mutations (e.g. ``set_pos``) without an intervening simulator substep, so any cross-call
+    deduplication would mask real geometry changes."""
+    for entry in solver_bvhs:
+        if entry.raycast_mask is None:
+            kernel_update_verts_and_aabbs(
+                geoms_info=entry.solver.geoms_info,
+                geoms_state=entry.solver.geoms_state,
+                verts_info=entry.solver.verts_info,
+                faces_info=entry.solver.faces_info,
+                free_verts_state=entry.solver.free_verts_state,
+                fixed_verts_state=entry.solver.fixed_verts_state,
+                static_rigid_sim_config=entry.solver._static_rigid_sim_config,
+                aabb_state=entry.aabb,
+            )
+            entry.bvh.build()
+        else:
+            entry.solver.update_forward_pos()
+            entry.solver.update_vgeoms()
+            kernel_update_visual_aabbs(
+                vverts_info=entry.solver.vverts_info,
+                vverts_state=entry.solver.vverts_state,
+                vfaces_info=entry.solver.vfaces_info,
+                vgeoms_state=entry.solver.vgeoms_state,
+                face_mask=entry.raycast_mask,
+                aabb_state=entry.aabb,
+            )
+            entry.bvh.build()
+
+
 @dataclass
 class RaycasterSharedMetadata(KinematicSensorMetadataMixin, SimpleSensorMetadata):
     # All BVHs (one per active solver per mesh type) cast against each frame. The first is written into the output cache
@@ -106,35 +176,7 @@ def _compute_visual_raycast_mask(solver: "KinematicSolver") -> np.ndarray:
     @classmethod
     def _update_bvh(cls, shared_metadata: RaycasterSharedMetadata):
         """Rebuild every BVH from current geometry in the scene."""
-        for entry in shared_metadata.solver_bvhs:
-            if entry.raycast_mask is None:
-                kernel_update_verts_and_aabbs(
-                    geoms_info=entry.solver.geoms_info,
-                    geoms_state=entry.solver.geoms_state,
-                    verts_info=entry.solver.verts_info,
-                    faces_info=entry.solver.faces_info,
-                    free_verts_state=entry.solver.free_verts_state,
-                    fixed_verts_state=entry.solver.fixed_verts_state,
-                    static_rigid_sim_config=entry.solver._static_rigid_sim_config,
-                    aabb_state=entry.aabb,
-                )
-                entry.bvh.build()
-            else:
-                # Reads vverts_state.pos as the source of vvert positions. The buffer is seeded by FK at scene.build()
-                # and refreshed for each user-driven entity via set_vverts; entries set via set_vverts survive across
-                # calls until set_vverts(None) re-runs FK over the entity's vgeoms. raycast_mask gates which vfaces
-                # contribute to the BVH; masked-out vfaces keep an inverted AABB and are skipped by ray queries.
-                entry.solver.update_forward_pos()
-                entry.solver.update_vgeoms()
-                kernel_update_visual_aabbs(
-                    vverts_info=entry.solver.vverts_info,
-                    vverts_state=entry.solver.vverts_state,
-                    vfaces_info=entry.solver.vfaces_info,
-                    vgeoms_state=entry.solver.vgeoms_state,
-                    face_mask=entry.raycast_mask,
-                    aabb_state=entry.aabb,
-                )
-                entry.bvh.build()
+        update_solver_bvhs(shared_metadata.solver_bvhs)
 
     def build(self):
         super().build()
@@ -148,23 +190,9 @@ def build(self):
                 self._shared_metadata.sensor_cache_offsets, 0
             )
 
-            sim = self._manager._sim
-            for solver in (sim.rigid_solver, sim.kinematic_solver):
-                if not solver.is_active:
-                    continue
-                n_envs = solver._B
-                if isinstance(solver, RigidSolver):
-                    n_faces = solver.faces_info.geom_idx.shape[0]
-                    aabb = AABB(n_batches=n_envs, n_aabbs=n_faces)
-                    bvh = LBVH(aabb, max_n_query_result_per_aabb=0, n_radix_sort_groups=64)
-                    self._shared_metadata.solver_bvhs.append(_SolverBVH(solver, bvh, aabb, None))
-                n_vfaces = solver.vfaces_info.vgeom_idx.shape[0]
-                if n_vfaces > 0:
-                    mask = self._compute_visual_raycast_mask(solver)
-                    if mask.any():
-                        aabb = AABB(n_batches=n_envs, n_aabbs=n_vfaces)
-                        bvh = LBVH(aabb, max_n_query_result_per_aabb=0, n_radix_sort_groups=64)
-                        self._shared_metadata.solver_bvhs.append(_SolverBVH(solver, bvh, aabb, mask))
+            # Reuse the per-sim shared collision/visual BVH list so tactile-probe sensors in raycast mode see the same
+            # BVH as the raycaster and the per-frame rebuild is amortized across all consumers.
+            self._shared_metadata.solver_bvhs = ensure_solver_bvhs(self._manager._sim)
 
             if not self._shared_metadata.solver_bvhs:
                 gs.raise_exception(
diff --git a/genesis/engine/sensors/tactile_shared.py b/genesis/engine/sensors/tactile_shared.py
index 3c8866fa50..b16ef66df7 100644
--- a/genesis/engine/sensors/tactile_shared.py
+++ b/genesis/engine/sensors/tactile_shared.py
@@ -11,6 +11,7 @@
 from genesis.utils.misc import concat_with_tensor, make_tensor_field
 
 if TYPE_CHECKING:
+    from genesis.engine.sensors.raycaster import _SolverBVH
     from genesis.utils.ring_buffer import TensorRingBuffer
 
 
@@ -433,6 +434,30 @@ class ContactPrefilterMetadataMixin:
     sensor_n_contacts: torch.Tensor = make_tensor_field((0, 0), dtype_factory=lambda: gs.tc_int)
 
 
+# ============================ Contact depth query mode (SDF vs raycast) ============================
+
+
+@dataclass
+class ContactDepthQueryMetadataMixin:
+    """
+    Shared per-sensor-class state for the contact-depth query backend.
+
+    ``contact_depth_query`` is the single resolved mode for every sensor of this class -- ``"sdf"`` or ``"raycast"``,
+    defaulting to ``None`` until the first sensor's ``build()`` sets it. Sensors of the same class must agree (mismatch
+    raises in ``build``); different classes may independently choose modes.
+
+    When mode is ``"raycast"``, ``collision_bvh`` holds a reference to the per-sim shared BVH list built lazily by
+    ``ensure_solver_bvhs`` (genesis/engine/sensors/raycaster.py). ``sensor_candidate_geom_mask`` is a per-(env, sensor)
+    bool gate scattered into per step from the contact prefilter list (KinematicTactile family) or once at build from
+    ``sensor_track_geom_idx`` (ElastomerTaxel); BVH leaves whose ``faces_info.geom_idx`` falls outside the mask are
+    skipped, preserving the SDF path's per-sensor geom-filtering semantics.
+    """
+
+    contact_depth_query: str | None = None
+    collision_bvh: "list[_SolverBVH] | None" = None
+    sensor_candidate_geom_mask: torch.Tensor = make_tensor_field((0, 0, 0), dtype_factory=lambda: gs.tc_bool)
+
+
 # ============================ ViscoelasticHysteresis ============================
 
 
diff --git a/genesis/options/sensors/options.py b/genesis/options/sensors/options.py
index 9d21644114..5dd19118df 100644
--- a/genesis/options/sensors/options.py
+++ b/genesis/options/sensors/options.py
@@ -247,8 +247,16 @@ class ProbesWithNormalSensorOptionsMixin(ProbeSensorOptionsMixin[SensorT]):
 
     probe_local_normal: UnitVec3FType | UnitVec3FArrayType | UnitVec3FGridType = (0.0, 0.0, 1.0)
 
+    def _probe_local_normal_required(self) -> bool:
+        """Override in subclasses where ``probe_local_normal`` is only consumed by an opt-in mode (e.g. raycast
+        contact-depth queries). When ``False``, the per-probe shape validation in ``model_post_init`` is skipped --
+        sensors that never read the normal don't surface confusing length errors for the default value."""
+        return True
+
     def model_post_init(self, context: Any) -> None:
         super().model_post_init(context)
+        if not self._probe_local_normal_required():
+            return
         n_probes = int(np.prod(np.asarray(self.probe_local_pos).shape[:-1]))
         normals = np.asarray(self.probe_local_normal)
         if normals.ndim > 1 and normals.size // 3 != n_probes:
diff --git a/genesis/options/sensors/tactile.py b/genesis/options/sensors/tactile.py
index e335154bdb..0fd5aebcd5 100644
--- a/genesis/options/sensors/tactile.py
+++ b/genesis/options/sensors/tactile.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Literal
 
 import numpy as np
 from pydantic import Field, StrictBool
@@ -107,6 +107,12 @@ class TactileProbeSensorOptionsMixin(ProbeSensorOptionsMixin[SensorT]):
         is untouched.
     dead_taxel_value_range : (float, float), optional
         Uniform range for the dead value sampled per channel on reset. Default ``(0.0, 0.0)``.
+    contact_depth_query : {"sdf", "raycast"} or None, optional
+        Per-probe contact-depth backend. ``"sdf"`` queries the per-geom analytic SDF grid (fast, exact for primitives,
+        requires SDF activation). ``"raycast"`` walks the rigid solver's per-frame collision-mesh BVH and combines a
+        sphere/triangle closest-point test with a ray/triangle test along ``-probe_normal`` (handles arbitrary meshes
+        uniformly, shares the BVH with ``RaycasterSensor``). ``None`` (default) defers the choice: all sensors of the
+        same class must agree, and the resolved mode is ``"sdf"`` if no sensor of that class sets it.
     """
 
     debug_contact_color: UnitIntervalVec3Type = (1.0, 0.2, 0.0)
@@ -115,6 +121,7 @@ class TactileProbeSensorOptionsMixin(ProbeSensorOptionsMixin[SensorT]):
     probe_gain_resample_range: PositiveVec2FType | None = None
     dead_taxel_probability: NonNegativeFloat = 0.0
     dead_taxel_value_range: Vec2FType = (0.0, 0.0)
+    contact_depth_query: Literal["sdf", "raycast"] | None = None
 
     def model_post_init(self, context: Any) -> None:
         super().model_post_init(context)
@@ -139,7 +146,10 @@ class PointCloudTactileSensorMixin(TactileProbeSensorOptionsMixin[SensorT]):
     track_link_idx : array-like[int]
         Global link indices whose mesh geometry is used to sample a point cloud from.
     n_sample_points: int | array-like[int]
-        Total FPS samples split across ``track_link_idx``, or one count per tracked link.
+        Total FPS samples split across ``track_link_idx``, or one count per tracked link. Per-variant
+        counts are not supported: when a tracked link belongs to a heterogeneous entity, the per-link
+        count is allocated to every variant on that link (so each parallel environment sees the full
+        count regardless of which variant is active).
     use_visual_mesh : bool
         Whether to use the visual mesh when sampling the point cloud.
     debug_point_cloud_color : array-like[float, float, float, float]
@@ -160,8 +170,13 @@ class ContactProbe(
     RigidSensorOptionsMixin["ContactProbeSensor"],
     SimpleSensorOptions["ContactProbeSensor"],
     TactileProbeSensorOptionsMixin["ContactProbeSensor"],
+    ProbesWithNormalSensorOptionsMixin["ContactProbeSensor"],
     ViscoelasticHysteresisOptionsMixin["ContactProbeSensor"],
 ):
+    def _probe_local_normal_required(self) -> bool:
+        # SDF mode ignores probe_normal entirely; raycast mode uses it for the per-leaf raycast subtest.
+        return self.contact_depth_query == "raycast"
+
     """
     Returns boolean contact per probe based on the contact depth threshold.
 
@@ -197,12 +212,17 @@ class ContactDepthProbe(
     RigidSensorOptionsMixin["ContactDepthProbeSensor"],
     SimpleSensorOptions["ContactDepthProbeSensor"],
     TactileProbeSensorOptionsMixin["ContactDepthProbeSensor"],
+    ProbesWithNormalSensorOptionsMixin["ContactDepthProbeSensor"],
     ViscoelasticHysteresisOptionsMixin["ContactDepthProbeSensor"],
 ):
     """
     Returns contact depth in meters per probe.
     """
 
+    def _probe_local_normal_required(self) -> bool:
+        # SDF mode ignores probe_normal entirely; raycast mode uses it for the per-leaf raycast subtest.
+        return self.contact_depth_query == "raycast"
+
 
 class KinematicTaxel(
     RigidSensorOptionsMixin["KinematicTaxelSensor"],
diff --git a/genesis/utils/raycast_qd.py b/genesis/utils/raycast_qd.py
index 8eed2b27cc..376d8e9298 100644
--- a/genesis/utils/raycast_qd.py
+++ b/genesis/utils/raycast_qd.py
@@ -108,10 +108,7 @@ def bvh_ray_cast(
                 if hit_result.w > 0.0 and hit_result.x < closest_distance and hit_result.x >= 0.0:
                     closest_distance = hit_result.x
                     hit_face = i_f
-                    # Compute triangle normal
-                    edge1 = v1 - v0
-                    edge2 = v2 - v0
-                    hit_normal = edge1.cross(edge2).normalized()
+                    hit_normal = triangle_face_normal(v0, v1, v2)
             else:  # Internal node
                 # Push children onto stack
                 if stack_idx < qd.static(STACK_SIZE - 2):
@@ -226,6 +223,71 @@ def ray_aabb_intersection(
     return result
 
 
+@qd.func
+def closest_point_on_triangle(
+    point: qd.types.vector(3),
+    v0: qd.types.vector(3),
+    v1: qd.types.vector(3),
+    v2: qd.types.vector(3),
+) -> qd.types.vector(3):
+    """
+    Closest point on a triangle to a query point. Reference: Christer Ericson, Real-Time Collision Detection §5.1.5.
+    """
+    ab = v1 - v0
+    ac = v2 - v0
+    ap = point - v0
+
+    d1 = ab.dot(ap)
+    d2 = ac.dot(ap)
+
+    closest = v0
+    if not (d1 <= 0.0 and d2 <= 0.0):
+        bp = point - v1
+        d3 = ab.dot(bp)
+        d4 = ac.dot(bp)
+
+        if d3 >= 0.0 and d4 <= d3:
+            closest = v1
+        else:
+            cp = point - v2
+            d5 = ab.dot(cp)
+            d6 = ac.dot(cp)
+
+            if d6 >= 0.0 and d5 <= d6:
+                closest = v2
+            else:
+                vc = d1 * d4 - d3 * d2
+                if vc <= 0.0 and d1 >= 0.0 and d3 <= 0.0:
+                    w = d1 / (d1 - d3)
+                    closest = v0 + w * ab
+                else:
+                    vb = d5 * d2 - d1 * d6
+                    if vb <= 0.0 and d2 >= 0.0 and d6 <= 0.0:
+                        w = d2 / (d2 - d6)
+                        closest = v0 + w * ac
+                    else:
+                        va = d3 * d6 - d5 * d4
+                        if va <= 0.0 and (d4 - d3) >= 0.0 and (d5 - d6) >= 0.0:
+                            w = (d4 - d3) / ((d4 - d3) + (d5 - d6))
+                            closest = v1 + w * (v2 - v1)
+                        else:
+                            denom = 1.0 / (va + vb + vc)
+                            v = vb * denom
+                            w = vc * denom
+                            closest = v0 + v * ab + w * ac
+    return closest
+
+
+@qd.func
+def triangle_face_normal(
+    v0: qd.types.vector(3),
+    v1: qd.types.vector(3),
+    v2: qd.types.vector(3),
+) -> qd.types.vector(3):
+    """Outward unit normal of the triangle (v0, v1, v2) under right-hand winding."""
+    return (v1 - v0).cross(v2 - v0).normalized()
+
+
 @qd.func
 def update_aabbs(
     free_verts_state: array_class.VertsState,
@@ -362,9 +424,7 @@ def bvh_ray_cast_visual(
                 if hit_result.w > 0.0 and hit_result.x < closest_distance and hit_result.x >= 0.0:
                     closest_distance = hit_result.x
                     hit_face = i_f
-                    edge1 = v1 - v0
-                    edge2 = v2 - v0
-                    hit_normal = edge1.cross(edge2).normalized()
+                    hit_normal = triangle_face_normal(v0, v1, v2)
             else:
                 if stack_idx < qd.static(STACK_SIZE - 2):
                     node_stack[stack_idx] = node.left
diff --git a/tests/test_sensors.py b/tests/test_sensors.py
index bf4edf26a2..c34eea7a3c 100644
--- a/tests/test_sensors.py
+++ b/tests/test_sensors.py
@@ -2796,6 +2796,16 @@ def test_tactile_sensors_heterogeneous_object(show_viewer, tol):
     )
 
     scene.build(n_envs=2)
+
+    # Per-variant sampling: each heterogeneous variant must receive the full n_sample_points budget so
+    # every parallel env sees the requested point count regardless of which variant is active there.
+    for pc_sensor, n_requested in ((proximity_taxel, 800), (elastomer_taxel, 800)):
+        meta = pc_sensor._shared_metadata
+        pc_start = int(meta.sensor_pc_start[pc_sensor._idx].item())
+        pc_end = pc_start + int(meta.sensor_pc_n[pc_sensor._idx].item())
+        per_env_active = meta.pc_active_envs_mask[pc_start:pc_end].sum(dim=0)
+        assert_equal(per_env_active, torch.full_like(per_env_active, n_requested))
+
     obj.set_pos(
         [
             [0.0, 0.0, PAD_TOP_Z + OBJECT_Z_SIZE / 2 - PENETRATION],
@@ -2821,6 +2831,84 @@ def test_tactile_sensors_heterogeneous_object(show_viewer, tol):
     assert surface_distance[0, 1] < surface_distance[1, 1]
 
 
+@pytest.mark.required
+def test_tactile_contact_depth_query_sdf_vs_raycast_parity(show_viewer):
+    """SDF and raycast contact-depth backends should agree on a face-on contact across all four probe sensors."""
+    PAD_SIZE = (0.2, 0.2, 0.05)
+    PAD_TOP_Z = PAD_SIZE[2]
+    BALL_R = 0.04
+    PROBE_R = 0.01
+    CENTER_PROBE = (0.0, 0.0, PAD_SIZE[2] / 2)
+
+    scene = gs.Scene(
+        sim_options=gs.options.SimOptions(gravity=(0.0, 0.0, 0.0)),
+        profiling_options=gs.options.ProfilingOptions(show_FPS=False),
+        show_viewer=show_viewer,
+    )
+    pad = scene.add_entity(gs.morphs.Box(size=PAD_SIZE, pos=(0.0, 0.0, PAD_SIZE[2] / 2), fixed=True))
+    ball = scene.add_entity(gs.morphs.Sphere(radius=BALL_R, pos=(0.0, 0.0, 0.4)))
+
+    common = dict(entity_idx=pad.idx, probe_local_pos=(CENTER_PROBE,), probe_radius=PROBE_R)
+    depth_sdf = scene.add_sensor(gs.sensors.ContactDepthProbe(contact_depth_query="sdf", **common))
+    depth_ray = scene.add_sensor(gs.sensors.ContactDepthProbe(contact_depth_query="raycast", **common))
+    kin_sdf = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            probe_local_normal=(0.0, 0.0, 1.0),
+            normal_stiffness=100.0,
+            normal_damping=0.0,
+            shear_scalar=0.0,
+            twist_scalar=0.0,
+            contact_depth_query="sdf",
+            **common,
+        )
+    )
+    kin_ray = scene.add_sensor(
+        gs.sensors.KinematicTaxel(
+            probe_local_normal=(0.0, 0.0, 1.0),
+            normal_stiffness=100.0,
+            normal_damping=0.0,
+            shear_scalar=0.0,
+            twist_scalar=0.0,
+            contact_depth_query="raycast",
+            **common,
+        )
+    )
+    elast_common = dict(
+        entity_idx=pad.idx,
+        probe_local_pos=(CENTER_PROBE,),
+        probe_local_normal=(0.0, 0.0, 1.0),
+        probe_radius=PROBE_R,
+        track_link_idx=(ball.base_link_idx,),
+        n_sample_points=200,
+    )
+    elast_sdf = scene.add_sensor(gs.sensors.ElastomerTaxel(contact_depth_query="sdf", **elast_common))
+    elast_ray = scene.add_sensor(gs.sensors.ElastomerTaxel(contact_depth_query="raycast", **elast_common))
+    scene.build(n_envs=0)
+
+    ball.set_pos((0.0, 0.0, PAD_TOP_Z + BALL_R - 0.005))  # 5mm penetration
+    scene.step()
+
+    # ContactDepthProbe -- the probe-radius offset semantics mean the actual numeric values are similar in both modes.
+    sdf_d = depth_sdf.read_ground_truth()
+    ray_d = depth_ray.read_ground_truth()
+    assert (sdf_d > gs.EPS).all() and (ray_d > gs.EPS).all()
+    assert_allclose(sdf_d, ray_d, tol=0.1 * PROBE_R)
+
+    # KinematicTaxel force: both modes report a force in the same direction with magnitude within mesh-discretization
+    # tolerance of each other.
+    sdf_f = kin_sdf.read_ground_truth().force.reshape(-1, 3)
+    ray_f = kin_ray.read_ground_truth().force.reshape(-1, 3)
+    assert torch.linalg.norm(sdf_f, dim=-1).item() > 0
+    assert torch.linalg.norm(ray_f, dim=-1).item() > 0
+    cos_sim = (sdf_f * ray_f).sum(dim=-1) / (
+        torch.linalg.norm(sdf_f, dim=-1) * torch.linalg.norm(ray_f, dim=-1) + gs.EPS
+    )
+    assert (cos_sim > 0.9).all(), f"force direction mismatch: cos_sim={cos_sim}"
+
+    # ElastomerTaxel dilate displacement: face-on contact, identical on both modes when geom is a sphere primitive.
+    assert_allclose(elast_sdf.read_ground_truth(), elast_ray.read_ground_truth(), tol=0.1 * PROBE_R)
+
+
 # ------------------------------------------------------------------------------------------
 # ----------------------------------- Bulk read API ----------------------------------------
 # ------------------------------------------------------------------------------------------

From 10e9993b4de74bd6d8d1e27a454df2769187d21a Mon Sep 17 00:00:00 2001
From: Trinity Chung <trinityjchung@gmail.com>
Date: Thu, 28 May 2026 00:05:53 -0400
Subject: [PATCH 7/7] cleanup

---
 genesis/engine/sensors/kinematic_tactile.py   | 46 +++++--------------
 genesis/engine/sensors/point_cloud_tactile.py | 23 ++++------
 genesis/engine/sensors/tactile_shared.py      | 17 ++++---
 tests/test_sensors.py                         | 43 ++++++++++++++---
 4 files changed, 67 insertions(+), 62 deletions(-)

diff --git a/genesis/engine/sensors/kinematic_tactile.py b/genesis/engine/sensors/kinematic_tactile.py
index 456573eeb7..dd26ea22bc 100644
--- a/genesis/engine/sensors/kinematic_tactile.py
+++ b/genesis/engine/sensors/kinematic_tactile.py
@@ -24,6 +24,8 @@
     triangle_face_normal,
 )
 
+from .raycaster import ensure_solver_bvhs, update_solver_bvhs
+
 from .base_sensor import RigidSensorMetadataMixin, RigidSensorMixin, SimpleSensor, SimpleSensorMetadata
 from .probe import (
     ProbeSensorMetadataMixin,
@@ -906,41 +908,17 @@ def _kernel_kinematic_taxel_bvh(
                 output_measured[torque_start + j, i_b] = torque_m[j]
 
 
-def _resolve_query_mode(shared_metadata) -> str:
-    """Resolve ``shared_metadata.contact_depth_query`` to ``"sdf"`` or ``"raycast"``. ``None`` defaults to ``"sdf"``
-    and is latched in so subsequent calls short-circuit."""
-    mode = shared_metadata.contact_depth_query
-    if mode is None:
-        mode = "sdf"
-        shared_metadata.contact_depth_query = mode
-    return mode
-
-
-def _ensure_candidate_geom_mask(shared_metadata, B: int, n_sensors: int, n_geoms: int) -> None:
-    """(Re)allocate ``sensor_candidate_geom_mask`` to ``(B, n_sensors, n_geoms)`` if its current shape doesn't match.
-    Sized lazily because the resolved mode may flip to ``"raycast"`` only after all sensors of this class have built."""
-    target = (B, n_sensors, n_geoms)
-    current = tuple(shared_metadata.sensor_candidate_geom_mask.shape)
-    if current != target:
-        shared_metadata.sensor_candidate_geom_mask = torch.zeros(target, dtype=gs.tc_bool, device=gs.device)
-
-
 class KinematicTactileSensorMixin(ProbeSensorMixin[ProbesWithNormalSensorSharedMetadataT]):
     def build(self):
         super().build()
-        # SDF activation is the default fallback; raycast mode does not require it but the call is idempotent and
-        # cheap so we keep it unconditional to avoid needing a build-finalize hook.
+        # SDF activation is the default fallback; the idempotent call keeps it active even in mixed-mode scenes.
         self._shared_metadata.solver.collider.activate_sdf()
 
-        # Last-value-wins propagation of the contact_depth_query option onto the per-sensor-type shared metadata. If
-        # any sensor of this class opts into "raycast", ensure the per-sim shared collision BVH list exists so it's
-        # ready by the time _update_current_timestep_data runs.
+        # Last-value-wins propagation of contact_depth_query. Build the shared collision BVH lazily on raycast opt-in.
         mode = self._options.contact_depth_query
         if mode is not None:
             self._shared_metadata.contact_depth_query = mode
         if self._shared_metadata.contact_depth_query == "raycast":
-            from genesis.engine.sensors.raycaster import ensure_solver_bvhs
-
             self._shared_metadata.collision_bvh = ensure_solver_bvhs(self._manager._sim)
 
 
@@ -1003,7 +981,7 @@ def _update_current_timestep_data(
             shared_metadata.sensor_n_contacts,
         )
 
-        if _resolve_query_mode(shared_metadata) == "sdf":
+        if (shared_metadata.contact_depth_query or "sdf") == "sdf":
             _kernel_contact_depth_probe(
                 shared_metadata.probe_positions,
                 shared_metadata.probe_sensor_idx,
@@ -1024,10 +1002,10 @@ def _update_current_timestep_data(
                 measured_cols_b,
             )
         else:
-            from genesis.engine.sensors.raycaster import update_solver_bvhs
-
             B, n_sensors = shared_metadata.sensor_n_contacts.shape
-            _ensure_candidate_geom_mask(shared_metadata, B, n_sensors, solver.n_geoms)
+            mask_shape = (B, n_sensors, solver.n_geoms)
+            if tuple(shared_metadata.sensor_candidate_geom_mask.shape) != mask_shape:
+                shared_metadata.sensor_candidate_geom_mask = torch.zeros(mask_shape, dtype=gs.tc_bool, device=gs.device)
             _kernel_build_sensor_candidate_geom_mask(
                 shared_metadata.sensor_contacts_idx,
                 shared_metadata.sensor_n_contacts,
@@ -1444,7 +1422,7 @@ def _update_current_timestep_data(
             shared_metadata.sensor_n_contacts,
         )
 
-        if _resolve_query_mode(shared_metadata) == "sdf":
+        if (shared_metadata.contact_depth_query or "sdf") == "sdf":
             _kernel_kinematic_taxel(
                 shared_metadata.probe_positions,
                 shared_metadata.probe_sensor_idx,
@@ -1475,10 +1453,10 @@ def _update_current_timestep_data(
                 measured_cols_b,
             )
         else:
-            from genesis.engine.sensors.raycaster import update_solver_bvhs
-
             B, n_sensors = shared_metadata.sensor_n_contacts.shape
-            _ensure_candidate_geom_mask(shared_metadata, B, n_sensors, solver.n_geoms)
+            mask_shape = (B, n_sensors, solver.n_geoms)
+            if tuple(shared_metadata.sensor_candidate_geom_mask.shape) != mask_shape:
+                shared_metadata.sensor_candidate_geom_mask = torch.zeros(mask_shape, dtype=gs.tc_bool, device=gs.device)
             _kernel_build_sensor_candidate_geom_mask(
                 shared_metadata.sensor_contacts_idx,
                 shared_metadata.sensor_n_contacts,
diff --git a/genesis/engine/sensors/point_cloud_tactile.py b/genesis/engine/sensors/point_cloud_tactile.py
index 455a533cd3..353c0ca565 100644
--- a/genesis/engine/sensors/point_cloud_tactile.py
+++ b/genesis/engine/sensors/point_cloud_tactile.py
@@ -29,6 +29,7 @@
     func_noised_probe_radius,
     get_measured_bufs,
 )
+from .raycaster import ensure_solver_bvhs, update_solver_bvhs
 from .tactile_shared import (
     BVH_LEAF_SIZE,
     BVH_STACK_SIZE,
@@ -47,6 +48,11 @@
     register_grid_fft_sensor,
 )
 
+# Conservative cap for global-BVH closest-point walks in raycast mode. Points farther than this from every candidate
+# triangle map to depth = 0 (so the elastomer "out of contact" branch fires). Sized to cover realistic elastomer
+# penetrations -- bumping it widens BVH traversal cost but doesn't change correctness for in-contact probes.
+_ELASTOMER_RAYCAST_QUERY_DIST = 0.1
+
 if TYPE_CHECKING:
     from genesis.options.sensors import SensorOptions
     from genesis.utils.ring_buffer import TensorRingBuffer
@@ -1774,17 +1780,14 @@ def build(self):
         if self._link is None:
             gs.raise_exception("ElastomerTaxel must be attached to a rigid link with collision geometry.")
 
-        # Last-value-wins propagation of contact_depth_query. SDF activation is skipped only when this sensor's own
-        # option is explicitly "raycast" -- with last-value-wins semantics this can leave SDF activated in mixed-mode
-        # configurations, but never broken (an unused activated SDF is harmless).
+        # Last-value-wins propagation of contact_depth_query. Skip activate_sdf only when this sensor explicitly opts
+        # into raycast -- with last-wins, mixed-mode scenes may end up with SDF activated unused (harmless).
         mode = self._options.contact_depth_query
         if mode is not None:
             self._shared_metadata.contact_depth_query = mode
         if mode != "raycast":
             solver.collider.activate_sdf()
         if self._shared_metadata.contact_depth_query == "raycast":
-            from genesis.engine.sensors.raycaster import ensure_solver_bvhs
-
             self._shared_metadata.collision_bvh = ensure_solver_bvhs(self._manager._sim)
 
         elastomer_geom_start_row = self._shared_metadata.elastomer_geom_idx.shape[0]
@@ -2032,21 +2035,15 @@ def _update_current_timestep_data(
                 shared_metadata.probe_depth_buf,
             )
         else:
-            from genesis.engine.sensors.raycaster import update_solver_bvhs
-
-            shared_metadata.contact_depth_query = "raycast"  # latch
             update_solver_bvhs(shared_metadata.collision_bvh)
             rigid_entry = next(e for e in shared_metadata.collision_bvh if e.raycast_mask is None)
-            # Conservative BVH walk cap. Far enough to cover any realistic elastomer penetration; probes outside
-            # this radius from every candidate triangle map to depth = 0.
-            max_query_dist = float(0.1)
             _kernel_elastomer_probe_depth_bvh(
                 shared_metadata.probe_positions,
                 shared_metadata.probe_sensor_idx,
                 shared_metadata.probe_radii,
                 shared_metadata.links_idx,
                 shared_metadata.sensor_candidate_geom_mask,
-                max_query_dist,
+                _ELASTOMER_RAYCAST_QUERY_DIST,
                 rigid_entry.bvh.nodes,
                 rigid_entry.bvh.morton_codes,
                 solver.links_state,
@@ -2146,7 +2143,7 @@ def _update_current_timestep_data(
                     shared_metadata.elastomer_contact_sdf_enter,
                     shared_metadata.elastomer_contact_sdf_exit,
                     _ELASTOMER_QUERY_AABB_MARGIN,
-                    float(0.1),  # max_query_dist for the global-BVH closest-point search
+                    _ELASTOMER_RAYCAST_QUERY_DIST,
                     rigid_entry.bvh.nodes,
                     rigid_entry.bvh.morton_codes,
                     solver.links_state,
diff --git a/genesis/engine/sensors/tactile_shared.py b/genesis/engine/sensors/tactile_shared.py
index b16ef66df7..00b9e0cf3e 100644
--- a/genesis/engine/sensors/tactile_shared.py
+++ b/genesis/engine/sensors/tactile_shared.py
@@ -442,15 +442,14 @@ class ContactDepthQueryMetadataMixin:
     """
     Shared per-sensor-class state for the contact-depth query backend.
 
-    ``contact_depth_query`` is the single resolved mode for every sensor of this class -- ``"sdf"`` or ``"raycast"``,
-    defaulting to ``None`` until the first sensor's ``build()`` sets it. Sensors of the same class must agree (mismatch
-    raises in ``build``); different classes may independently choose modes.
-
-    When mode is ``"raycast"``, ``collision_bvh`` holds a reference to the per-sim shared BVH list built lazily by
-    ``ensure_solver_bvhs`` (genesis/engine/sensors/raycaster.py). ``sensor_candidate_geom_mask`` is a per-(env, sensor)
-    bool gate scattered into per step from the contact prefilter list (KinematicTactile family) or once at build from
-    ``sensor_track_geom_idx`` (ElastomerTaxel); BVH leaves whose ``faces_info.geom_idx`` falls outside the mask are
-    skipped, preserving the SDF path's per-sensor geom-filtering semantics.
+    ``contact_depth_query`` is the resolved mode for every sensor of this class -- ``"sdf"`` or ``"raycast"``. Each
+    sensor's ``build()`` overwrites the field with its own option when non-``None`` (last-value-wins); ``None`` at
+    update time falls back to ``"sdf"``.
+
+    When mode is ``"raycast"``, ``collision_bvh`` references the per-sim shared BVH list built by
+    ``ensure_solver_bvhs``. ``sensor_candidate_geom_mask`` is a per-(env, sensor, geom) bool gate -- scattered per
+    step from the contact prefilter (KinematicTactile family) or once at build from ``sensor_track_geom_idx``
+    (ElastomerTaxel) -- so BVH leaves whose ``faces_info.geom_idx`` falls outside the mask are skipped.
     """
 
     contact_depth_query: str | None = None
diff --git a/tests/test_sensors.py b/tests/test_sensors.py
index c34eea7a3c..5d13fc76c6 100644
--- a/tests/test_sensors.py
+++ b/tests/test_sensors.py
@@ -2845,12 +2845,33 @@ def test_tactile_contact_depth_query_sdf_vs_raycast_parity(show_viewer):
         profiling_options=gs.options.ProfilingOptions(show_FPS=False),
         show_viewer=show_viewer,
     )
-    pad = scene.add_entity(gs.morphs.Box(size=PAD_SIZE, pos=(0.0, 0.0, PAD_SIZE[2] / 2), fixed=True))
-    ball = scene.add_entity(gs.morphs.Sphere(radius=BALL_R, pos=(0.0, 0.0, 0.4)))
+    pad = scene.add_entity(
+        gs.morphs.Box(
+            size=PAD_SIZE,
+            pos=(0.0, 0.0, PAD_SIZE[2] / 2),
+            fixed=True,
+        ),
+    )
+    ball = scene.add_entity(
+        gs.morphs.Sphere(
+            radius=BALL_R,
+            pos=(0.0, 0.0, 0.4),
+        ),
+    )
 
     common = dict(entity_idx=pad.idx, probe_local_pos=(CENTER_PROBE,), probe_radius=PROBE_R)
-    depth_sdf = scene.add_sensor(gs.sensors.ContactDepthProbe(contact_depth_query="sdf", **common))
-    depth_ray = scene.add_sensor(gs.sensors.ContactDepthProbe(contact_depth_query="raycast", **common))
+    depth_sdf = scene.add_sensor(
+        gs.sensors.ContactDepthProbe(
+            contact_depth_query="sdf",
+            **common,
+        ),
+    )
+    depth_ray = scene.add_sensor(
+        gs.sensors.ContactDepthProbe(
+            contact_depth_query="raycast",
+            **common,
+        ),
+    )
     kin_sdf = scene.add_sensor(
         gs.sensors.KinematicTaxel(
             probe_local_normal=(0.0, 0.0, 1.0),
@@ -2881,8 +2902,18 @@ def test_tactile_contact_depth_query_sdf_vs_raycast_parity(show_viewer):
         track_link_idx=(ball.base_link_idx,),
         n_sample_points=200,
     )
-    elast_sdf = scene.add_sensor(gs.sensors.ElastomerTaxel(contact_depth_query="sdf", **elast_common))
-    elast_ray = scene.add_sensor(gs.sensors.ElastomerTaxel(contact_depth_query="raycast", **elast_common))
+    elast_sdf = scene.add_sensor(
+        gs.sensors.ElastomerTaxel(
+            contact_depth_query="sdf",
+            **elast_common,
+        ),
+    )
+    elast_ray = scene.add_sensor(
+        gs.sensors.ElastomerTaxel(
+            contact_depth_query="raycast",
+            **elast_common,
+        ),
+    )
     scene.build(n_envs=0)
 
     ball.set_pos((0.0, 0.0, PAD_TOP_Z + BALL_R - 0.005))  # 5mm penetration