diff --git a/Cargo.lock b/Cargo.lock
index bc6799a91..a3cc14077 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5274,16 +5274,8 @@ dependencies = [
 name = "sedona-libgpuspatial"
 version = "0.3.0"
 dependencies = [
- "arrow-array",
- "arrow-schema",
  "bindgen",
  "cmake",
- "log",
- "sedona-expr",
- "sedona-geos",
- "sedona-schema",
- "sedona-testing",
- "thiserror 2.0.17",
  "which",
 ]
 
diff --git a/c/sedona-libgpuspatial/Cargo.toml b/c/sedona-libgpuspatial/Cargo.toml
index f271cd57a..01840813c 100644
--- a/c/sedona-libgpuspatial/Cargo.toml
+++ b/c/sedona-libgpuspatial/Cargo.toml
@@ -35,15 +35,3 @@ gpu = []
 bindgen = "0.72.1"
 cmake = "0.1"
 which = "8.0"
-
-[dependencies]
-arrow-array = { workspace = true, features = ["ffi"] }
-arrow-schema = { workspace = true }
-thiserror = { workspace = true }
-log = "0.4"
-sedona-schema = { path = "../../rust/sedona-schema" }
-
-[dev-dependencies]
-sedona-expr = { path = "../../rust/sedona-expr" }
-sedona-geos = { path = "../sedona-geos" }
-sedona-testing = { path = "../../rust/sedona-testing" }
diff --git a/c/sedona-libgpuspatial/build.rs b/c/sedona-libgpuspatial/build.rs
index 6bf5f3f8b..dacaaccef 100644
--- a/c/sedona-libgpuspatial/build.rs
+++ b/c/sedona-libgpuspatial/build.rs
@@ -119,10 +119,18 @@ fn main() {
                 println!("cargo:warning=CMAKE_CUDA_ARCHITECTURES environment variable not set. Defaulting to '86;89'.");
                 "86;89".to_string()
             });
+        // Determine the build profile to match Cargo's debug/release mode
+        let profile_mode = if cfg!(debug_assertions) {
+            "Debug"
+        } else {
+            "Release"
+        };
+
         let dst = cmake::Config::new("./libgpuspatial")
             .define("CMAKE_CUDA_ARCHITECTURES", cuda_architectures)
             .define("CMAKE_POLICY_VERSION_MINIMUM", "3.5") // Allow older CMake versions
             .define("LIBGPUSPATIAL_LOGGING_LEVEL", "WARN") // Set logging level
+            .define("SPDLOG_FMT_EXTERNAL", "OFF") // Prevent spdlog from using external fmt library
             .build();
         let include_path = dst.join("include");
         println!(
@@ -157,6 +165,17 @@ fn main() {
         println!("cargo:rustc-link-lib=static=gpuspatial");
         println!("cargo:rustc-link-lib=static=rmm");
         println!("cargo:rustc-link-lib=static=rapids_logger");
+        // Use the 'd' suffix for the debug build of spdlog (libspdlogd.a)
+        let spdlog_lib_name = if cfg!(debug_assertions) {
+            "spdlogd"
+        } else {
+            "spdlog"
+        };
+        println!(
+            "cargo:warning=Linking spdlog in {} mode: lib{}.a",
+            profile_mode, spdlog_lib_name
+        );
+        println!("cargo:rustc-link-lib=static={}", spdlog_lib_name);
         println!("cargo:rustc-link-lib=static=geoarrow");
         println!("cargo:rustc-link-lib=static=nanoarrow");
         println!("cargo:rustc-link-lib=stdc++");
diff --git a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
index 773cf2061..eab272481 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
+++ b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
@@ -132,8 +132,13 @@ config_shaders(PTX_FILES)
 
 message("-- Config shader PTX files ${PTX_FILES}")
 
-add_library(gpuspatial src/rt/rt_engine.cpp src/relate_engine.cu src/spatial_joiner.cu
-                       ${PTX_FILES})
+add_library(gpuspatial
+            src/rt/rt_engine.cpp
+            src/memory_manager.cc
+            src/relate_engine.cu
+            src/rt_spatial_index.cu
+            src/rt_spatial_refiner.cu
+            ${PTX_FILES})
 
 # Link libraries
 target_link_libraries(gpuspatial
@@ -142,8 +147,7 @@ target_link_libraries(gpuspatial
                              cuda
                              rmm::rmm
                              rapids_logger::rapids_logger
-                             OptiX
-                      PRIVATE zstd)
+                             OptiX)
 
 # Set include directories
 target_include_directories(gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json b/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json
index 55248ea7f..0cb8a7fbb 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json
+++ b/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json
@@ -31,7 +31,7 @@
             "name": "default",
             "configurePreset": "default-with-tests",
             "environment": {
-                "GPUSPATIAL_TEST_DIR": "${sourceDir}/test_data"
+                "GPUSPATIAL_TEST_DIR": "${sourceDir}/test/data"
             }
         }
     ]
diff --git a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake
index 1f4d53c22..a7314c151 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake
+++ b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake
@@ -47,6 +47,7 @@ function(find_and_configure_geoarrow)
                   "BUILD_SHARED_LIBS OFF"
                   ${_exclude_from_all})
   set_target_properties(geoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
+  target_compile_options(geoarrow PRIVATE -Wno-conversion)
   rapids_export_find_package_root(BUILD
                                   geoarrow
                                   "${geoarrow_BINARY_DIR}"
diff --git a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake
index ecc3b4179..396831475 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake
+++ b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake
@@ -48,6 +48,7 @@ function(find_and_configure_nanoarrow)
                   "NANOARROW_NAMESPACE gpuspatial"
                   ${_exclude_from_all})
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
+  target_compile_options(nanoarrow PRIVATE -Wno-conversion)
   rapids_export_find_package_root(BUILD
                                   nanoarrow
                                   "${nanoarrow_BINARY_DIR}"
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.hpp
similarity index 89%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.hpp
index 9fb33fa8e..971f3565d 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.hpp
@@ -16,9 +16,9 @@
 // under the License.
 #pragma once
 
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/cuda_utils.h"
-#include "gpuspatial/utils/helpers.h"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
+#include "gpuspatial/utils/helpers.cuh"
 
 #include <optix_types.h>
 
@@ -86,22 +86,26 @@ class Box {
   }
 
   DEV_HOST_INLINE OptixAabb ToOptixAabb() const {
-    OptixAabb aabb;
+    OptixAabb aabb{0, 0, 0, 0, 0, 0};
 
-    memset(&aabb, 0, sizeof(OptixAabb));
-    if (sizeof(scalar_t) == sizeof(float)) {
+    if constexpr (sizeof(scalar_t) == sizeof(float)) {
       for (int dim = 0; dim < n_dim; dim++) {
-        reinterpret_cast<float*>(&aabb.minX)[dim] = min_.get_coordinate(dim);
-        reinterpret_cast<float*>(&aabb.maxX)[dim] = max_.get_coordinate(dim);
+        auto min_val = min_.get_coordinate(dim);
+        auto max_val = max_.get_coordinate(dim);
+        if (min_val == max_val) {
+          min_val = next_float_from_double(min_val, -1, 2);
+          max_val = next_float_from_double(max_val, 1, 2);
+        }
+        (&aabb.minX)[dim] = min_val;
+        (&aabb.maxX)[dim] = max_val;
       }
     } else {
       for (int dim = 0; dim < n_dim; dim++) {
         auto min_val = min_.get_coordinate(dim);
         auto max_val = max_.get_coordinate(dim);
 
-        reinterpret_cast<float*>(&aabb.minX)[dim] =
-            next_float_from_double(min_val, -1, 2);
-        reinterpret_cast<float*>(&aabb.maxX)[dim] = next_float_from_double(max_val, 1, 2);
+        (&aabb.minX)[dim] = next_float_from_double(min_val, -1, 2);
+        (&aabb.maxX)[dim] = next_float_from_double(max_val, 1, 2);
       }
     }
     return aabb;
@@ -137,6 +141,8 @@ class Box {
 
   DEV_HOST_INLINE scalar_t get_min(int dim) const { return min_.get_coordinate(dim); }
 
+  DEV_HOST_INLINE bool valid() const { return !min_.empty() && !max_.empty(); }
+
   DEV_HOST_INLINE const point_t& get_max() const { return max_; }
 
   DEV_HOST_INLINE scalar_t get_max(int dim) const { return max_.get_coordinate(dim); }
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.hpp
similarity index 95%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.hpp
index 433317190..66c7dee45 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.hpp
@@ -15,15 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/geom/geometry_type.cuh"
-#include "gpuspatial/geom/line_string.cuh"
-#include "gpuspatial/geom/multi_line_string.cuh"
-#include "gpuspatial/geom/multi_point.cuh"
-#include "gpuspatial/geom/multi_polygon.cuh"
-#include "gpuspatial/geom/point.cuh"
-#include "gpuspatial/geom/polygon.cuh"
-#include "gpuspatial/utils/array_view.h"
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/geom/geometry_type.hpp"
+#include "gpuspatial/geom/line_string.hpp"
+#include "gpuspatial/geom/multi_line_string.hpp"
+#include "gpuspatial/geom/multi_point.hpp"
+#include "gpuspatial/geom/multi_polygon.hpp"
+#include "gpuspatial/geom/point.hpp"
+#include "gpuspatial/geom/polygon.hpp"
+#include "gpuspatial/utils/array_view.hpp"
 
 namespace gpuspatial {
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_type.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_type.hpp
similarity index 100%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_type.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_type.hpp
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.hpp
similarity index 96%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.hpp
index 75f83f38e..a4eef0707 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.hpp
@@ -15,10 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/geom/point.cuh"
-#include "gpuspatial/utils/cuda_utils.h"
-#include "gpuspatial/utils/floating_point.h"
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/geom/point.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
+#include "gpuspatial/utils/floating_point.hpp"
 
 namespace gpuspatial {
 template <typename POINT_T>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.hpp
similarity index 96%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.hpp
index e0ddabe8e..00b57b0d9 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.hpp
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/geom/line_segment.cuh"
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/geom/line_segment.hpp"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 namespace gpuspatial {
 template <typename POINT_T>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.hpp
similarity index 96%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.hpp
index b6aae39f8..c5d84f1b6 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.hpp
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/geom/line_string.cuh"
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/geom/line_string.hpp"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 namespace gpuspatial {
 template <typename POINT_T, typename INDEX_T>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.hpp
similarity index 96%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.hpp
index e01938e75..e6bc5a226 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.hpp
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 namespace gpuspatial {
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.hpp
similarity index 99%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.hpp
index b1a443aec..9179789c6 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.hpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/geom/polygon.cuh"
+#include "gpuspatial/geom/polygon.hpp"
 
 namespace gpuspatial {
 template <typename POINT_T, typename INDEX_T>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.hpp
similarity index 94%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.hpp
index 500d9def5..006da8d4b 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.hpp
@@ -15,11 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/cuda_utils.h"
-#include "gpuspatial/utils/floating_point.h"
-#include "gpuspatial/utils/type_traits.h"
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
+#include "gpuspatial/utils/floating_point.hpp"
+#include "gpuspatial/utils/type_traits.hpp"
 
 namespace gpuspatial {
 enum class PointLocation {
@@ -73,7 +73,14 @@ class Point {
 
   DEV_HOST_INLINE const scalar_t* get_data() const { return &data_.x; }
 
-  DEV_HOST_INLINE bool empty() const { return std::isnan(data_.x); }
+  DEV_HOST_INLINE bool empty() const {
+    for (int dim = 0; dim < n_dim; dim++) {
+      if (std::isnan(get_coordinate(dim))) {
+        return true;
+      }
+    }
+    return false;
+  }
 
   DEV_HOST_INLINE void set_empty() {
     for (int dim = 0; dim < n_dim; dim++) {
@@ -102,11 +109,7 @@ class Point {
    * @brief Provides const access to the x-coordinate.
    * This method is only available if N_DIM >= 1.
    */
-  DEV_HOST_INLINE const scalar_t& x() const {
-    if constexpr (N_DIM >= 1) {
-      return data_.x;
-    }
-  }
+  DEV_HOST_INLINE const scalar_t& x() const { return data_.x; }
 
   /**
    * @brief Provides access to the y-coordinate.
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
similarity index 98%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
index 6ed66f168..e457a8fb2 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
@@ -16,11 +16,11 @@
 // under the License.
 #pragma once
 
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/geom/line_string.cuh"
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/cuda_utils.h"
-#include "gpuspatial/utils/floating_point.h"
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/geom/line_string.hpp"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
+#include "gpuspatial/utils/floating_point.hpp"
 
 #include <cub/block/block_reduce.cuh>
 #include <cub/warp/warp_reduce.cuh>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.hpp
similarity index 98%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.hpp
index 12963b845..b25a0ad9a 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.hpp
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/geom/point.cuh"
-#include "gpuspatial/utils/cuda_utils.h"
-#include "gpuspatial/utils/doubledouble.h"
+#include "gpuspatial/geom/point.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
+#include "gpuspatial/utils/doubledouble.hpp"
 
 namespace gpuspatial {
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h
index b31af58b0..01821ac09 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h
@@ -14,60 +14,203 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
+#include <stdbool.h>
 #include <stdint.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct GpuSpatialJoinerConfig {
-  uint32_t concurrency;
+struct ArrowSchema;
+struct ArrowArray;
+
+// Interfaces for ray-tracing engine (OptiX)
+struct GpuSpatialRuntimeConfig {
+  /** Path to PTX files */
   const char* ptx_root;
+  /** Device ID to use, 0 is the first GPU */
+  int device_id;
+  /** Whether to use CUDA memory pool for allocations */
+  bool use_cuda_memory_pool;
+  /** Initial memory pool size as a percentage of total GPU memory, between 0 and 100 */
+  int cuda_memory_pool_init_precent;
+};
+
+/** Opaque runtime for GPU spatial operations
+ * Each process should have exactly one instance of GpuSpatialRuntime
+ */
+struct GpuSpatialRuntime {
+  /** Initialize the runtime (OptiX) with the given configuration
+   * @return 0 on success, non-zero on failure
+   */
+  int (*init)(struct GpuSpatialRuntime* self, struct GpuSpatialRuntimeConfig* config);
+  void (*release)(struct GpuSpatialRuntime* self);
+  const char* (*get_last_error)(struct GpuSpatialRuntime* self);
+  void* private_data;
+};
+
+/** Create an instance of GpuSpatialRuntime */
+void GpuSpatialRuntimeCreate(struct GpuSpatialRuntime* runtime);
+
+struct GpuSpatialIndexConfig {
+  /** Pointer to an initialized GpuSpatialRuntime struct */
+  struct GpuSpatialRuntime* runtime;
+  /** How many threads will concurrently call Probe method */
+  uint32_t concurrency;
+};
+
+// An opaque context for concurrent probing
+struct SedonaSpatialIndexContext {
+  void* private_data;
+};
+
+struct SedonaFloatIndex2D {
+  /** Clear the spatial index, removing all built data */
+  int (*clear)(struct SedonaFloatIndex2D* self);
+  /** Create a new context for concurrent probing */
+  void (*create_context)(struct SedonaSpatialIndexContext* context);
+  /** Destroy a previously created context */
+  void (*destroy_context)(struct SedonaSpatialIndexContext* context);
+  /** Push rectangles for building the spatial index, each rectangle is represented by 4
+   * floats: [min_x, min_y, max_x, max_y].
+   * Points can also be indexed by providing degenerate rectangles [x, y, x, y].
+   *
+   * @return 0 on success, non-zero on failure
+   */
+  int (*push_build)(struct SedonaFloatIndex2D* self, const float* buf, uint32_t n_rects);
+  /**
+   * Finish building the spatial index after all rectangles have been pushed
+   *
+   * @return 0 on success, non-zero on failure
+   */
+  int (*finish_building)(struct SedonaFloatIndex2D* self);
+  /**
+   * Probe the spatial index with the given rectangles; each rectangle is represented by
+   * 4 floats: [min_x, min_y, max_x, max_y]. Points can also be probed by providing
+   * [x, y, x, y], but points and rectangles cannot be mixed in one Probe call. The
+   * results of the probe will be stored in the context.
+   *
+   * @return 0 on success, non-zero on failure
+   */
+  int (*probe)(struct SedonaFloatIndex2D* self, struct SedonaSpatialIndexContext* context,
+               const float* buf, uint32_t n_rects);
+  /** Get the build indices buffer from the context
+   *
+   * Writes the buffer pointer and its length to the two output parameters
+   */
+  void (*get_build_indices_buffer)(struct SedonaSpatialIndexContext* context,
+                                   uint32_t** build_indices,
+                                   uint32_t* build_indices_length);
+  /** Get the probe indices buffer from the context
+   *
+   * Writes the buffer pointer and its length to the two output parameters
+   */
+  void (*get_probe_indices_buffer)(struct SedonaSpatialIndexContext* context,
+                                   uint32_t** probe_indices,
+                                   uint32_t* probe_indices_length);
+  /** Get the last error message from the index
+   *
+   * @return A pointer to the error message string
+   */
+  const char* (*get_last_error)(struct SedonaFloatIndex2D* self);
+  /** Get the last error message from the context
+   *
+   * @return A pointer to the error message string
+   */
+  const char* (*context_get_last_error)(struct SedonaSpatialIndexContext* context);
+  /** Release the spatial index and free all resources */
+  void (*release)(struct SedonaFloatIndex2D* self);
+  void* private_data;
 };
 
-struct GpuSpatialJoinerContext {
-  const char* last_error;  // Pointer to std::string to store last error message
-  void* private_data;      // GPUSpatial context
-  void* build_indices;     // Pointer to std::vector<uint32_t> to store results
-  void* stream_indices;
+/** Create an instance of GpuSpatialIndex for 2D float rectangles/points
+ *  @return 0 on success, non-zero on failure
+ */
+int GpuSpatialIndexFloat2DCreate(struct SedonaFloatIndex2D* index,
+                                 const struct GpuSpatialIndexConfig* config);
+
+struct GpuSpatialRefinerConfig {
+  /** Pointer to an initialized GpuSpatialRuntime struct */
+  struct GpuSpatialRuntime* runtime;
+  /** How many threads will concurrently call the refine method */
+  uint32_t concurrency;
+  /** Whether to compress the BVH structures to save memory */
+  bool compress_bvh;
+  /** Number of batches to pipeline for parsing and refinement; setting to 1 disables
+   * pipelining */
+  uint32_t pipeline_batches;
 };
 
-enum GpuSpatialPredicate {
-  GpuSpatialPredicateEquals = 0,
-  GpuSpatialPredicateDisjoint,
-  GpuSpatialPredicateTouches,
-  GpuSpatialPredicateContains,
-  GpuSpatialPredicateCovers,
-  GpuSpatialPredicateIntersects,
-  GpuSpatialPredicateWithin,
-  GpuSpatialPredicateCoveredBy
+enum SedonaSpatialRelationPredicate {
+  SedonaSpatialPredicateEquals = 0,
+  SedonaSpatialPredicateDisjoint,
+  SedonaSpatialPredicateTouches,
+  SedonaSpatialPredicateContains,
+  SedonaSpatialPredicateCovers,
+  SedonaSpatialPredicateIntersects,
+  SedonaSpatialPredicateWithin,
+  SedonaSpatialPredicateCoveredBy
 };
 
-struct GpuSpatialJoiner {
-  int (*init)(struct GpuSpatialJoiner* self, struct GpuSpatialJoinerConfig* config);
-  void (*clear)(struct GpuSpatialJoiner* self);
-  void (*create_context)(struct GpuSpatialJoiner* self,
-                         struct GpuSpatialJoinerContext* context);
-  void (*destroy_context)(struct GpuSpatialJoinerContext* context);
-  int (*push_build)(struct GpuSpatialJoiner* self, const struct ArrowSchema* schema,
-                    const struct ArrowArray* array, int64_t offset, int64_t length);
-  int (*finish_building)(struct GpuSpatialJoiner* self);
-  int (*push_stream)(struct GpuSpatialJoiner* self,
-                     struct GpuSpatialJoinerContext* context,
-                     const struct ArrowSchema* schema, const struct ArrowArray* array,
-                     int64_t offset, int64_t length, enum GpuSpatialPredicate predicate,
-                     int32_t array_index_offset);
-  void (*get_build_indices_buffer)(struct GpuSpatialJoinerContext* context,
-                                   void** build_indices, uint32_t* build_indices_length);
-  void (*get_stream_indices_buffer)(struct GpuSpatialJoinerContext* context,
-                                    void** stream_indices,
-                                    uint32_t* stream_indices_length);
-  void (*release)(struct GpuSpatialJoiner* self);
+/** An opaque spatial refiner that can refine candidate pairs of geometries */
+struct SedonaSpatialRefiner {
+  /** Clear all built geometries from the refiner */
+  int (*clear)(struct SedonaSpatialRefiner* self);
+
+  int (*init_schema)(struct SedonaSpatialRefiner* self,
+                     const struct ArrowSchema* build_schema,
+                     const struct ArrowSchema* probe_schema);
+
+  /** Push geometries for building the spatial refiner
+   *
+   * @param build_array The Arrow array of the build geometries
+   * @return 0 on success, non-zero on failure
+   */
+  int (*push_build)(struct SedonaSpatialRefiner* self,
+                    const struct ArrowArray* build_array);
+  /**
+   * Finish building the spatial refiner after all geometries have been pushed
+   *
+   * @return 0 on success, non-zero on failure
+   */
+  int (*finish_building)(struct SedonaSpatialRefiner* self);
+
+  /**
+   * Refine candidate pairs of geometries
+   *
+   * @param probe_array The Arrow array of the probe geometries
+   * @param predicate The spatial relation predicate to evaluate
+   * @param build_indices An array of build-side indices corresponding to candidate pairs.
+   * This is a global index from 0 to N-1, where N is the total number of build geometries
+   * pushed.
+   * @param probe_indices An array of probe-side indices corresponding to candidate pairs.
+   * This is a local index from 0 to M - 1, where M is the number of geometries in the
+   * probe_array.
+   * @param indices_size The number of candidate pairs
+   * @param new_indices_size Output parameter to store the number of refined pairs
+   * @return 0 on success, non-zero on failure
+   */
+  int (*refine)(struct SedonaSpatialRefiner* self, const struct ArrowArray* probe_array,
+                enum SedonaSpatialRelationPredicate predicate, uint32_t* build_indices,
+                uint32_t* probe_indices, uint32_t indices_size,
+                uint32_t* new_indices_size);
+
+  /** Get the last error message
+   *
+   * @return A pointer to the error message string
+   */
+  const char* (*get_last_error)(struct SedonaSpatialRefiner* self);
+
+  /** Release the spatial refiner and free all resources */
+  void (*release)(struct SedonaSpatialRefiner* self);
   void* private_data;
-  const char* last_error;
 };
 
-void GpuSpatialJoinerCreate(struct GpuSpatialJoiner* index);
+/** Create an instance of GpuSpatialRefiner
+ * @return 0 on success, non-zero on failure
+ */
+int GpuSpatialRefinerCreate(struct SedonaSpatialRefiner* refiner,
+                            const struct GpuSpatialRefinerConfig* config);
 #ifdef __cplusplus
 }
 #endif
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/geometry_grouper.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/geometry_grouper.hpp
deleted file mode 100644
index 5dab852d1..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/geometry_grouper.hpp
+++ /dev/null
@@ -1,294 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/loader/device_geometries.cuh"
-#include "gpuspatial/utils/launcher.h"
-#include "gpuspatial/utils/morton_code.h"
-
-#include "rmm/cuda_stream_view.hpp"
-#include "rmm/device_uvector.hpp"
-#include "rmm/exec_policy.hpp"
-
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
-
-#include <memory>
-
-namespace gpuspatial {
-template <typename POINT_T, typename INDEX_T>
-class GeometryGrouper {
-  using box_t = Box<POINT_T>;
-  static constexpr int n_dim = POINT_T::n_dim;
-  using scalar_t = typename POINT_T::scalar_t;
-
- public:
-  void Group(const rmm::cuda_stream_view& stream,
-             const DeviceGeometries<POINT_T, INDEX_T>& geometries,
-             uint32_t geoms_per_aabb) {
-    switch (geometries.get_geometry_type()) {
-      case GeometryType::kPoint: {
-        Group(
-            stream,
-            geometries.template GetGeometryArrayView<PointArrayView<POINT_T, INDEX_T>>(),
-            geoms_per_aabb);
-        break;
-      }
-      case GeometryType::kMultiPoint: {
-        Group(stream,
-              geometries
-                  .template GetGeometryArrayView<MultiPointArrayView<POINT_T, INDEX_T>>(),
-              geoms_per_aabb);
-        break;
-      }
-      case GeometryType::kLineString: {
-        Group(stream,
-              geometries
-                  .template GetGeometryArrayView<LineStringArrayView<POINT_T, INDEX_T>>(),
-              geoms_per_aabb);
-        break;
-      }
-      case GeometryType::kMultiLineString: {
-        Group(stream,
-              geometries.template GetGeometryArrayView<
-                  MultiLineStringArrayView<POINT_T, INDEX_T>>(),
-              geoms_per_aabb);
-        break;
-      }
-      case GeometryType::kPolygon: {
-        Group(stream,
-              geometries
-                  .template GetGeometryArrayView<PolygonArrayView<POINT_T, INDEX_T>>(),
-              geoms_per_aabb);
-        break;
-      }
-      case GeometryType::kMultiPolygon: {
-        Group(
-            stream,
-            geometries
-                .template GetGeometryArrayView<MultiPolygonArrayView<POINT_T, INDEX_T>>(),
-            geoms_per_aabb);
-        break;
-      }
-      case GeometryType::kBox: {
-        Group(stream,
-              geometries.template GetGeometryArrayView<BoxArrayView<POINT_T, INDEX_T>>(),
-              geoms_per_aabb);
-        break;
-      }
-      default:
-        assert(false);
-    }
-  }
-
-  template <typename GEOMETRY_ARRAY_T>
-  void Group(const rmm::cuda_stream_view& stream, const GEOMETRY_ARRAY_T& geometries,
-             uint32_t geoms_per_aabb) {
-    rmm::device_uvector<INDEX_T> morton_codes(geometries.size(), stream);
-    POINT_T min_world_corner, max_world_corner;
-
-    min_world_corner.set_max();
-    max_world_corner.set_min();
-
-    for (int dim = 0; dim < n_dim; dim++) {
-      auto min_val = thrust::transform_reduce(
-          rmm::exec_policy_nosync(stream), thrust::make_counting_iterator<INDEX_T>(0),
-          thrust::make_counting_iterator<INDEX_T>(geometries.size()),
-          [=] __host__ __device__(INDEX_T i) {
-            const auto& geom = geometries[i];
-            const auto& mbr = geom.get_mbr();
-
-            return mbr.get_min(dim);
-          },
-          std::numeric_limits<scalar_t>::max(), thrust::minimum<scalar_t>());
-
-      auto max_val = thrust::transform_reduce(
-          rmm::exec_policy_nosync(stream), thrust::make_counting_iterator<INDEX_T>(0),
-          thrust::make_counting_iterator<INDEX_T>(geometries.size()),
-          [=] __host__ __device__(INDEX_T i) {
-            const auto& geom = geometries[i];
-            const auto& mbr = geom.get_mbr();
-
-            return mbr.get_max(dim);
-          },
-          std::numeric_limits<scalar_t>::lowest(), thrust::maximum<scalar_t>());
-      min_world_corner.set_coordinate(dim, min_val);
-      max_world_corner.set_coordinate(dim, max_val);
-    }
-
-    // compute morton codes and reorder indices
-    thrust::transform(rmm::exec_policy_nosync(stream),
-                      thrust::make_counting_iterator<INDEX_T>(0),
-                      thrust::make_counting_iterator<INDEX_T>(geometries.size()),
-                      morton_codes.begin(), [=] __device__(INDEX_T i) {
-                        const auto& geom = geometries[i];
-                        const auto& mbr = geom.get_mbr();
-                        auto p = mbr.centroid();
-                        POINT_T norm_p;
-
-                        for (int dim = 0; dim < n_dim; dim++) {
-                          auto min_val = min_world_corner.get_coordinate(dim);
-                          auto max_val = max_world_corner.get_coordinate(dim);
-                          auto extent = min_val == max_val ? 1 : max_val - min_val;
-                          auto norm_val = (p.get_coordinate(dim) - min_val) / extent;
-                          norm_p.set_coordinate(dim, norm_val);
-                        }
-                        return detail::morton_code(norm_p.get_vec());
-                      });
-    reordered_indices_ =
-        std::make_unique<rmm::device_uvector<INDEX_T>>(geometries.size(), stream);
-    thrust::sequence(rmm::exec_policy_nosync(stream), reordered_indices_->begin(),
-                     reordered_indices_->end());
-    thrust::sort_by_key(rmm::exec_policy_nosync(stream), morton_codes.begin(),
-                        morton_codes.end(), reordered_indices_->begin());
-
-    auto n_aabbs = (geometries.size() + geoms_per_aabb - 1) / geoms_per_aabb;
-    aabbs_ = std::make_unique<rmm::device_uvector<OptixAabb>>(n_aabbs, stream);
-    OptixAabb empty_aabb;
-
-    if (n_dim == 2) {
-      empty_aabb = OptixAabb{
-          std::numeric_limits<float>::max(),    std::numeric_limits<float>::max(),    0,
-          std::numeric_limits<float>::lowest(), std::numeric_limits<float>::lowest(), 0};
-    } else if (n_dim == 3) {
-      empty_aabb = OptixAabb{
-          std::numeric_limits<float>::max(),    std::numeric_limits<float>::max(),
-          std::numeric_limits<float>::max(),    std::numeric_limits<float>::lowest(),
-          std::numeric_limits<float>::lowest(), std::numeric_limits<float>::lowest()};
-    }
-
-    thrust::fill(rmm::exec_policy_nosync(stream), aabbs_->begin(), aabbs_->end(),
-                 empty_aabb);
-
-    auto* p_aabbs = aabbs_->data();
-
-    rmm::device_uvector<INDEX_T> n_geoms_per_aabb(n_aabbs, stream);
-
-    auto* p_reordered_indices = reordered_indices_->data();
-    auto* p_n_geoms_per_aabb = n_geoms_per_aabb.data();
-
-    // each warp takes an AABB and processes points_per_aabb points
-    LaunchKernel(stream, [=] __device__() mutable {
-      typedef cub::WarpReduce<scalar_t> WarpReduce;
-      __shared__ typename WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32];
-      auto warp_id = threadIdx.x / 32;
-      auto lane_id = threadIdx.x % 32;
-      auto global_warp_id = TID_1D / 32;
-      auto n_warps = TOTAL_THREADS_1D / 32;
-
-      for (uint32_t aabb_id = global_warp_id; aabb_id < n_aabbs; aabb_id += n_warps) {
-        POINT_T min_corner, max_corner;
-        size_t idx_begin = aabb_id * geoms_per_aabb;
-        size_t idx_end = std::min((size_t)geometries.size(), idx_begin + geoms_per_aabb);
-        size_t idx_end_rup = (idx_end + 31) / 32;
-        idx_end_rup *= 32;  // round up to the next multiple of 32
-
-        p_n_geoms_per_aabb[aabb_id] = idx_end - idx_begin;
-
-        for (auto idx = idx_begin + lane_id; idx < idx_end_rup; idx += 32) {
-          Box<Point<float, POINT_T::n_dim>> mbr;
-
-          auto warp_begin = idx - lane_id;
-          auto warp_end = std::min(warp_begin + 32, idx_end);
-          auto n_valid = warp_end - warp_begin;
-
-          if (idx < idx_end) {
-            auto geom_idx = p_reordered_indices[idx];
-            mbr = geometries[geom_idx].get_mbr();
-          }
-
-          for (int dim = 0; dim < n_dim; dim++) {
-            auto min_val =
-                WarpReduce(temp_storage[warp_id])
-                    .Reduce(mbr.get_min(dim), thrust::minimum<scalar_t>(), n_valid);
-            if (lane_id == 0) {
-              min_corner.set_coordinate(dim, min_val);
-            }
-            auto max_val =
-                WarpReduce(temp_storage[warp_id])
-                    .Reduce(mbr.get_max(dim), thrust::maximum<scalar_t>(), n_valid);
-            if (lane_id == 0) {
-              max_corner.set_coordinate(dim, max_val);
-            }
-          }
-        }
-
-        if (lane_id == 0) {
-          box_t ext_mbr(min_corner, max_corner);
-          p_aabbs[aabb_id] = ext_mbr.ToOptixAabb();
-        }
-      }
-    });
-
-    prefix_sum_ = std::make_unique<rmm::device_uvector<INDEX_T>>(n_aabbs + 1, stream);
-    prefix_sum_->set_element_to_zero_async(0, stream);
-    thrust::inclusive_scan(rmm::exec_policy_nosync(stream), n_geoms_per_aabb.begin(),
-                           n_geoms_per_aabb.end(), prefix_sum_->begin() + 1);
-#ifndef NDEBUG
-    auto* p_prefix_sum = prefix_sum_->data();
-
-    thrust::for_each(rmm::exec_policy_nosync(stream),
-                     thrust::counting_iterator<size_t>(0),
-                     thrust::counting_iterator<size_t>(aabbs_->size()),
-                     [=] __device__(size_t aabb_idx) {
-                       auto begin = p_prefix_sum[aabb_idx];
-                       auto end = p_prefix_sum[aabb_idx + 1];
-                       const auto& aabb = p_aabbs[aabb_idx];
-
-                       for (auto i = begin; i < end; i++) {
-                         auto geom_idx = p_reordered_indices[i];
-                         auto mbr = geometries[geom_idx].get_mbr();
-                         assert(mbr.covered_by(aabb));
-                       }
-                     });
-#endif
-  }
-
-  ArrayView<OptixAabb> get_aabbs() const {
-    if (aabbs_ != nullptr) {
-      return ArrayView<OptixAabb>(aabbs_->data(), aabbs_->size());
-    }
-    return {};
-  }
-
-  ArrayView<INDEX_T> get_prefix_sum() const {
-    if (prefix_sum_ != nullptr) {
-      return ArrayView<INDEX_T>(prefix_sum_->data(), prefix_sum_->size());
-    }
-    return {};
-  }
-
-  ArrayView<INDEX_T> get_reordered_indices() const {
-    if (reordered_indices_ != nullptr) {
-      return ArrayView<INDEX_T>(reordered_indices_->data(), reordered_indices_->size());
-    }
-    return {};
-  }
-
-  void Clear() {
-    aabbs_ = nullptr;
-    prefix_sum_ = nullptr;
-    reordered_indices_ = nullptr;
-  }
-
- private:
-  std::unique_ptr<rmm::device_uvector<OptixAabb>> aabbs_;
-  std::unique_ptr<rmm::device_uvector<INDEX_T>> prefix_sum_;
-  std::unique_ptr<rmm::device_uvector<INDEX_T>> reordered_indices_;
-};
-}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/object_pool.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/object_pool.hpp
deleted file mode 100644
index d0ab3e1ff..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/object_pool.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-
-#include <memory>
-#include <mutex>
-#include <vector>
-
-namespace gpuspatial {
-// Forward declaration of ObjectPool to be used in the custom deleter.
-template <typename T>
-class ObjectPool;
-
-// A helper struct to allow std::make_shared to access the private constructor.
-// It inherits from ObjectPool and is defined outside of it.
-template <typename T>
-struct PoolEnabler : public ObjectPool<T> {
-  PoolEnabler(size_t size) : ObjectPool<T>(size) {}
-};
-
-// A custom deleter for std::shared_ptr.
-// When the shared_ptr's reference count goes to zero, this deleter
-// will be invoked, returning the object to the pool instead of deleting it.
-template <typename T>
-class PoolDeleter {
- public:
-  // Constructor takes a weak_ptr to the pool to avoid circular references.
-  PoolDeleter(std::weak_ptr<ObjectPool<T>> pool) : pool_(pool) {}
-
-  // The function call operator is what std::shared_ptr invokes.
-  void operator()(T* ptr) const {
-    // Attempt to lock the weak_ptr to get a shared_ptr to the pool.
-    if (auto pool_sp = pool_.lock()) {
-      // If the pool still exists, return the object to it.
-      pool_sp->release(ptr);
-    } else {
-      // If the pool no longer exists, we must delete the pointer to avoid a memory leak.
-      delete ptr;
-    }
-  }
-
- private:
-  std::weak_ptr<ObjectPool<T>> pool_;
-};
-
-/**
- * @brief A thread-safe object pool for reusable objects.
- *
- * @tparam T The type of object to pool.
- */
-template <typename T>
-class ObjectPool : public std::enable_shared_from_this<ObjectPool<T>> {
-  friend struct PoolEnabler<T>;
-
-  // Constructor is private to force object creation through the static 'create' method.
-  // This ensures the ObjectPool is always managed by a std::shared_ptr.
-  ObjectPool(size_t initial_size = 0) {
-    for (size_t i = 0; i < initial_size; ++i) {
-      pool_.push_back(new T());
-    }
-  }
-
- public:
-  /**
-   * @brief Factory method to create an instance of the ObjectPool.
-   * Guarantees that the pool is managed by a std::shared_ptr, which is required
-   * for the custom deleter mechanism to work correctly.
-   *
-   * @param initial_size The number of objects to pre-allocate.
-   * @return A std::shared_ptr to the new ObjectPool instance.
-   */
-  static std::shared_ptr<ObjectPool<T>> create(size_t initial_size = 0) {
-    return std::make_shared<PoolEnabler<T>>(initial_size);
-  }
-
-  /**
-   * @brief Destructor. Cleans up any remaining objects in the pool.
-   */
-  ~ObjectPool() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    for (T* item : pool_) {
-      delete item;
-    }
-    pool_.clear();
-  }
-
-  // Disable copy constructor and assignment operator
-  ObjectPool(const ObjectPool&) = delete;
-  ObjectPool& operator=(const ObjectPool&) = delete;
-
-  /**
-   * @brief Acquires an object from the pool.
-   *
-   * If the pool is empty, a new object is created. The returned shared_ptr
-   * has a custom deleter that will return the object to the pool when it's
-   * no longer referenced.
-   *
-   * @return A std::shared_ptr to an object of type T.
-   */
-  std::shared_ptr<T> take() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    T* resource_ptr = nullptr;
-    if (!pool_.empty()) {
-      // Take an existing object from the pool
-      resource_ptr = pool_.back();
-      pool_.pop_back();
-    } else {
-      // Pool is empty, create a new object
-      resource_ptr = new T();
-    }
-
-    // Create a custom deleter that knows how to return the object to this pool.
-    // this->shared_from_this() is now safe because creation is forced through the
-    // 'create' method.
-    PoolDeleter<T> deleter(this->shared_from_this());
-
-    // Return a shared_ptr with the custom deleter.
-    return std::shared_ptr<T>(resource_ptr, deleter);
-  }
-
-  /**
-   * @brief Returns an object to the pool.
-   *
-   * This method is intended to be called by the PoolDeleter, not directly by clients.
-   *
-   * @param object The raw pointer to the object to return to the pool.
-   */
-  void release(T* object) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    pool_.push_back(object);
-  }
-
-  /**
-   * @brief Gets the current number of available objects in the pool.
-   * @return The size of the pool.
-   */
-  size_t size() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return pool_.size();
-  }
-
- private:
-  std::vector<T*> pool_;
-  std::mutex mutex_;
-};
-
-}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
new file mode 100644
index 000000000..baaeb77f6
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "gpuspatial/index/rt_spatial_index.hpp"
+#include "gpuspatial/index/spatial_index.hpp"
+#include "gpuspatial/rt/rt_engine.hpp"
+#include "gpuspatial/utils/gpu_timer.hpp"
+#include "gpuspatial/utils/queue.hpp"
+
+#include "rmm/cuda_stream_pool.hpp"
+#include "rmm/cuda_stream_view.hpp"
+#include "rmm/device_uvector.hpp"
+// #define GPUSPATIAL_PROFILING  // opt-in: adds a GPUTimer and *_ms counters per context
+namespace gpuspatial {
+
+/** @brief A spatial index implementation using NVIDIA OptiX ray tracing engine.
+ *
+ * This class provides spatial indexing capabilities for geometric data using
+ * the OptiX ray tracing engine. It supports building the index from either
+ * points or bounding boxes and allows for efficient spatial queries.
+ *
+ * @tparam SCALAR_T The scalar type used for coordinates (e.g., float, double).
+ * @tparam N_DIM The number of dimensions (e.g., 2 for 2D, 3 for 3D).
+ */
+template <typename SCALAR_T, int N_DIM>
+class RTSpatialIndex : public SpatialIndex<SCALAR_T, N_DIM> {
+  using point_t = typename SpatialIndex<SCALAR_T, N_DIM>::point_t;
+  using box_t = typename SpatialIndex<SCALAR_T, N_DIM>::box_t;
+  using scalar_t = typename point_t::scalar_t;
+  static constexpr int n_dim = point_t::n_dim;
+
+  using index_t = uint32_t;  // type of the index to represent geometries
+  struct SpatialIndexContext {
+    rmm::cuda_stream_view stream;
+    std::string shader_id;
+    rmm::device_buffer bvh_buffer{0, rmm::cuda_stream_default};
+    OptixTraversableHandle handle = 0;  // never left indeterminate
+    std::vector<char> h_launch_params_buffer;
+    rmm::device_buffer launch_params_buffer{0, rmm::cuda_stream_default};
+    std::unique_ptr<rmm::device_scalar<uint32_t>> counter;
+    // output
+    Queue<index_t> build_indices;
+    rmm::device_uvector<index_t> probe_indices{0, rmm::cuda_stream_default};
+#ifdef GPUSPATIAL_PROFILING
+    GPUTimer timer;
+    // counters
+    double alloc_ms = 0.0;
+    double bvh_build_ms = 0.0;
+    double rt_ms = 0.0;
+    double copy_res_ms = 0.0;
+#endif
+  };
+
+ public:
+  RTSpatialIndex() = default;
+
+  RTSpatialIndex(const RTSpatialIndexConfig& config);
+
+  void Clear() override;
+
+  void PushBuild(const box_t* rects, uint32_t n_rects) override;
+
+  void FinishBuilding() override;
+
+  void Probe(const box_t* rects, uint32_t n_rects, std::vector<uint32_t>* build_indices,
+             std::vector<uint32_t>* probe_indices) override;
+
+ private:
+  RTSpatialIndexConfig config_;
+  std::unique_ptr<rmm::cuda_stream_pool> stream_pool_;
+  bool indexing_points_ = false;  // read by numGeometries(); must not be indeterminate
+  // The rectangles being indexed or the MBRs of grouped points
+  rmm::device_uvector<box_t> rects_{0, rmm::cuda_stream_default};
+  // Data structures for indexing points
+  rmm::device_uvector<index_t> point_ranges_{0, rmm::cuda_stream_default};
+  rmm::device_uvector<index_t> reordered_point_indices_{0, rmm::cuda_stream_default};
+  rmm::device_uvector<point_t> points_{0, rmm::cuda_stream_default};
+  rmm::device_buffer bvh_buffer_{0, rmm::cuda_stream_default};
+  OptixTraversableHandle handle_ = 0;  // defined even after RTSpatialIndex() = default
+
+  void allocateResultBuffer(SpatialIndexContext& ctx, uint32_t capacity) const;
+
+  void handleBuildPoint(SpatialIndexContext& ctx, ArrayView<point_t> points,
+                        bool counting) const;
+
+  void handleBuildPoint(SpatialIndexContext& ctx, ArrayView<box_t> rects,
+                        bool counting) const;
+
+  void handleBuildBox(SpatialIndexContext& ctx, ArrayView<point_t> points,
+                      bool counting) const;
+
+  void handleBuildBox(SpatialIndexContext& ctx, ArrayView<box_t> rects,
+                      bool counting) const;
+
+  void prepareLaunchParamsBoxQuery(SpatialIndexContext& ctx, ArrayView<box_t> probe_rects,
+                                   bool forward, bool counting) const;
+
+  void filter(SpatialIndexContext& ctx, uint32_t dim_x) const;
+
+  // Number of indexed geometries: points when indexing points, rectangles otherwise.
+  size_t numGeometries() const {
+    return indexing_points_ ? points_.size() : rects_.size();
+  }
+};
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.hpp
similarity index 50%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.hpp
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.hpp
index 6c836dfa9..18619903a 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.hpp
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.hpp
@@ -16,13 +16,33 @@
 // under the License.
 #pragma once
 
-#include "gpuspatial/index/streaming_joiner.hpp"
+#include "gpuspatial/index/spatial_index.hpp"
+#include "gpuspatial/rt/rt_engine.hpp"
 
 #include <memory>
+#include <thread>
 
 namespace gpuspatial {
-std::unique_ptr<StreamingJoiner> CreateSpatialJoiner();
+/** Configuration for RTSpatialIndex */
+struct RTSpatialIndexConfig {
+  // The ray tracing engine to use
+  std::shared_ptr<RTEngine> rt_engine;
+  // Prefer a fast BVH build
+  bool prefer_fast_build = false;
+  // Compress the BVH to save memory
+  bool compact = false;
+  // How many threads are allowed to call Probe concurrently; never 0
+  uint32_t concurrency = 1;
+  // number of points to represent an AABB when doing point-point queries
+  uint32_t n_points_per_aabb = 8;
+  RTSpatialIndexConfig() {
+    const uint32_t hw_threads = std::thread::hardware_concurrency();
+    concurrency = hw_threads > 0 ? hw_threads : 1;  // 0 means "not computable"
+  }
+};
+
+template <typename SCALAR_T, int N_DIM>
+std::unique_ptr<SpatialIndex<SCALAR_T, N_DIM>> CreateRTSpatialIndex(
+    const RTSpatialIndexConfig& config);
 
-void InitSpatialJoiner(StreamingJoiner* index, const char* ptx_root,
-                       uint32_t concurrency);
 }  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_index.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_index.hpp
new file mode 100644
index 000000000..688d0a9b6
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_index.hpp
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/geom/point.hpp"
+
+#include <memory>
+#include <stdexcept>
+#include <vector>
+
+namespace gpuspatial {
+template <typename SCALAR_T, int N_DIM>
+class SpatialIndex {
+ public:
+  using point_t = Point<SCALAR_T, N_DIM>;
+  using box_t = Box<point_t>;
+
+  virtual ~SpatialIndex() = default;
+
+  /**
+   * Provide an array of rectangles to build the index.
+   * @param rects An array of rectangles to be indexed.
+   */
+  virtual void PushBuild(const box_t* rects, uint32_t n_rects) = 0;
+
+  /**
+   * Wait for the index to be built.
+   * This method should be called after all geometries have been pushed.
+   */
+  virtual void FinishBuilding() = 0;
+
+  /**
+   * Remove all geometries from the index, so the index can be reused.
+   */
+  virtual void Clear() = 0;
+
+  /**
+   * Query the index with an array of rectangles and return the indices of the
+   * overlapping geometries. Overrides must be thread-safe; this default only throws.
+   * @param build_indices A vector to store the indices of the geometries in the index
+   * that have a spatial overlap with the geometries in the stream.
+   * @param stream_indices A vector to store the indices of the geometries in the stream
+   * that have a spatial overlap with the geometries in the index.
+   */
+  virtual void Probe(const box_t* rects, uint32_t n_rects,
+                     std::vector<uint32_t>* build_indices,
+                     std::vector<uint32_t>* stream_indices) {
+    throw std::runtime_error("Not implemented");
+  }
+};
+
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.cuh
deleted file mode 100644
index 1c93a54b2..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.cuh
+++ /dev/null
@@ -1,184 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-#include "geoarrow/geoarrow_type.h"
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/geom/point.cuh"
-#include "gpuspatial/index/detail/rt_engine.hpp"
-#include "gpuspatial/index/geometry_grouper.hpp"
-#include "gpuspatial/index/object_pool.hpp"
-#include "gpuspatial/index/relate_engine.cuh"
-#include "gpuspatial/index/streaming_joiner.hpp"
-#include "gpuspatial/loader/device_geometries.cuh"
-#include "gpuspatial/loader/parallel_wkb_loader.h"
-#include "gpuspatial/utils/gpu_timer.hpp"
-#include "gpuspatial/utils/queue.h"
-#include "gpuspatial/utils/thread_pool.h"
-
-#include "rmm/cuda_stream_pool.hpp"
-#include "rmm/cuda_stream_view.hpp"
-#include "rmm/device_uvector.hpp"
-
-#include <fstream>
-#include <thread>
-
-
-// #define GPUSPATIAL_PROFILING
-namespace gpuspatial {
-
-class SpatialJoiner : public StreamingJoiner {
-  // TODO: Assuming every thing is 2D in double for now
-  using scalar_t = double;
-  static constexpr int n_dim = 2;
-  using index_t = uint32_t;  // type of the index to represent geometries
-  // geometry types
-  using point_t = Point<scalar_t, n_dim>;
-  using multi_point_t = MultiPoint<point_t>;
-  using line_string_t = LineString<point_t>;
-  using multi_line_string_t = MultiLineString<point_t, index_t>;
-  using polygon_t = Polygon<point_t, index_t>;
-  using multi_polygon_t = MultiPolygon<point_t, index_t>;
-  // geometry array types
-  using point_array_t = PointArrayView<point_t, index_t>;
-  using multi_point_array_t = MultiPointArrayView<point_t, index_t>;
-  using line_string_array_t = LineStringArrayView<point_t, index_t>;
-  using multi_line_string_array_t = MultiLineStringArrayView<point_t, index_t>;
-  using polygon_array_t = PolygonArrayView<point_t, index_t>;
-  using multi_polygon_array_t = MultiPolygonArrayView<point_t, index_t>;
-
-  using dev_geometries_t = DeviceGeometries<point_t, index_t>;
-  using box_t = Box<Point<float, n_dim>>;
-  using loader_t = ParallelWkbLoader<point_t, index_t>;
-
- public:
-  struct SpatialJoinerConfig : Config {
-    const char* ptx_root;
-    // Prefer fast build the BVH
-    bool prefer_fast_build = false;
-    // Compress the BVH to save memory
-    bool compact = true;
-    // Loader configurations
-    // How many threads to use for parsing WKBs
-    uint32_t parsing_threads = std::thread::hardware_concurrency();
-    // How many threads are allowed to call PushStream concurrently
-    uint32_t concurrency = 1;
-    // number of points to represent an AABB when doing point-point queries
-    uint32_t n_points_per_aabb = 8;
-    // reserve a ratio of available memory for result sets
-    float result_buffer_memory_reserve_ratio = 0.2;
-    // the memory quota for relate engine compared to the available memory
-    float relate_engine_memory_quota = 0.8;
-    // this value determines RELATE_MAX_DEPTH
-    size_t stack_size_bytes = 3 * 1024;
-    SpatialJoinerConfig() : ptx_root(nullptr), prefer_fast_build(false), compact(false) {
-      concurrency = std::thread::hardware_concurrency();
-    }
-  };
-
-  struct SpatialJoinerContext : Context {
-    rmm::cuda_stream_view cuda_stream;
-    std::string shader_id;
-    std::unique_ptr<loader_t> stream_loader;
-    dev_geometries_t stream_geometries;
-    std::unique_ptr<rmm::device_buffer> bvh_buffer;
-    OptixTraversableHandle handle;
-    std::vector<char> h_launch_params_buffer;
-    std::unique_ptr<rmm::device_buffer> launch_params_buffer;
-    // output
-    Queue<thrust::pair<index_t, index_t>> results;
-    int32_t array_index_offset;
-#ifdef GPUSPATIAL_PROFILING
-    GPUTimer timer;
-    // counters
-    double parse_ms = 0.0;
-    double alloc_ms = 0.0;
-    double filter_ms = 0.0;
-    double refine_ms = 0.0;
-    double copy_res_ms = 0.0;
-#endif
-  };
-
-  SpatialJoiner() = default;
-
-  ~SpatialJoiner() = default;
-
-  void Init(const Config* config) override;
-
-  void Clear() override;
-
-  void PushBuild(const ArrowSchema* schema, const ArrowArray* array, int64_t offset,
-                 int64_t length) override;
-
-  void FinishBuilding() override;
-
-  std::shared_ptr<Context> CreateContext() override { return ctx_pool_->take(); }
-
-  void PushStream(Context* ctx, const ArrowSchema* schema, const ArrowArray* array,
-                  int64_t offset, int64_t length, Predicate predicate,
-                  std::vector<uint32_t>* build_indices,
-                  std::vector<uint32_t>* stream_indices,
-                  int32_t array_index_offset) override;
-
-  // Internal method but has to be public for the CUDA kernel to access
-  void handleBuildPointStreamPoint(SpatialJoinerContext* ctx, Predicate predicate,
-                                   std::vector<uint32_t>* build_indices,
-                                   std::vector<uint32_t>* stream_indices);
-
-  void handleBuildBoxStreamPoint(SpatialJoinerContext* ctx, Predicate predicate,
-                                 std::vector<uint32_t>* build_indices,
-                                 std::vector<uint32_t>* stream_indices);
-
-  void handleBuildPointStreamBox(SpatialJoinerContext* ctx, Predicate predicate,
-                                 std::vector<uint32_t>* build_indices,
-                                 std::vector<uint32_t>* stream_indices);
-
-  void handleBuildBoxStreamBox(SpatialJoinerContext* ctx, Predicate predicate,
-                               std::vector<uint32_t>* build_indices,
-                               std::vector<uint32_t>* stream_indices);
-
-  void filter(SpatialJoinerContext* ctx, uint32_t dim_x, bool swap_id = false);
-
-  void refine(SpatialJoinerContext* ctx, Predicate predicate,
-              std::vector<uint32_t>* build_indices,
-              std::vector<uint32_t>* stream_indices);
-
- private:
-  SpatialJoinerConfig config_;
-  std::unique_ptr<rmm::cuda_stream_pool> stream_pool_;
-  std::shared_ptr<ThreadPool> thread_pool_;
-  details::RTEngine rt_engine_;
-  std::unique_ptr<rmm::device_buffer> bvh_buffer_;
-  std::unique_ptr<loader_t> build_loader_;
-
-  DeviceGeometries<point_t, index_t> build_geometries_;
-  // For grouping points with space-filing curve
-  GeometryGrouper<point_t, index_t> geometry_grouper_;
-  RelateEngine<point_t, index_t> relate_engine_;
-  OptixTraversableHandle handle_;
-
-  std::shared_ptr<ObjectPool<SpatialJoinerContext>> ctx_pool_;
-
-  OptixTraversableHandle buildBVH(const rmm::cuda_stream_view& stream,
-                                  const ArrayView<OptixAabb>& aabbs,
-                                  std::unique_ptr<rmm::device_buffer>& buffer);
-
-  void allocateResultBuffer(SpatialJoinerContext* ctx);
-
-  void prepareLaunchParamsBoxQuery(SpatialJoinerContext* ctx, bool forward);
-};
-
-}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/streaming_joiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/streaming_joiner.hpp
deleted file mode 100644
index ccf8a3bfe..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/streaming_joiner.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-#include "gpuspatial/relate/predicate.cuh"
-
-#include "nanoarrow/nanoarrow.hpp"
-
-#include <memory>
-#include <stdexcept>
-#include <vector>
-namespace gpuspatial {
-
-class StreamingJoiner {
- public:
-  struct Context {
-    virtual ~Context() = default;
-  };
-
-  struct Config {
-    virtual ~Config() = default;
-  };
-
-  virtual ~StreamingJoiner() = default;
-
-  /**
-   * Initialize the index with the given configuration. This method should be called only
-   * once before using the index.
-   * @param config
-   */
-  virtual void Init(const Config* config) = 0;
-
-  /**
-   * Provide an array of geometries to build the index.
-   * @param array ArrowArray that contains the geometries in WKB format.
-   * @param offset starting index of the ArrowArray
-   * @param length length of the ArrowArray to read.
-   */
-  virtual void PushBuild(const ArrowSchema* schema, const ArrowArray* array,
-                         int64_t offset, int64_t length) = 0;
-
-  /**
-   * Waiting the index to be built.
-   * This method should be called after all geometries have been pushed.
-   */
-  virtual void FinishBuilding() = 0;
-
-  /**
-   * Remove all geometries from the index, so the index can reused.
-   */
-  virtual void Clear() = 0;
-
-  /**
-   * Query the index with an array of geometries in WKB format and return the indices of
-   * the geometries in stream and the index that satisfy a given predicate. This method is
-   * thread-safe.
-   * @param context A context object that can be used to store intermediate results.
-   * @param array ArrowArray that contains the geometries in WKB format.
-   * @param offset starting index of the ArrowArray
-   * @param length length of the ArrowArray to read.
-   * @param predicate A predicate to filter the query results.
-   * @param build_indices A vector to store the indices of the geometries in the index
-   * that have a spatial overlap with the geometries in the stream.
-   * @param stream_indices A vector to store the indices of the geometries in the stream
-   * that have a spatial overlap with the geometries in the index.
-   * @param stream_index_offset An offset to be added to stream_indices
-   */
-  virtual void PushStream(Context* context, const ArrowSchema* schema,
-                          const ArrowArray* array, int64_t offset, int64_t length,
-                          Predicate predicate, std::vector<uint32_t>* build_indices,
-                          std::vector<uint32_t>* stream_indices,
-                          int32_t stream_index_offset) {
-    throw std::runtime_error("Not implemented");
-  }
-
-  /**
-   * Create a context object for issuing queries against the index.
-   * @return A context object that is used to store intermediate results.
-   */
-  virtual std::shared_ptr<Context> CreateContext() {
-    throw std::runtime_error("Not implemented");
-  }
-};
-
-}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.hpp
similarity index 96%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.hpp
index 3c44ca324..2d59d0a89 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.hpp
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/geom/geometry_type.cuh"
-#include "gpuspatial/geom/multi_line_string.cuh"
-#include "gpuspatial/geom/multi_point.cuh"
-#include "gpuspatial/geom/multi_polygon.cuh"
-#include "gpuspatial/geom/polygon.cuh"
-#include "gpuspatial/utils/array_view.h"
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/geom/geometry_type.hpp"
+#include "gpuspatial/geom/multi_line_string.hpp"
+#include "gpuspatial/geom/multi_point.hpp"
+#include "gpuspatial/geom/multi_polygon.hpp"
+#include "gpuspatial/geom/polygon.hpp"
+#include "gpuspatial/utils/array_view.hpp"
 
 #include "rmm/device_uvector.hpp"
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.hpp
similarity index 70%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.hpp
index cb2186ff3..b40122a74 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.hpp
@@ -15,78 +15,39 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-
-#include "gpuspatial/geom/geometry_type.cuh"
-#include "gpuspatial/loader/device_geometries.cuh"
+#include "gpuspatial/geom/geometry_type.hpp"
+#include "gpuspatial/loader/device_geometries.hpp"
+#include "gpuspatial/mem/memory_manager.hpp"
 #include "gpuspatial/utils/logger.hpp"
+#include "gpuspatial/utils/markers.hpp"
 #include "gpuspatial/utils/mem_utils.hpp"
-#include "gpuspatial/utils/stopwatch.h"
-#include "gpuspatial/utils/thread_pool.h"
+#include "gpuspatial/utils/stopwatch.hpp"
+#include "gpuspatial/utils/thread_pool.hpp"
+
+#include "nanoarrow/nanoarrow.hpp"
 
-#include "nanoarrow/nanoarrow.h"
+#include "geoarrow/geoarrow.hpp"
 
 #include "rmm/cuda_stream_view.hpp"
 #include "rmm/device_uvector.hpp"
 #include "rmm/exec_policy.hpp"
 
+#include <thrust/iterator/counting_iterator.h>
 #include <thrust/scan.h>
 
+#include <cstring>
+#include <future>
+#include <numeric>
 #include <thread>
 #include <unordered_set>
-
-#include <sys/sysinfo.h>
-#include <unistd.h>
+#include <vector>
 
 namespace gpuspatial {
 namespace detail {
 
-inline long long get_free_physical_memory_linux() {
-  struct sysinfo info;
-  if (sysinfo(&info) == 0) {
-    // info.freeram is in bytes (or unit defined by info.mem_unit)
-    // Use info.freeram * info.mem_unit for total free bytes
-    return (long long)info.freeram * (long long)info.mem_unit;
-  }
-  return 0;  // Error
-}
-
-// Copied from GeoArrow, it is faster than using GeoArrowWKBReaderRead
-struct WKBReaderPrivate {
-  const uint8_t* data;
-  int64_t size_bytes;
-  const uint8_t* data0;
-  int need_swapping;
-  GeoArrowGeometry geom;
-};
-
-static int WKBReaderReadEndian(struct WKBReaderPrivate* s, struct GeoArrowError* error) {
-  if (s->size_bytes > 0) {
-    s->need_swapping = s->data[0] != GEOARROW_NATIVE_ENDIAN;
-    s->data++;
-    s->size_bytes--;
-    return GEOARROW_OK;
-  } else {
-    GeoArrowErrorSet(error, "Expected endian byte but found end of buffer at byte %ld",
-                     (long)(s->data - s->data0));
-    return EINVAL;
-  }
-}
-
-static int WKBReaderReadUInt32(struct WKBReaderPrivate* s, uint32_t* out,
-                               struct GeoArrowError* error) {
-  if (s->size_bytes >= 4) {
-    memcpy(out, s->data, sizeof(uint32_t));
-    s->data += sizeof(uint32_t);
-    s->size_bytes -= sizeof(uint32_t);
-    if (s->need_swapping) {
-      *out = __builtin_bswap32(*out);
-    }
-    return GEOARROW_OK;
-  } else {
-    GeoArrowErrorSet(error, "Expected uint32 but found end of buffer at byte %ld",
-                     (long)(s->data - s->data0));
-    return EINVAL;
-  }
+inline bool is_little_endian() {
+  const uint16_t x = 0x0001;
+  return *reinterpret_cast<const uint8_t*>(&x) != 0;
 }
 
 /**
@@ -105,6 +66,7 @@ template <typename POINT_T, typename INDEX_T>
 struct HostParsedGeometries {
   constexpr static int n_dim = POINT_T::n_dim;
   using mbr_t = Box<Point<float, n_dim>>;
+  GeometryType type;
   // each feature should have only one type except GeometryCollection
   std::vector<GeometryType> feature_types;
   // This number should be one except GeometryCollection, which should be unnested # of
@@ -120,17 +82,18 @@ struct HostParsedGeometries {
   bool has_geometry_collection = false;
   bool create_mbr = false;
 
-  HostParsedGeometries(bool multi_, bool has_geometry_collection_, bool create_mbr_) {
+  HostParsedGeometries(GeometryType t) : type(t) {
+    multi = type == GeometryType::kMultiPoint || type == GeometryType::kMultiLineString ||
+            type == GeometryType::kMultiPolygon;
+    has_geometry_collection = type == GeometryType::kGeometryCollection;
+    create_mbr = type != GeometryType::kPoint;
     // Multi and GeometryCollection are mutually exclusive
-    assert(!(multi_ && has_geometry_collection_));
-    multi = multi_;
-    has_geometry_collection = has_geometry_collection_;
-    create_mbr = create_mbr_;
+    assert(!(multi && has_geometry_collection));
   }
 
   void AddGeometry(const GeoArrowGeometryView* geom) {
     if (geom == nullptr) {
-      throw std::runtime_error("Null geometry not supported yet");
+      addNullEntry();
       return;
     }
 
@@ -405,6 +368,49 @@ struct HostParsedGeometries {
     }
     return node + 1;
   }
+
+  /**
+   * Records a placeholder for a null row so the per-row arrays stay aligned.
+   */
+  void addNullEntry() {
+    if (create_mbr) {
+      // Types that carry MBRs get an empty box for the null row
+      mbr_t null_box;
+      null_box.set_empty();
+      mbrs.push_back(null_box);
+    }
+
+    if (has_geometry_collection) {
+      // A null collection holds zero sub-geometries
+      num_geoms.push_back(0);
+      return;
+    }
+
+    // Otherwise the placeholder depends on the geometry type: an empty point
+    // for kPoint, a zero count for everything else
+    switch (type) {
+      case GeometryType::kPoint: {
+        POINT_T null_point;
+        null_point.set_empty();
+        vertices.push_back(null_point);
+        return;
+      }
+      case GeometryType::kLineString:
+      case GeometryType::kMultiPoint:
+        num_points.push_back(0);
+        return;
+      case GeometryType::kPolygon:
+        num_rings.push_back(0);
+        return;
+      case GeometryType::kMultiLineString:
+      case GeometryType::kMultiPolygon:
+        num_parts.push_back(0);
+        return;
+      default:
+        throw std::runtime_error(
+            "Null geometry encountered for unsupported geometry type");
+    }
+  }
 };
 
 template <typename POINT_T, typename INDEX_T>
@@ -442,7 +448,8 @@ struct DeviceParsedGeometries {
   }
 
   void Append(rmm::cuda_stream_view stream,
-              const std::vector<HostParsedGeometries<POINT_T, INDEX_T>>& host_geoms) {
+              const std::vector<HostParsedGeometries<POINT_T, INDEX_T>>& host_geoms,
+              double& t_alloc_ms, double& t_copy_ms) {
     size_t sz_feature_types = 0;
     size_t sz_num_geoms = 0;
     size_t sz_num_parts = 0;
@@ -482,6 +489,9 @@ struct DeviceParsedGeometries {
         prev_sz_mbrs * sizeof(mbr_t) / 1024 / 1024,
         sz_mbrs * sizeof(mbr_t) / 1024 / 1024);
 
+    Stopwatch sw;
+
+    sw.start();
     feature_types.resize(feature_types.size() + sz_feature_types, stream);
     num_geoms.resize(num_geoms.size() + sz_num_geoms, stream);
     num_parts.resize(num_parts.size() + sz_num_parts, stream);
@@ -489,7 +499,11 @@ struct DeviceParsedGeometries {
     num_points.resize(num_points.size() + sz_num_points, stream);
     vertices.resize(vertices.size() + sz_vertices, stream);
     mbrs.resize(mbrs.size() + sz_mbrs, stream);
-
+    stream.synchronize();
+    sw.stop();
+    t_alloc_ms += sw.ms();
+    Instrument::Range r("H2D", gpuspatial::Color::Blue);
+    sw.start();
     for (auto& geoms : host_geoms) {
       detail::async_copy_h2d(stream, geoms.feature_types.data(),
                              feature_types.data() + prev_sz_feature_types,
@@ -518,6 +532,9 @@ struct DeviceParsedGeometries {
       prev_sz_vertices += geoms.vertices.size();
       prev_sz_mbrs += geoms.mbrs.size();
     }
+    stream.synchronize();
+    sw.stop();
+    t_copy_ms += sw.ms();
   }
 };
 }  // namespace detail
@@ -531,9 +548,7 @@ class ParallelWkbLoader {
 
  public:
   struct Config {
-    // How many rows of WKBs to process in one chunk
-    // This value affects the peak memory usage and overheads
-    int chunk_size = 16 * 1024;
+    float memory_quota = 0.8f;  // fraction of free host memory to use (0.8 = 80%)
   };
 
   ParallelWkbLoader()
@@ -543,9 +558,8 @@ class ParallelWkbLoader {
       : thread_pool_(thread_pool) {}
 
   void Init(const Config& config = Config()) {
-    ArrowArrayViewInitFromType(&array_view_, NANOARROW_TYPE_BINARY);
     config_ = config;
-    geometry_type_ = GeometryType::kNull;
+    Clear(rmm::cuda_stream_default);
   }
 
   void Clear(rmm::cuda_stream_view stream) {
@@ -553,72 +567,85 @@ class ParallelWkbLoader {
     geoms_.Clear(stream);
   }
 
-  void Parse(rmm::cuda_stream_view stream, const ArrowArray* array, int64_t offset,
+  void Parse(rmm::cuda_stream_view stream, const ArrowArrayView* array, int64_t offset,
              int64_t length) {
+    auto begin = thrust::make_counting_iterator<int64_t>(offset);
+    auto end = begin + length;
+
+    Parse(stream, array, begin, end);
+  }
+
+  template <typename OFFSET_IT>
+  void Parse(rmm::cuda_stream_view stream, const ArrowArrayView* array_view,
+             OFFSET_IT begin, OFFSET_IT end) {
     using host_geometries_t = detail::HostParsedGeometries<POINT_T, INDEX_T>;
-    ArrowError arrow_error;
-    if (ArrowArrayViewSetArray(&array_view_, array, &arrow_error) != NANOARROW_OK) {
-      throw std::runtime_error("ArrowArrayViewSetArray error " +
-                               std::string(arrow_error.message));
-    }
+
+    size_t num_offsets = std::distance(begin, end);
+    if (num_offsets == 0) return;
+
     auto parallelism = thread_pool_->num_threads();
-    auto est_bytes = estimateTotalBytes(array, offset, length);
-    auto free_memory = detail::get_free_physical_memory_linux();
+    uint64_t est_bytes = estimateTotalBytes(array_view, begin, end);
+
+    uint64_t free_memory = MemoryManager::get_available_host_memory();
+    uint64_t memory_quota = free_memory * config_.memory_quota;
     uint32_t est_n_chunks = est_bytes / free_memory + 1;
-    uint32_t chunk_size = (length + est_n_chunks - 1) / est_n_chunks;
+
+    // Use num_offsets instead of offsets.size()
+    uint32_t chunk_size = (num_offsets + est_n_chunks - 1) / est_n_chunks;
+    uint32_t n_chunks = (num_offsets + chunk_size - 1) / chunk_size;
 
     GPUSPATIAL_LOG_INFO(
-        "Parsing %ld rows, est arrow size %ld MB, free memory %lld, chunk size %u\n",
-        length, est_bytes / 1024 / 1024, free_memory / 1024 / 1024, chunk_size);
+        "Parsing %zu rows, est ArrowArray size %lu MB, Free Host Memory %lu MB, Memory quota %lu MB, Chunk Size %u, Total Chunks %u",
+        num_offsets, est_bytes / 1024 / 1024, free_memory / 1024 / 1024,
+        memory_quota / 1024 / 1024, chunk_size, n_chunks);
 
-    auto n_chunks = (length + chunk_size - 1) / chunk_size;
     Stopwatch sw;
     double t_fetch_type = 0, t_parse = 0, t_copy = 0;
+    double t_alloc = 0, t_h2d = 0;
 
     sw.start();
-    updateGeometryType(offset, length);
+    // Widen geometry_type_ so it covers every WKB type found in the selected rows
+    updateGeometryType(array_view, begin, end);
     sw.stop();
     t_fetch_type = sw.ms();
 
-    bool multi = geometry_type_ == GeometryType::kMultiPoint ||
-                 geometry_type_ == GeometryType::kMultiLineString ||
-                 geometry_type_ == GeometryType::kMultiPolygon;
-    bool has_geometry_collection = geometry_type_ == GeometryType::kGeometryCollection;
-    bool create_mbr = geometry_type_ != GeometryType::kPoint;
-
     // reserve space
     geoms_.vertices.reserve(est_bytes / sizeof(POINT_T), stream);
-    if (create_mbr) geoms_.mbrs.reserve(array->length, stream);
+    if (geometry_type_ != GeometryType::kPoint)
+      geoms_.mbrs.reserve(array_view->length, stream);
 
     // Batch processing to reduce the peak memory usage
-    for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
+    for (size_t chunk = 0; chunk < n_chunks; chunk++) {
       auto chunk_start = chunk * chunk_size;
-      auto chunk_end = std::min(length, (chunk + 1) * chunk_size);
-      auto work_size = chunk_end - chunk_start;
+      auto chunk_end = std::min(num_offsets, (chunk + 1) * chunk_size);
+      auto split_points = assignBalancedWorks(array_view, begin + chunk_start,
+                                              begin + chunk_end, parallelism);
-
       std::vector<std::future<host_geometries_t>> pending_local_geoms;
-      auto thread_work_size = (work_size + parallelism - 1) / parallelism;
       sw.start();
       // Each thread will parse in parallel and store results sequentially
       for (int thread_idx = 0; thread_idx < parallelism; thread_idx++) {
         auto run = [&](int tid) {
-          // FIXME: SetDevice
-          auto thread_work_start = chunk_start + tid * thread_work_size;
-          auto thread_work_end =
-              std::min(chunk_end, thread_work_start + thread_work_size);
-          host_geometries_t local_geoms(multi, has_geometry_collection, create_mbr);
+          auto thread_work_start = split_points[tid];
+          auto thread_work_end = split_points[tid + 1];
+          host_geometries_t local_geoms(geometry_type_);
           GeoArrowWKBReader reader;
           GeoArrowError error;
-          GEOARROW_THROW_NOT_OK(nullptr, GeoArrowWKBReaderInit(&reader));
+          GEOARROW_THROW_NOT_OK(&error, GeoArrowWKBReaderInit(&reader));
+          auto rows = begin + chunk_start;  // split points are chunk-relative
+          uint64_t chunk_bytes = estimateTotalBytes(array_view, rows + thread_work_start,
+                                                    rows + thread_work_end);
+          local_geoms.vertices.reserve(chunk_bytes / sizeof(POINT_T));
 
           for (uint32_t work_offset = thread_work_start; work_offset < thread_work_end;
                work_offset++) {
-            auto arrow_offset = work_offset + offset;
+            // Use iterator indexing (Requires RandomAccessIterator)
+            auto arrow_offset = begin[chunk_start + work_offset];
+
             // handle null value
-            if (ArrowArrayViewIsNull(&array_view_, arrow_offset)) {
+            if (ArrowArrayViewIsNull(array_view, arrow_offset)) {
               local_geoms.AddGeometry(nullptr);
             } else {
-              auto item = ArrowArrayViewGetBytesUnsafe(&array_view_, arrow_offset);
+              auto item = ArrowArrayViewGetBytesUnsafe(array_view, arrow_offset);
               GeoArrowGeometryView geom;
 
               GEOARROW_THROW_NOT_OK(
@@ -629,6 +656,7 @@ class ParallelWkbLoader {
             }
           }
 
+          GeoArrowWKBReaderReset(&reader);
           return std::move(local_geoms);
         };
         pending_local_geoms.push_back(std::move(thread_pool_->enqueue(run, thread_idx)));
@@ -641,15 +669,14 @@ class ParallelWkbLoader {
       sw.stop();
       t_parse += sw.ms();
       sw.start();
-      geoms_.Append(stream, local_geoms);
+      geoms_.Append(stream, local_geoms, t_alloc, t_h2d);
       stream.synchronize();
       sw.stop();
       t_copy += sw.ms();
     }
     GPUSPATIAL_LOG_INFO(
-        "ParallelWkbLoader::Parse: fetched type in %.3f ms, parsed in %.3f ms, copied in "
-        "%.3f ms",
-        t_fetch_type, t_parse, t_copy);
+        "ParallelWkbLoader::Parse: fetched type in %.3f ms, parsed in %.3f ms, alloc %.3f ms, h2d copy %.3f ms",
+        t_fetch_type, t_parse, t_alloc, t_h2d);
   }
 
   DeviceGeometries<POINT_T, INDEX_T> Finish(rmm::cuda_stream_view stream) {
@@ -746,8 +773,10 @@ class ParallelWkbLoader {
             std::move(ps_num_points);
         break;
       }
+      default:
+        throw std::runtime_error("Unsupported geometry type " +
+                                 GeometryTypeToString(geometry_type_) + " in Finish");
     }
-    Clear(stream);
     stream.synchronize();
     sw.stop();
     GPUSPATIAL_LOG_INFO("Finish building DeviceGeometries in %.3f ms", sw.ms());
@@ -756,102 +785,106 @@ class ParallelWkbLoader {
 
  private:
   Config config_;
-  ArrowArrayView array_view_;
   GeometryType geometry_type_;
   detail::DeviceParsedGeometries<POINT_T, INDEX_T> geoms_;
   std::shared_ptr<ThreadPool> thread_pool_;
 
-  void updateGeometryType(int64_t offset, int64_t length) {
+  template <typename OFFSET_IT>
+  void updateGeometryType(const ArrowArrayView* array_view, OFFSET_IT begin,
+                          OFFSET_IT end) {
     if (geometry_type_ == GeometryType::kGeometryCollection) {
-      // it's already the most generic type
       return;
     }
 
-    std::vector<bool> type_flags(8 /*WKB types*/, false);
-    std::vector<std::thread> workers;
+    size_t num_offsets = std::distance(begin, end);
+    if (num_offsets == 0) return;
+
     auto parallelism = thread_pool_->num_threads();
-    auto thread_work_size = (length + parallelism - 1) / parallelism;
-    std::vector<std::future<void>> futures;
+    auto thread_work_size = (num_offsets + parallelism - 1) / parallelism;
+
+    std::vector<std::future<uint32_t>> futures;
+    futures.reserve(parallelism);
+
+    auto read_geom_type = [array_view](int64_t arrow_offset) -> uint32_t {
+      auto item = ArrowArrayViewGetBytesUnsafe(array_view, arrow_offset);
+      const uint8_t* data = item.data.as_uint8;
+      // Safety check: WKB minimal size is 5 bytes (1 byte order + 4 type)
+      if (item.size_bytes < 5) return 0;
+      // 1. Read Endianness Byte (0 = Big/XDR, 1 = Little/NDR)
+      uint8_t wkb_endian = data[0];
+
+      // 2. Read Type (Bytes 1-4)
+      uint32_t geometry_type;
+      std::memcpy(&geometry_type, data + 1, sizeof(uint32_t));
+      const bool host_is_little = detail::is_little_endian();
+      // 3. Swap if mismatch
+      // If (WKB is Little) != (Host is Little), we must swap
+      if ((wkb_endian == 1) != host_is_little) {
+        geometry_type = __builtin_bswap32(geometry_type);
+      }
+
+      // 4. Validate: only the base WKB type codes 0-7 are accepted
+      if (geometry_type > 7) {
+        // Codes above 7 are extended WKB types, which this loader does not support.
+        // The exception is thrown from inside the worker task.
+        throw std::runtime_error("Extended WKB types not supported: " +
+                                 std::to_string(geometry_type));
+      }
+      return geometry_type;
+    };
 
     for (int thread_idx = 0; thread_idx < parallelism; thread_idx++) {
-      auto run = [&](int tid) {
-        auto thread_work_start = tid * thread_work_size;
-        auto thread_work_end = std::min(length, thread_work_start + thread_work_size);
-        GeoArrowWKBReader reader;
-        GeoArrowError error;
-        GEOARROW_THROW_NOT_OK(nullptr, GeoArrowWKBReaderInit(&reader));
-
-        for (uint32_t work_offset = thread_work_start; work_offset < thread_work_end;
-             work_offset++) {
-          auto arrow_offset = work_offset + offset;
-          // handle null value
-          if (ArrowArrayViewIsNull(&array_view_, arrow_offset)) {
-            continue;
+      auto run = [=](int tid) -> uint32_t {
+        size_t thread_work_start = tid * thread_work_size;
+        size_t thread_work_end =
+            std::min(num_offsets, thread_work_start + thread_work_size);
+        uint32_t local_seen_mask = 0;
+
+        if (array_view->null_count == 0) {
+          for (uint32_t work_offset = thread_work_start; work_offset < thread_work_end;
+               work_offset++) {
+            auto arrow_offset = begin[work_offset];
+            auto geometry_type = read_geom_type(arrow_offset);
+
+            local_seen_mask |= (1 << geometry_type);
           }
-          auto item = ArrowArrayViewGetBytesUnsafe(&array_view_, arrow_offset);
-          auto* s = (struct detail::WKBReaderPrivate*)reader.private_data;
-
-          s->data = item.data.as_uint8;
-          s->data0 = s->data;
-          s->size_bytes = item.size_bytes;
-
-          NANOARROW_THROW_NOT_OK(detail::WKBReaderReadEndian(s, &error));
-          uint32_t geometry_type;
-          NANOARROW_THROW_NOT_OK(detail::WKBReaderReadUInt32(s, &geometry_type, &error));
-          if (geometry_type > 7) {
-            throw std::runtime_error(
-                "Extended WKB types are not currently supported, type = " +
-                std::to_string(geometry_type));
+        } else {
+          for (uint32_t work_offset = thread_work_start; work_offset < thread_work_end;
+               work_offset++) {
+            auto arrow_offset = begin[work_offset];
+
+            if (!ArrowArrayViewIsNull(array_view, arrow_offset)) {
+              auto geometry_type = read_geom_type(arrow_offset);
+
+              local_seen_mask |= (1 << geometry_type);
+            }
           }
-          assert(geometry_type < type_flags.size());
-          type_flags[geometry_type] = true;
         }
+
+        return local_seen_mask;
       };
+
       futures.push_back(std::move(thread_pool_->enqueue(run, thread_idx)));
     }
+
+    // Reduction
+    uint32_t global_mask = 0;
     for (auto& fu : futures) {
-      fu.get();
+      global_mask |= fu.get();
     }
 
     std::unordered_set<GeometryType> types;
-    // include existing geometry type
     if (geometry_type_ != GeometryType::kNull) {
       types.insert(geometry_type_);
     }
 
     for (int i = 1; i <= 7; i++) {
-      if (type_flags[i]) {
+      if (global_mask & (1 << i)) {
         types.insert(static_cast<GeometryType>(i));
       }
     }
 
-    GeometryType final_type;
-    // Infer a generic type that can represent the current and previous types
-    switch (types.size()) {
-      case 0:
-        final_type = GeometryType::kNull;
-        break;
-      case 1:
-        final_type = *types.begin();
-        break;
-      case 2: {
-        if (types.count(GeometryType::kPoint) && types.count(GeometryType::kMultiPoint)) {
-          final_type = GeometryType::kMultiPoint;
-        } else if (types.count(GeometryType::kLineString) &&
-                   types.count(GeometryType::kMultiLineString)) {
-          final_type = GeometryType::kMultiLineString;
-        } else if (types.count(GeometryType::kPolygon) &&
-                   types.count(GeometryType::kMultiPolygon)) {
-          final_type = GeometryType::kMultiPolygon;
-        } else {
-          final_type = GeometryType::kGeometryCollection;
-        }
-        break;
-      }
-      default:
-        final_type = GeometryType::kGeometryCollection;
-    }
-    geometry_type_ = final_type;
+    geometry_type_ = getUpcastedGeometryType(types);
   }
 
   template <typename T>
@@ -875,21 +908,122 @@ class ParallelWkbLoader {
     nums.shrink_to_fit(stream);
   }
 
-  size_t estimateTotalBytes(const ArrowArray* array, int64_t offset, int64_t length) {
-    ArrowError arrow_error;
-    if (ArrowArrayViewSetArray(&array_view_, array, &arrow_error) != NANOARROW_OK) {
-      throw std::runtime_error("ArrowArrayViewSetArray error " +
-                               std::string(arrow_error.message));
-    }
+  template <typename OFFSET_IT>
+  size_t estimateTotalBytes(const ArrowArrayView* array_view, OFFSET_IT begin,
+                            OFFSET_IT end) const {
     size_t total_bytes = 0;
-    for (int64_t i = 0; i < length; i++) {
-      if (!ArrowArrayViewIsNull(&array_view_, offset + i)) {
-        auto item = ArrowArrayViewGetBytesUnsafe(&array_view_, offset + i);
-        total_bytes += item.size_bytes - 1      // byte order
-                       - 2 * sizeof(uint32_t);  // type + size
+    if (array_view->null_count == 0) {
+      for (auto it = begin; it != end; ++it) {
+        auto offset = *it;
+        auto item = ArrowArrayViewGetBytesUnsafe(array_view, offset);
+        total_bytes += item.size_bytes;
+      }
+    } else {
+      for (auto it = begin; it != end; ++it) {
+        auto offset = *it;
+        if (!ArrowArrayViewIsNull(array_view, offset)) {
+          auto item = ArrowArrayViewGetBytesUnsafe(array_view, offset);
+          total_bytes += item.size_bytes;
+        }
       }
     }
+
     return total_bytes;
   }
+
+  /**
+   * Splits the rows in [begin, end) into num_threads contiguous ranges that
+   * hold roughly the same number of WKB bytes. Rows are neither reordered nor
+   * split, so the balance is approximate.
+   *
+   * @tparam OFFSET_IT iterator type yielding row indices into array_view
+   * @param array_view binary array view the row indices refer to
+   * @param begin iterator to the first row index
+   * @param end iterator one past the last row index
+   * @param num_threads number of ranges to produce; expected to be positive
+   * @return num_threads + 1 non-decreasing split points relative to begin: the
+   * first is 0, the last is the row count, and range i covers
+   * [result[i], result[i + 1])
+   */
+  template <typename OFFSET_IT>
+  std::vector<uint32_t> assignBalancedWorks(const ArrowArrayView* array_view,
+                                            OFFSET_IT begin, OFFSET_IT end,
+                                            uint32_t num_threads) const {
+    assert(num_threads > 0);
+    size_t num_rows = std::distance(begin, end);
+    // Skip the per-row validity lookup when the array reports no nulls
+    const bool may_have_nulls = array_view->null_count != 0;
+
+    // 1. Build the prefix sum: prefix_sum[k] is the byte count of the first k
+    // rows, with null rows counting as zero bytes. Sizes go straight into a
+    // size_t accumulator, so no per-row value is narrowed to 32 bits.
+    std::vector<size_t> prefix_sum;
+    prefix_sum.reserve(num_rows + 1);
+    prefix_sum.push_back(0);
+
+    size_t total_bytes = 0;
+    for (auto it = begin; it != end; ++it) {
+      auto offset = *it;
+      if (!may_have_nulls || !ArrowArrayViewIsNull(array_view, offset)) {
+        auto item = ArrowArrayViewGetBytesUnsafe(array_view, offset);
+        total_bytes += item.size_bytes;
+      }
+      prefix_sum.push_back(total_bytes);
+    }
+
+    // 2. Calculate balanced split points
+    std::vector<uint32_t> split_points;
+    split_points.reserve(num_threads + 1);
+    split_points.push_back(0);  // The start index for the first thread
+
+    double ideal_chunk_size = static_cast<double>(total_bytes) / num_threads;
+
+    for (uint32_t i = 1; i < num_threads; ++i) {
+      auto target_size = static_cast<size_t>(i * ideal_chunk_size);
+
+      // Find the first index where cumulative bytes >= target_size
+      auto it = std::lower_bound(prefix_sum.begin(), prefix_sum.end(), target_size);
+
+      // Convert iterator to index (row number)
+      auto split_index = static_cast<uint32_t>(std::distance(prefix_sum.begin(), it));
+      split_points.push_back(split_index);
+    }
+
+    // The last split point is always the total number of rows
+    split_points.push_back(static_cast<uint32_t>(num_rows));
+
+    return split_points;
+  }
+
+  /**
+   * Picks the narrowest geometry type able to represent every type in the set.
+   * An empty set yields kNull, a single type is returned unchanged, a simple
+   * type paired with its Multi* variant widens to the Multi* type, and every
+   * other combination falls back to kGeometryCollection.
+   */
+  GeometryType getUpcastedGeometryType(
+      const std::unordered_set<GeometryType>& types) const {
+    auto has = [&types](GeometryType t) { return types.count(t) != 0; };
+
+    if (types.empty()) {
+      return GeometryType::kNull;
+    }
+    if (types.size() == 1) {
+      return *types.begin();
+    }
+    if (types.size() == 2) {
+      if (has(GeometryType::kPoint) && has(GeometryType::kMultiPoint)) {
+        return GeometryType::kMultiPoint;
+      }
+      if (has(GeometryType::kLineString) && has(GeometryType::kMultiLineString)) {
+        return GeometryType::kMultiLineString;
+      }
+      if (has(GeometryType::kPolygon) && has(GeometryType::kMultiPolygon)) {
+        return GeometryType::kMultiPolygon;
+      }
+    }
+    // Any other mix can only be represented as a collection
+    return GeometryType::kGeometryCollection;
+  }
 };
 }  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/mem/memory_manager.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/mem/memory_manager.hpp
new file mode 100644
index 000000000..7160fb6da
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/mem/memory_manager.hpp
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include "rmm/mr/device/cuda_async_memory_resource.hpp"
+#include "rmm/mr/device/device_memory_resource.hpp"
+#include "rmm/mr/device/pool_memory_resource.hpp"
+#include "rmm/mr/device/tracking_resource_adaptor.hpp"
+
+#include <memory>
+namespace gpuspatial {
+/** @brief Optional singleton that routes GPU allocations through RAPIDS RMM memory
+ * resources, offering asynchronous allocation and an optional memory pool.
+ * Once the memory manager is initialized, all GPU memory allocations will use RMM's
+ * memory allocator. The user should call Shutdown() to cleanly release RMM resources
+ * before program exit.
+ */
+class MemoryManager {
+ public:
+  static MemoryManager& instance();  // access point for the single instance
+
+  MemoryManager(const MemoryManager&) = delete;  // non-copyable
+  MemoryManager& operator=(const MemoryManager&) = delete;
+
+  /**
+   * @brief Initializes the memory resources.
+   * @param use_pool Whether to use the RMM pool allocator
+   * @param init_pool_precent Initial pool size as a percentage of total GPU memory
+   */
+  void Init(bool use_pool, int init_pool_precent = 50);
+
+  /**
+   * @brief Estimates the free device memory available, in bytes.
+   * If using a pool: returns (total GPU memory - tracked bytes) * 0.95 safety factor.
+   * If direct: returns the actual CUDA free memory.
+   */
+  size_t get_available_device_memory() const;
+
+  /**
+   * @brief Estimates the free host memory available, in bytes.
+   */
+  static size_t get_available_host_memory();
+  /**
+   * @brief Cleanly resets RMM resources. Automatically called on destruction.
+   */
+  void Shutdown();
+
+ private:
+  MemoryManager() = default;  // private: construct only through instance()
+  ~MemoryManager();
+
+  // --- Type Aliases ---
+  using CudaMR = rmm::mr::cuda_async_memory_resource;
+  using PoolMR = rmm::mr::pool_memory_resource<CudaMR>;
+
+  // Two possible tracker types: wrapping the pool, or wrapping the async resource
+  using PoolTracker = rmm::mr::tracking_resource_adaptor<PoolMR>;
+  using CudaTracker = rmm::mr::tracking_resource_adaptor<CudaMR>;
+
+  // --- State ---
+  bool is_initialized_ = false;
+  bool use_pool_ = false;
+
+  std::unique_ptr<CudaMR> cuda_mr_;
+  std::unique_ptr<PoolMR> pool_mr_;
+  std::unique_ptr<rmm::mr::device_memory_resource> active_resource_;
+  // NOTE(review): untyped; presumably a PoolTracker* or CudaTracker* -- confirm in .cpp
+  void* raw_tracker_ptr_ = nullptr;
+};
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
new file mode 100644
index 000000000..9d6d9d379
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/geom/point.hpp"
+#include "gpuspatial/loader/device_geometries.hpp"
+#include "gpuspatial/loader/parallel_wkb_loader.hpp"
+#include "gpuspatial/refine/rt_spatial_refiner.hpp"
+#include "gpuspatial/refine/spatial_refiner.hpp"
+#include "gpuspatial/relate/relate_engine.cuh"
+#include "gpuspatial/rt/rt_engine.hpp"
+#include "gpuspatial/utils/gpu_timer.hpp"
+#include "gpuspatial/utils/thread_pool.hpp"
+
+#include "geoarrow/geoarrow_type.h"
+#include "nanoarrow/nanoarrow.h"
+
+#include "rmm/cuda_stream_pool.hpp"
+#include "rmm/cuda_stream_view.hpp"
+
+namespace gpuspatial {
+/** SpatialRefiner implementation configured through RTSpatialRefinerConfig. */
+class RTSpatialRefiner : public SpatialRefiner {
+  // TODO: Assuming everything is 2D in double for now
+  using scalar_t = double;
+  static constexpr int n_dim = 2;
+  using index_t = uint32_t;  // type of the index to represent geometries
+  // geometry types
+  using point_t = Point<scalar_t, n_dim>;
+  using multi_point_t = MultiPoint<point_t>;
+  using line_string_t = LineString<point_t>;
+  using multi_line_string_t = MultiLineString<point_t, index_t>;
+  using polygon_t = Polygon<point_t, index_t>;
+  using multi_polygon_t = MultiPolygon<point_t, index_t>;
+  // geometry array types
+  using point_array_t = PointArrayView<point_t, index_t>;
+  using multi_point_array_t = MultiPointArrayView<point_t, index_t>;
+  using line_string_array_t = LineStringArrayView<point_t, index_t>;
+  using multi_line_string_array_t = MultiLineStringArrayView<point_t, index_t>;
+  using polygon_array_t = PolygonArrayView<point_t, index_t>;
+  using multi_polygon_array_t = MultiPolygonArrayView<point_t, index_t>;
+
+  using dev_geometries_t = DeviceGeometries<point_t, index_t>;
+  using box_t = Box<Point<float, n_dim>>;  // note: float, unlike scalar_t (double)
+  using loader_t = ParallelWkbLoader<point_t, index_t>;
+  // NOTE(review): with n_dim == 2 both sides name the same type, so this always holds.
+  static_assert(sizeof(Box<Point<float, 2>>) == sizeof(box_t),
+                "Box<Point<float, 2>> size mismatch!");
+
+ public:
+  struct IndicesMap {
+    // Sorted unique original indices (h_: std::vector copy, d_: rmm device copy)
+    std::vector<uint32_t> h_uniq_indices;
+    rmm::device_uvector<uint32_t> d_uniq_indices{0, rmm::cuda_stream_default};
+    // Mapping from original indices to consecutive zero-based indices
+    rmm::device_uvector<uint32_t> d_reordered_indices{0, rmm::cuda_stream_default};
+  };
+  struct SpatialRefinerContext {
+    rmm::cuda_stream_view cuda_stream;
+#ifdef GPUSPATIAL_PROFILING
+    GPUTimer timer;
+    // counters (milliseconds, per the _ms suffix)
+    double parse_ms = 0.0;
+    double alloc_ms = 0.0;
+    double refine_ms = 0.0;
+    double copy_res_ms = 0.0;
+#endif
+  };
+
+  RTSpatialRefiner() = default;
+
+  RTSpatialRefiner(const RTSpatialRefinerConfig& config);  // NOTE(review): not explicit
+
+  ~RTSpatialRefiner() = default;
+
+  void Clear() override;
+
+  void PushBuild(const ArrowArrayView* build_array) override;
+
+  void FinishBuilding() override;
+
+  uint32_t Refine(const ArrowArrayView* probe_array, Predicate predicate,
+                  uint32_t* build_indices, uint32_t* probe_indices,
+                  uint32_t len) override;
+  // Same signature as Refine(); not part of the SpatialRefiner interface (no override).
+  uint32_t RefinePipelined(const ArrowArrayView* probe_array, Predicate predicate,
+                           uint32_t* build_indices, uint32_t* probe_indices,
+                           uint32_t len);
+
+ private:
+  RTSpatialRefinerConfig config_;  // configuration supplied at construction
+  std::unique_ptr<rmm::cuda_stream_pool> stream_pool_;
+  std::shared_ptr<ThreadPool> thread_pool_;
+  std::unique_ptr<ParallelWkbLoader<point_t, index_t>> wkb_loader_;
+  dev_geometries_t build_geometries_;
+  // Builds `indices_map` from [index_begin, index_end) on `stream` (see IndicesMap).
+  template <typename INDEX_IT>
+  void buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT index_begin,
+                       INDEX_IT index_end, IndicesMap& indices_map) const;
+};
+
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.hpp
new file mode 100644
index 000000000..1c9a6305e
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.hpp
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "gpuspatial/refine/spatial_refiner.hpp"
+#include "gpuspatial/rt/rt_engine.hpp"
+
+#include <memory>
+#include <thread>
+namespace gpuspatial {
+/** Configuration used to initialize an RTSpatialRefiner instance.
+ */
+struct RTSpatialRefinerConfig {
+  // The ray-tracing engine to use
+  std::shared_ptr<RTEngine> rt_engine;
+  // Prefer a fast build of the BVH
+  bool prefer_fast_build = false;
+  // Compress the BVH to save memory
+  bool compact = true;
+  // Loader configurations
+  // How many threads to use for parsing WKBs (hardware_concurrency() may return 0)
+  uint32_t parsing_threads = std::thread::hardware_concurrency();
+  // How many threads are allowed to call PushStream concurrently
+  uint32_t concurrency = 1;  // NOTE(review): no PushStream in SpatialRefiner; confirm
+  // Overlap parsing and refinement by pipelining multiple batches; 1 means no
+  // pipelining
+  uint32_t pipeline_batches = 1;
+  // Host memory quota for the WKB parser, relative to the available memory
+  float wkb_parser_memory_quota = 0.8;
+  // Device memory quota for the relate engine, relative to the available memory
+  float relate_engine_memory_quota = 0.8;
+  // This value determines RELATE_MAX_DEPTH
+  size_t stack_size_bytes = 3 * 1024;
+  bool sort_probe_indices = true;  // Sedona's spatial-join may require ordered output
+};
+// Creates a refiner from `config`, returned through the SpatialRefiner interface.
+std::unique_ptr<SpatialRefiner> CreateRTSpatialRefiner(
+    const RTSpatialRefinerConfig& config);
+
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/spatial_refiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/spatial_refiner.hpp
new file mode 100644
index 000000000..373815cee
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/spatial_refiner.hpp
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include "gpuspatial/relate/predicate.hpp"
+
+#include "nanoarrow/nanoarrow.h"
+
+namespace gpuspatial {
+/** Abstract interface that refines candidate geometry pairs by a spatial predicate.
+ *
+ * The SpatialRefiner is initialized by pushing build-side geometries via PushBuild(),
+ * followed by a call to FinishBuilding(). After that, the Refine() method can be called
+ * multiple times with probe-side geometries and candidate index pairs to filter out
+ * non-matching pairs based on the specified spatial predicate.
+ */
+class SpatialRefiner {
+ public:
+  virtual ~SpatialRefiner() = default;
+
+  /** Clear the internal state of the refiner, allowing it to be reused.
+   */
+  virtual void Clear() = 0;
+
+  /** Push build-side geometries to the refiner.
+   * Build indices given to Refine() count across all geometries pushed here.
+   * @param build_array An ArrowArrayView containing the build-side geometries.
+   */
+  virtual void PushBuild(const ArrowArrayView* build_array) = 0;
+
+  /** Finalize the build-side geometries after all have been pushed. Refine() can
+   * only be used after this call.
+   */
+  virtual void FinishBuilding() = 0;
+
+  /** Refine candidate pairs of geometries based on a spatial predicate.
+   *
+   * @param probe_array An ArrowArrayView containing the probe-side geometries.
+   * @param predicate The spatial predicate to use for refinement.
+   * @param build_indices Build-side index of each candidate pair: a global index in
+   * [0, N), where N is the total number of build geometries pushed.
+   * @param probe_indices Probe-side index of each candidate pair: a local index in
+   * [0, M), where M is the number of geometries in probe_array.
+   * @param len The length of the build_indices and probe_indices arrays.
+   * @return The number of candidate pairs that satisfy the spatial predicate.
+   * NOTE(review): both index arrays are non-const, so the surviving pairs are
+   * presumably written back into their leading entries -- confirm with the
+   * implementation.
+   */
+  virtual uint32_t Refine(const ArrowArrayView* probe_array, Predicate predicate,
+                          uint32_t* build_indices, uint32_t* probe_indices,
+                          uint32_t len) = 0;
+};
+
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/intersection_matrix.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/intersection_matrix.hpp
similarity index 100%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/intersection_matrix.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/intersection_matrix.hpp
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/predicate.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/predicate.hpp
similarity index 100%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/predicate.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/predicate.hpp
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.hpp
similarity index 95%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.hpp
index 4b397453c..038ce7681 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.hpp
@@ -22,13 +22,13 @@
  */
 
 #pragma once
-#include "gpuspatial/geom/line_string.cuh"
-#include "gpuspatial/geom/multi_line_string.cuh"
-#include "gpuspatial/geom/multi_point.cuh"
-#include "gpuspatial/geom/multi_polygon.cuh"
-#include "gpuspatial/geom/point.cuh"
-#include "gpuspatial/geom/polygon.cuh"
-#include "gpuspatial/relate/intersection_matrix.cuh"
+#include "gpuspatial/geom/line_string.hpp"
+#include "gpuspatial/geom/multi_line_string.hpp"
+#include "gpuspatial/geom/multi_point.hpp"
+#include "gpuspatial/geom/multi_polygon.hpp"
+#include "gpuspatial/geom/point.hpp"
+#include "gpuspatial/geom/polygon.hpp"
+#include "gpuspatial/relate/intersection_matrix.hpp"
 // Ref: https://github.com/heterodb/pg-strom/blob/master/src/xpu_postgis.cu
 // A good visualize to cases
 // https://dev.luciad.com/portal/productDocumentation/LuciadFusion/docs/articles/guide/geometry/images/interior_exterior_boundary.png
@@ -169,8 +169,10 @@ DEV_HOST int32_t relate(const POINT_T& P1, bool p1_is_head, const POINT_T& P2,
           if (p1_in_qq != PointLocation::kOutside &&
               p2_in_qq != PointLocation::kOutside) {
             /* P1-P2 is fully contained by Q1-Q2 */
-            if (p1_is_head) retval |= (IntersectionMatrix::BOUND_BOUND_0D | IM__LINE_HEAD_CONTAINED);
-            if (p2_is_tail) retval |= (IntersectionMatrix::BOUND_BOUND_0D | IM__LINE_TAIL_CONTAINED);
+            if (p1_is_head)
+              retval |= (IntersectionMatrix::BOUND_BOUND_0D | IM__LINE_HEAD_CONTAINED);
+            if (p2_is_tail)
+              retval |= (IntersectionMatrix::BOUND_BOUND_0D | IM__LINE_TAIL_CONTAINED);
             if (P1 == P2) {
               if (!p1_is_head && !p2_is_tail)
                 retval |= IntersectionMatrix::INTER_BOUND_0D;
@@ -457,8 +459,9 @@ DEV_HOST_INLINE int32_t relate(const LinearRing<POINT_T>& ring,
         std::min(P1.x(), P2.x()) > mbr.get_max().x() ||
         std::max(P1.y(), P2.y()) < mbr.get_min().y() ||
         std::min(P1.y(), P2.y()) > mbr.get_max().y()) {
-      status = (IntersectionMatrix::INTER_EXTER_1D | IntersectionMatrix::BOUND_EXTER_0D | IntersectionMatrix::EXTER_INTER_2D |
-                IntersectionMatrix::EXTER_BOUND_1D | IntersectionMatrix::EXTER_EXTER_2D);
+      status = (IntersectionMatrix::INTER_EXTER_1D | IntersectionMatrix::BOUND_EXTER_0D |
+                IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D |
+                IntersectionMatrix::EXTER_EXTER_2D);
     } else {
       status = relate(P1, false, P2, false, geom, 0, false);
       // char res[10];
@@ -497,25 +500,32 @@ DEV_HOST_INLINE int32_t relate(const LinearRing<POINT_T>& ring,
    */
   if ((rflags & IntersectionMatrix::INTER_BOUND_2D) == IntersectionMatrix::INTER_BOUND_1D)
     boundary = IntersectionMatrix::BOUND_BOUND_1D;
-  else if ((rflags & IntersectionMatrix::INTER_BOUND_2D) == IntersectionMatrix::INTER_BOUND_0D)
+  else if ((rflags & IntersectionMatrix::INTER_BOUND_2D) ==
+           IntersectionMatrix::INTER_BOUND_0D)
     boundary = IntersectionMatrix::BOUND_BOUND_0D;
 
-  if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 && (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0 &&
+  if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 &&
+      (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0 &&
       (rflags & IntersectionMatrix::INTER_EXTER_2D) == 0) {
     /* ring equals to the polygon */
-    return (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::BOUND_BOUND_1D | IntersectionMatrix::EXTER_EXTER_2D);
-  } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 && (rflags & IntersectionMatrix::INTER_BOUND_2D) == 0 &&
+    return (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::BOUND_BOUND_1D |
+            IntersectionMatrix::EXTER_EXTER_2D);
+  } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 &&
+             (rflags & IntersectionMatrix::INTER_BOUND_2D) == 0 &&
              (rflags & IntersectionMatrix::INTER_EXTER_2D) != 0) {
     if (poly_has_outside) {
       /* disjoint */
-      return (IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D | IntersectionMatrix::EXTER_INTER_2D |
-              IntersectionMatrix::EXTER_BOUND_1D | IntersectionMatrix::EXTER_EXTER_2D);
+      return (IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D |
+              IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D |
+              IntersectionMatrix::EXTER_EXTER_2D);
     } else {
       /* ring fully contains the polygons */
-      return (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D | IntersectionMatrix::INTER_EXTER_2D |
-              IntersectionMatrix::BOUND_EXTER_1D | IntersectionMatrix::EXTER_EXTER_2D);
+      return (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D |
+              IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D |
+              IntersectionMatrix::EXTER_EXTER_2D);
     }
-  } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) != 0 && (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0
+  } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) != 0 &&
+             (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0
              // TODO: Need this? && (rflags & IntersectionMatrix::INTER_EXTER_2D) != 0
   ) {
     /* ring has intersection to the polygon */
@@ -523,26 +533,36 @@ DEV_HOST_INLINE int32_t relate(const LinearRing<POINT_T>& ring,
     if ((rflags & IntersectionMatrix::INTER_EXTER_2D) != 0) {
       boundary |= IntersectionMatrix::BOUND_EXTER_1D;
     }
-    return boundary | (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D | IntersectionMatrix::INTER_EXTER_2D |
-                       IntersectionMatrix::BOUND_INTER_1D | IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D |
-                       IntersectionMatrix::EXTER_EXTER_2D);
-  } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 && (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0 &&
+    return boundary |
+           (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D |
+            IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_INTER_1D |
+            IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D |
+            IntersectionMatrix::EXTER_EXTER_2D);
+  } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 &&
+             (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0 &&
              (rflags & IntersectionMatrix::INTER_EXTER_2D) != 0) {
     if (poly_has_outside) {
       /* ring touched the polygon at a boundary, but no intersection */
       assert(boundary != 0);
-      return boundary | (IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D | IntersectionMatrix::EXTER_INTER_2D |
-                         IntersectionMatrix::EXTER_BOUND_1D | IntersectionMatrix::EXTER_EXTER_2D);
+      return boundary |
+             (IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D |
+              IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D |
+              IntersectionMatrix::EXTER_EXTER_2D);
     } else {
       /* ring fully contains the polygon touched at boundaries */
       assert(boundary != 0);
-      return boundary | (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D | IntersectionMatrix::INTER_EXTER_2D |
-                         IntersectionMatrix::BOUND_EXTER_1D | IntersectionMatrix::EXTER_EXTER_2D);
+      return boundary |
+             (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D |
+              IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D |
+              IntersectionMatrix::EXTER_EXTER_2D);
     }
-  } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) != 0 && (rflags & IntersectionMatrix::INTER_EXTER_2D) == 0) {
+  } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) != 0 &&
+             (rflags & IntersectionMatrix::INTER_EXTER_2D) == 0) {
     /* ring is fully contained by the polygon; might be touched */
-    return boundary | (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::BOUND_INTER_1D | IntersectionMatrix::EXTER_INTER_2D |
-                       IntersectionMatrix::EXTER_BOUND_1D | IntersectionMatrix::EXTER_EXTER_2D);
+    return boundary |
+           (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::BOUND_INTER_1D |
+            IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D |
+            IntersectionMatrix::EXTER_EXTER_2D);
   }
   // FIXME:
   printf("unknown intersection\n");
@@ -663,7 +683,8 @@ DEV_HOST_INLINE int32_t relate(const POINT_T& geom1,
                                const MultiPolygon<POINT_T, INDEX_T>& geom2,
                                ArrayView<PointLocation> locations) {
   assert(geom2.num_polygons() == locations.size());
-  if (geom2.empty()) return IntersectionMatrix::INTER_EXTER_0D | IntersectionMatrix::EXTER_EXTER_2D;
+  if (geom2.empty())
+    return IntersectionMatrix::INTER_EXTER_0D | IntersectionMatrix::EXTER_EXTER_2D;
   int32_t retval = IntersectionMatrix::EXTER_EXTER_2D;
   bool matched = false;
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/relate_engine.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate_engine.cuh
similarity index 66%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/relate_engine.cuh
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate_engine.cuh
index 5fb275078..c83538a75 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/relate_engine.cuh
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate_engine.cuh
@@ -15,10 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/index/detail/rt_engine.hpp"
-#include "gpuspatial/loader/device_geometries.cuh"
-#include "gpuspatial/relate/predicate.cuh"
-#include "gpuspatial/utils/queue.h"
+#include "gpuspatial/loader/device_geometries.hpp"
+#include "gpuspatial/relate/predicate.hpp"
+#include "gpuspatial/rt/rt_engine.hpp"
 
 #include "rmm/cuda_stream_view.hpp"
 
@@ -31,8 +30,9 @@ class RelateEngine {
  public:
   struct Config {
     bool bvh_fast_build = false;
-    bool bvh_fast_compact = true;
+    bool bvh_compact = true;
     float memory_quota = 0.8;
+    int segs_per_aabb = 32;
   };
 
   RelateEngine() = default;
@@ -40,80 +40,94 @@ class RelateEngine {
   RelateEngine(const DeviceGeometries<POINT_T, INDEX_T>* geoms1);
 
   RelateEngine(const DeviceGeometries<POINT_T, INDEX_T>* geoms1,
-               const details::RTEngine* rt_engine);
+               const RTEngine* rt_engine);
 
   void set_config(const Config& config) { config_ = config; }
 
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const DeviceGeometries<POINT_T, INDEX_T>& geoms2, Predicate predicate,
-                Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2);
 
   template <typename GEOM2_ARRAY_VIEW_T>
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const GEOM2_ARRAY_VIEW_T& geom_array2, Predicate predicate,
-                Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2);
 
   // This is a generic version that can accept any two geometry array views
   template <typename GEOM1_ARRAY_VIEW_T, typename GEOM2_ARRAY_VIEW_T>
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const GEOM1_ARRAY_VIEW_T& geom_array1,
                 const GEOM2_ARRAY_VIEW_T& geom_array2, Predicate predicate,
-                Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2);
 
   // These are the specific overloads for RT-accelerated PIP queries
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const PointArrayView<POINT_T, INDEX_T>& geom_array1,
                 const PolygonArrayView<POINT_T, INDEX_T>& geom_array2,
-                Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                Predicate predicate, rmm::device_uvector<INDEX_T>& ids1,
+                rmm::device_uvector<INDEX_T>& ids2);
 
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const MultiPointArrayView<POINT_T, INDEX_T>& geom_array1,
                 const PolygonArrayView<POINT_T, INDEX_T>& geom_array2,
-                Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                Predicate predicate, rmm::device_uvector<INDEX_T>& ids1,
+                rmm::device_uvector<INDEX_T>& ids2);
 
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const PolygonArrayView<POINT_T, INDEX_T>& geom_array1,
                 const PointArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-                Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2);
 
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const PolygonArrayView<POINT_T, INDEX_T>& geom_array1,
                 const MultiPointArrayView<POINT_T, INDEX_T>& geom_array2,
-                Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                Predicate predicate, rmm::device_uvector<INDEX_T>& ids1,
+                rmm::device_uvector<INDEX_T>& ids2);
 
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const PointArrayView<POINT_T, INDEX_T>& geom_array1,
                 const MultiPolygonArrayView<POINT_T, INDEX_T>& geom_array2,
-                Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                Predicate predicate, rmm::device_uvector<INDEX_T>& ids1,
+                rmm::device_uvector<INDEX_T>& ids2);
 
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const MultiPointArrayView<POINT_T, INDEX_T>& geom_array1,
                 const MultiPolygonArrayView<POINT_T, INDEX_T>& geom_array2,
-                Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                Predicate predicate, rmm::device_uvector<INDEX_T>& ids1,
+                rmm::device_uvector<INDEX_T>& ids2);
 
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const MultiPolygonArrayView<POINT_T, INDEX_T>& geom_array1,
                 const PointArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-                Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2);
 
   void Evaluate(const rmm::cuda_stream_view& stream,
                 const MultiPolygonArrayView<POINT_T, INDEX_T>& geom_array1,
                 const MultiPointArrayView<POINT_T, INDEX_T>& geom_array2,
-                Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids);
+                Predicate predicate, rmm::device_uvector<INDEX_T>& ids1,
+                rmm::device_uvector<INDEX_T>& ids2);
 
   void EvaluateImpl(const rmm::cuda_stream_view& stream,
                     const PointArrayView<POINT_T, INDEX_T>& point_array,
                     const MultiPointArrayView<POINT_T, INDEX_T>& multi_point_array,
                     const PolygonArrayView<POINT_T, INDEX_T>& poly_array,
-                    Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids,
-                    bool inverse = false);
+                    Predicate predicate, rmm::device_uvector<INDEX_T>& point_ids,
+                    rmm::device_uvector<INDEX_T>& poly_ids, bool inverse = false);
 
   void EvaluateImpl(const rmm::cuda_stream_view& stream,
                     const PointArrayView<POINT_T, INDEX_T>& point_array,
                     const MultiPointArrayView<POINT_T, INDEX_T>& multi_point_array,
                     const MultiPolygonArrayView<POINT_T, INDEX_T>& multi_poly_array,
-                    Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids,
-                    bool inverse);
+                    Predicate predicate, rmm::device_uvector<INDEX_T>& ids1,
+                    rmm::device_uvector<INDEX_T>& ids2, bool inverse);
+
+  size_t EstimateBVHSize(const rmm::cuda_stream_view& stream,
+                         const PolygonArrayView<POINT_T, INDEX_T>& polys,
+                         ArrayView<uint32_t> poly_ids, int segs_per_aabb);
+
+  size_t EstimateBVHSize(const rmm::cuda_stream_view& stream,
+                         const MultiPolygonArrayView<POINT_T, INDEX_T>& multi_polys,
+                         ArrayView<uint32_t> multi_poly_ids, int segs_per_aabb);
 
   /**
    * Build BVH for a subset of polygons
@@ -122,34 +136,27 @@ class RelateEngine {
    * @param polygon_ids
    * @param buffer
    */
-  OptixTraversableHandle BuildBVH(const rmm::cuda_stream_view& stream,
-                                  const PolygonArrayView<POINT_T, INDEX_T>& polygons,
-                                  ArrayView<uint32_t> polygon_ids,
-                                  rmm::device_uvector<INDEX_T>& seg_begins,
-                                  rmm::device_buffer& buffer,
-                                  rmm::device_uvector<INDEX_T>& aabb_poly_ids,
-                                  rmm::device_uvector<INDEX_T>& aabb_ring_ids);
+  OptixTraversableHandle BuildBVH(
+      const rmm::cuda_stream_view& stream,
+      const PolygonArrayView<POINT_T, INDEX_T>& polygons, ArrayView<uint32_t> polygon_ids,
+      int segs_per_aabb, rmm::device_buffer& buffer,
+      rmm::device_uvector<INDEX_T>& aabb_poly_ids,
+      rmm::device_uvector<INDEX_T>& aabb_ring_ids,
+      rmm::device_uvector<thrust::pair<INDEX_T, INDEX_T>>& aabb_vertex_offsets);
 
   OptixTraversableHandle BuildBVH(
       const rmm::cuda_stream_view& stream,
       const MultiPolygonArrayView<POINT_T, INDEX_T>& multi_polys,
-      ArrayView<uint32_t> multi_poly_ids, rmm::device_uvector<INDEX_T>& seg_begins,
-      rmm::device_uvector<INDEX_T>& part_begins, rmm::device_buffer& buffer,
+      ArrayView<uint32_t> multi_poly_ids, int segs_per_aabb, rmm::device_buffer& buffer,
       rmm::device_uvector<INDEX_T>& aabb_multi_poly_ids,
       rmm::device_uvector<INDEX_T>& aabb_part_ids,
-      rmm::device_uvector<INDEX_T>& aabb_ring_ids);
-
-  size_t EstimateBVHSize(const rmm::cuda_stream_view& stream,
-                         const PolygonArrayView<POINT_T, INDEX_T>& polys,
-                         ArrayView<uint32_t> poly_ids);
-
-  size_t EstimateBVHSize(const rmm::cuda_stream_view& stream,
-                         const MultiPolygonArrayView<POINT_T, INDEX_T>& multi_polys,
-                         ArrayView<uint32_t> multi_poly_ids);
+      rmm::device_uvector<INDEX_T>& aabb_ring_ids,
+      rmm::device_uvector<thrust::pair<INDEX_T, INDEX_T>>& aabb_vertex_offsets,
+      rmm::device_uvector<INDEX_T>& part_begins);
 
  private:
   Config config_;
   const DeviceGeometries<POINT_T, INDEX_T>* geoms1_;
-  const details::RTEngine* rt_engine_;
+  const RTEngine* rt_engine_;
 };
 }  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/launch_parameters.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/launch_parameters.cuh
similarity index 67%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/launch_parameters.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/launch_parameters.cuh
index 555d2504c..a263fbcf2 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/launch_parameters.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/launch_parameters.cuh
@@ -16,13 +16,13 @@
 // under the License.
 #pragma once
 
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/geom/multi_point.cuh"
-#include "gpuspatial/geom/multi_polygon.cuh"
-#include "gpuspatial/geom/point.cuh"
-#include "gpuspatial/geom/polygon.cuh"
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/queue_view.h"
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/geom/multi_point.hpp"
+#include "gpuspatial/geom/multi_polygon.hpp"
+#include "gpuspatial/geom/point.hpp"
+#include "gpuspatial/geom/polygon.hpp"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/queue_view.hpp"
 
 #include <thrust/pair.h>
 
@@ -31,29 +31,29 @@ namespace detail {
 
 template <typename POINT_T>
 struct LaunchParamsPointQuery {
-  using box_t = Box<Point<float, POINT_T::n_dim>>;
-  // Data structures of geometries1
-  bool grouped;
-  ArrayView<uint32_t> prefix_sum;         // Only used when grouped
-  ArrayView<uint32_t> reordered_indices;  // Only used when grouped
-  ArrayView<box_t> mbrs1;                 // MBR of each feature in geometries1
+  using box_t = Box<POINT_T>;
+  // Input
+  ArrayView<box_t> rects;
+  ArrayView<POINT_T> points;
   OptixTraversableHandle handle;
-  //  Data structures of geometries2
-  ArrayView<POINT_T> points2;
-  // Output: Geom1 ID, Geom2 ID
-  QueueView<thrust::pair<uint32_t, uint32_t>> ids;
+  uint32_t* count;
+  // Output
+  QueueView<uint32_t> rect_ids;
+  ArrayView<uint32_t> point_ids;
 };
 
 template <typename POINT_T>
 struct LaunchParamsBoxQuery {
-  using box_t = Box<Point<float, POINT_T::n_dim>>;
+  using box_t = Box<POINT_T>;
   // Input
-  ArrayView<box_t> mbrs1;
-  ArrayView<box_t> mbrs2;
+  ArrayView<box_t> rects1;
+  ArrayView<box_t> rects2;
   // can be either geometries 1 or 2
   OptixTraversableHandle handle;
-  // Output: Geom2 ID, Geom2 ID
-  QueueView<thrust::pair<uint32_t, uint32_t>> ids;
+  uint32_t* count;
+  // Output
+  QueueView<uint32_t> rect1_ids;
+  ArrayView<uint32_t> rect2_ids;
 };
 
 /**
@@ -67,12 +67,15 @@ struct LaunchParamsPolygonPointQuery {
   MultiPointArrayView<point_t, index_t> multi_points;
   PointArrayView<point_t, index_t> points;
   PolygonArrayView<point_t, index_t> polygons;
-  ArrayView<index_t> polygon_ids;  // sorted
-  ArrayView<thrust::pair<index_t, index_t>> ids;
+  ArrayView<index_t> uniq_polygon_ids;  // sorted
+  index_t* query_point_ids;
+  index_t* query_polygon_ids;
+  size_t query_size;
   ArrayView<index_t> seg_begins;
   ArrayView<int> IMs;  // intersection matrices
   OptixTraversableHandle handle;
   ArrayView<index_t> aabb_poly_ids, aabb_ring_ids;
+  ArrayView<thrust::pair<index_t, index_t>> aabb_vertex_offsets;
 };
 
 /**
@@ -87,14 +90,16 @@ struct LaunchParamsPointMultiPolygonQuery {
   // Either MultiPointArrayView or PointArrayView will be used
   MultiPointArrayView<point_t, index_t> multi_points;
   PointArrayView<point_t, index_t> points;
-  ArrayView<index_t> multi_polygon_ids;  // sorted
-  ArrayView<thrust::pair<index_t, index_t>> ids;
-  ArrayView<index_t> seg_begins;
-  ArrayView<index_t> uniq_part_begins;
+  ArrayView<index_t> uniq_multi_polygon_ids;  // sorted
+  index_t* query_point_ids;
+  index_t* query_multi_polygon_ids;
+  size_t query_size;
+  ArrayView<index_t> uniq_part_begins;  // used to calculate z-index for parts
   // each query point has n elements of part_min_y and part_locations, n is # of parts
   ArrayView<int> IMs;  // intersection matrices
   OptixTraversableHandle handle;
   ArrayView<index_t> aabb_multi_poly_ids, aabb_part_ids, aabb_ring_ids;
+  ArrayView<thrust::pair<index_t, index_t>> aabb_vertex_offsets;
 };
 
 }  // namespace detail
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/rt_engine.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/rt_engine.hpp
similarity index 98%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/rt_engine.hpp
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/rt_engine.hpp
index d571feaa7..3b3019e46 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/rt_engine.hpp
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/rt_engine.hpp
@@ -16,7 +16,7 @@
 // under the License.
 #pragma once
 
-#include "gpuspatial/utils/array_view.h"
+#include "gpuspatial/utils/array_view.hpp"
 
 #include "rmm/cuda_stream.hpp"
 #include "rmm/device_uvector.hpp"
@@ -33,7 +33,6 @@
 #define GPUSPATIAL_OPTIX_LAUNCH_PARAMS_NAME "params"
 
 namespace gpuspatial {
-namespace details {
 
 /*! SBT record for a raygen program */
 struct __align__(OPTIX_SBT_RECORD_ALIGNMENT) RaygenRecord {
@@ -160,6 +159,9 @@ RTConfig get_default_rt_config(const std::string& ptx_root);
 
 class RTEngine {
  public:
+  RTEngine(const RTEngine&) = delete;
+  RTEngine& operator=(const RTEngine&) = delete;
+
   RTEngine();
   ~RTEngine();
 
@@ -201,5 +203,4 @@ class RTEngine {
   bool initialized_;
 };
 
-}  // namespace details
 }  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.hpp
similarity index 98%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.hpp
index f1d5fb487..da9339ae7 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.hpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 #include <thrust/swap.h>
 namespace gpuspatial {
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.hpp
similarity index 97%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.hpp
index 2f6941704..4cca08fd0 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.hpp
@@ -28,7 +28,7 @@
 
 #else
 #define DEV_HOST
-#define DEV_HOST_INLINE
+#define DEV_HOST_INLINE inline
 #define DEV_INLINE
 #define CONST_STATIC_INIT(...) = __VA_ARGS__
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.hpp
similarity index 99%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.hpp
index 91c5adce8..9bf3c9267 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.hpp
@@ -68,7 +68,7 @@
 
 #pragma once
 
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 #include <array>
 #include <cfloat>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.hpp
similarity index 95%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.hpp
index a35005ebe..ab6f174e7 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.hpp
@@ -53,7 +53,7 @@ inline void optixCheck(OptixResult res, const char* call, const char* file,
     std::stringstream ss;
     ss << "OptiX API call (" << call << ") failed with error " << optixGetErrorName(res)
        << " (" << file << ":" << line << ")";
-    GPUSPATIAL_LOG_ERROR("Optix API error: {}", ss.str());
+    GPUSPATIAL_LOG_ERROR("Optix API error: %s", ss.str().c_str());  // %s expects char*
     throw GPUException(res, ss.str().c_str());
   }
 }
@@ -64,7 +64,7 @@ inline void cudaCheck(cudaError_t error, const char* call, const char* file,
     std::stringstream ss;
     ss << "CUDA API call (" << call << ") failed with error " << cudaGetErrorString(error)
        << " (" << file << ":" << line << ")";
-    GPUSPATIAL_LOG_ERROR("CUDA API error: {}", ss.str());
+    GPUSPATIAL_LOG_ERROR("CUDA API error: %s", ss.str().c_str());  // %s expects char*
     throw GPUException(ss.str().c_str());
   }
 }
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.hpp
similarity index 99%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.hpp
index 9014a552b..6512fe40c 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.hpp
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 #include <cmath>
 #include <cstdint>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp
index 33c8d47bc..1cec9359f 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/utils/exception.h"
+#include "gpuspatial/utils/exception.hpp"
 
 #include <cuda_runtime.h>
 namespace gpuspatial {
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.cuh
similarity index 98%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.cuh
index 5fc1d54ff..99c02b38c 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.cuh
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 #include <cassert>
 #include <climits>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.hpp
similarity index 94%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.hpp
index 09c2c8aed..31c0b6a7d 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.hpp
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/utils/cuda_utils.h"
-#include "gpuspatial/utils/exception.h"
+#include "gpuspatial/utils/cuda_utils.hpp"
+#include "gpuspatial/utils/exception.hpp"
 
 #include "rmm/cuda_stream_view.hpp"
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/markers.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/markers.hpp
new file mode 100644
index 000000000..ced3af7c3
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/markers.hpp
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#ifndef GPUSPATIAL_PROFILING
+#define DISABLE_NVTX_MARKERS
+#endif
+
+#ifndef DISABLE_NVTX_MARKERS
+#include <nvtx3/nvtx3.hpp>
+#endif
+// This file provides a simple wrapper around NVTX3 for marking GPU code regions and
+// events for profiling purposes.
+namespace gpuspatial {
+
+struct Category {
+  static constexpr uint32_t KernelWorkitems = 1;    // category of MarkWorkitems markers
+  static constexpr uint32_t IntervalWorkitems = 2;  // default category of payload Ranges
+};
+
+// Colors in ARGB format (Alpha, Red, Green, Blue), i.e. 0xAARRGGBB
+struct Color {
+  static constexpr uint32_t Red = 0xFF880000;
+  static constexpr uint32_t Green = 0xFF008800;
+  static constexpr uint32_t Blue = 0xFF000088;
+  static constexpr uint32_t Yellow = 0xFFFFFF00;
+  static constexpr uint32_t Default = 0;  // create_attr attaches no color for this value
+};
+
+#ifndef DISABLE_NVTX_MARKERS
+
+struct Instrument {
+  // ---------------------------------------------------------------------------
+  // Helper: build event attributes from a message, ARGB color and category id
+  // ---------------------------------------------------------------------------
+  // All attributes are handed to the nvtx3::event_attributes constructor at once,
+  // so every combination of optional attributes is built in a single expression.
+  // An attribute is attached only when the caller passed a non-default value:
+  // Color::Default (0) and category 0 are both treated as "not set".
+  static nvtx3::event_attributes create_attr(const char* msg, uint32_t color_val,
+                                             uint32_t category_val) {
+    const bool has_color = color_val != Color::Default;
+    const bool has_category = category_val != 0;
+
+    if (has_color && has_category) {
+      return nvtx3::event_attributes{msg, nvtx3::color{color_val},
+                                     nvtx3::category{category_val}};
+    }
+    if (has_color) {
+      return nvtx3::event_attributes{msg, nvtx3::color{color_val}};
+    }
+    if (has_category) {
+      return nvtx3::event_attributes{msg, nvtx3::category{category_val}};
+    }
+    // Neither color nor category requested: message-only attributes.
+    return nvtx3::event_attributes{msg};
+  }
+
+  // ---------------------------------------------------------------------------
+  // Instant Markers
+  // ---------------------------------------------------------------------------
+  // Emits an instantaneous marker with an optional color and category.
+  static void Mark(const char* message, uint32_t color = Color::Default,
+                   uint32_t category = 0) {
+    nvtx3::mark(create_attr(message, color, category));
+  }
+
+  // Emits an instantaneous marker carrying an integer payload. As in create_attr,
+  // the color attribute is attached only when it differs from Color::Default, so
+  // a category-only call leaves the color attribute unset.
+  static void MarkInt(int64_t value, const char* message, uint32_t color = Color::Default,
+                      uint32_t category = 0) {
+    if (color != Color::Default) {
+      nvtx3::event_attributes attr{message, nvtx3::color{color},
+                                   nvtx3::category{category}, nvtx3::payload{value}};
+      nvtx3::mark(attr);
+    } else if (category != 0) {
+      nvtx3::event_attributes attr{message, nvtx3::category{category},
+                                   nvtx3::payload{value}};
+      nvtx3::mark(attr);
+    } else {
+      // Most common case: message + payload only.
+      nvtx3::event_attributes attr{message, nvtx3::payload{value}};
+      nvtx3::mark(attr);
+    }
+  }
+
+  // Emits a marker whose payload is a work-item count, tagged with the
+  // KernelWorkitems category.
+  static void MarkWorkitems(uint64_t items, const char* message = "Workitems") {
+    nvtx3::event_attributes attr{message, nvtx3::payload{items},
+                                 nvtx3::category{Category::KernelWorkitems}};
+    nvtx3::mark(attr);
+  }
+
+  // ---------------------------------------------------------------------------
+  // Scoped Ranges (RAII)
+  // ---------------------------------------------------------------------------
+  struct Range {
+    nvtx3::scoped_range range;
+
+    // Standard range: message with optional color and category.
+    explicit Range(const char* message, uint32_t color = Color::Default,
+                   uint32_t category = 0)
+        : range(Instrument::create_attr(message, color, category)) {}
+
+    // Payload range (for workitems/intervals): message, payload and category.
+    explicit Range(const char* message, uint64_t payload,
+                   uint32_t category = Category::IntervalWorkitems)
+        : range(nvtx3::event_attributes{message, nvtx3::payload{payload},
+                                        nvtx3::category{category}}) {}
+  };
+};
+
+#else
+
+// -----------------------------------------------------------------------------
+// No-Op Implementation: same call signatures as the NVTX version, empty bodies
+// -----------------------------------------------------------------------------
+struct Instrument {
+  static inline void Mark(const char*, uint32_t = 0, uint32_t = 0) {}
+  static inline void MarkInt(int64_t, const char*, uint32_t = 0, uint32_t = 0) {}
+  static inline void MarkWorkitems(uint64_t, const char* = "Workitems") {}
+
+  struct Range {
+    explicit Range(const char*, uint32_t = 0, uint32_t = 0) {}
+    explicit Range(const char*, uint64_t, uint32_t = 0) {}
+  };
+};
+
+#endif  // DISABLE_NVTX_MARKERS
+
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/mem_utils.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/mem_utils.hpp
index 1b36c934f..779387676 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/mem_utils.hpp
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/mem_utils.hpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/utils/exception.h"
+#include "gpuspatial/utils/exception.hpp"
 
 #include "rmm/cuda_stream_view.hpp"
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.hpp
similarity index 98%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.hpp
index ded74f02b..0867ed007 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.hpp
@@ -19,7 +19,7 @@
  */
 
 #pragma once
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 #include <vector_types.h>
 #include <cuda/std/cmath>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.hpp
similarity index 99%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.hpp
index 73ac54d01..2c21ea5e4 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.hpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/utils/exception.h"
+#include "gpuspatial/utils/exception.hpp"
 
 #include <cuda_runtime.h>  // For CUDA memory management functions
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.hpp
similarity index 95%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.hpp
index 29beac229..c1921dca3 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.hpp
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 #pragma once
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/queue_view.h"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/queue_view.hpp"
 
 #include "rmm/cuda_stream_view.hpp"
 #include "rmm/device_scalar.hpp"
@@ -41,6 +41,7 @@ class Queue {
     if (counter_ == nullptr) {
       counter_ = std::make_unique<rmm::device_scalar<SIZE_T>>(stream);
     }
+    Clear(stream);
   }
 
   void Clear(const rmm::cuda_stream_view& stream) {
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.hpp
similarity index 96%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.hpp
index e4b10ef9d..f907bff57 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.hpp
@@ -16,8 +16,8 @@
 // under the License.
 #pragma once
 
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 #include <cooperative_groups.h>
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp
similarity index 100%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.hpp
similarity index 100%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.hpp
diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/type_traits.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/type_traits.hpp
similarity index 100%
rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/type_traits.h
rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/type_traits.hpp
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc b/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc
index 58ef354ab..5c6b530a8 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc
@@ -14,157 +14,372 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
+
 #include "gpuspatial/gpuspatial_c.h"
-#include "gpuspatial/index/spatial_joiner.hpp"
+#include "gpuspatial/index/rt_spatial_index.hpp"
+#include "gpuspatial/index/spatial_index.hpp"
+#include "gpuspatial/mem/memory_manager.hpp"
+#include "gpuspatial/refine/rt_spatial_refiner.hpp"
+#include "gpuspatial/rt/rt_engine.hpp"
+#include "gpuspatial/utils/exception.hpp"
+
+#include "nanoarrow/nanoarrow.hpp"
 
 #include <threads.h>
+#include <algorithm>
+#include <cstring>
 #include <memory>
-#define GPUSPATIAL_ERROR_MSG_BUFFER_SIZE (1024)
 
-struct GpuSpatialJoinerExporter {
-  static void Export(std::unique_ptr<gpuspatial::StreamingJoiner>& idx,
-                     struct GpuSpatialJoiner* out) {
-    out->private_data = idx.release();
-    out->init = &CInit;
+// -----------------------------------------------------------------------------
+// INTERNAL HELPERS
+// -----------------------------------------------------------------------------
+// This is what the private_data points to for the public C interfaces
+// Pairs an implementation-specific payload with the text of the most recent
+// failure, so the C callbacks can report errors through get_last_error.
+template <typename T>
+struct GpuSpatialWrapper {
+  T payload;
+  std::string last_error;  // Message of the last failure; empty after a success
+};
+
+// The unified error handling wrapper
+// Func: The lambda containing the logic
+template <typename T, typename Func>
+int SafeExecute(GpuSpatialWrapper<T>* wrapper, Func&& func) {
+  try {
+    func();
+    wrapper->last_error.clear();
+    return 0;
+  } catch (const std::exception& e) {
+    wrapper->last_error = std::string(e.what());
+    return EINVAL;
+  } catch (...) {
+    wrapper->last_error = "Unknown internal error";
+    return EINVAL;
+  }
+}
+
+// -----------------------------------------------------------------------------
+// IMPLEMENTATION
+// -----------------------------------------------------------------------------
+
+// Builds the C-facing GpuSpatialRuntime: allocates its private state and wires
+// up the init / release / get_last_error callbacks.
+struct GpuSpatialRuntimeExporter {
+  struct Payload {
+    // Engine handed to every index/refiner created from this runtime.
+    std::shared_ptr<gpuspatial::RTEngine> rt_engine;
+    // CUDA device ordinal recorded by CInit. The index/refiner wrappers read it
+    // to re-select the device before GPU work. Stays 0 until CInit succeeds.
+    int device_id;
+  };
+
+  using private_data_t = GpuSpatialWrapper<Payload>;
+
+  // Allocates the private state and fills in the callbacks of `out`.
+  static void Export(struct GpuSpatialRuntime* out) {
+    private_data_t* private_data =
+        new private_data_t{Payload{std::make_shared<gpuspatial::RTEngine>(), 0}, ""};
+    out->init = CInit;
+    out->release = CRelease;
+    out->get_last_error = CGetLastError;
+    out->private_data = private_data;
+  }
+
+  // Selects the configured CUDA device, initializes the memory manager and the
+  // engine. Returns 0 on success, EINVAL on failure (see get_last_error).
+  static int CInit(GpuSpatialRuntime* self, GpuSpatialRuntimeConfig* config) {
+    auto* private_data = static_cast<private_data_t*>(self->private_data);
+    return SafeExecute(private_data, [&] {
+      // Constructing std::string from a null pointer is undefined behavior, so
+      // reject a missing config / PTX root as an ordinary error instead.
+      if (config == nullptr || config->ptx_root == nullptr) {
+        throw std::runtime_error("GpuSpatialRuntimeConfig or its ptx_root is null");
+      }
+
+      std::string ptx_root(config->ptx_root);
+      auto rt_config = gpuspatial::get_default_rt_config(ptx_root);
+
+      GPUSPATIAL_LOG_INFO("Initializing GpuSpatialRuntime on device %d, PTX root %s",
+                          config->device_id, config->ptx_root);
+
+      CUDA_CHECK(cudaSetDevice(config->device_id));
+      // Remember the device: objects created from this runtime call
+      // cudaSetDevice(payload.device_id) and would otherwise always pick device 0.
+      private_data->payload.device_id = config->device_id;
+
+      gpuspatial::MemoryManager::instance().Init(config->use_cuda_memory_pool,
+                                                 config->cuda_memory_pool_init_precent);
+
+      private_data->payload.rt_engine->Init(rt_config);
+    });
+  }
+
+  // Frees the private state and shuts the memory manager down.
+  static void CRelease(GpuSpatialRuntime* self) {
+    // Drop our engine reference before tearing down the memory resources.
+    // NOTE(review): presumably RTEngine may hold allocations served by the
+    // MemoryManager's resource; releasing it first avoids freeing them through
+    // an already-destroyed resource. Confirm against RTEngine.
+    delete static_cast<private_data_t*>(self->private_data);
+    self->private_data = nullptr;
+    gpuspatial::MemoryManager::instance().Shutdown();
+  }
+
+  // Returns the message stored by the last failing call (empty if none). The
+  // pointer refers to storage inside the private data.
+  static const char* CGetLastError(GpuSpatialRuntime* self) {
+    auto* private_data = static_cast<private_data_t*>(self->private_data);
+    return private_data->last_error.c_str();
+  }
+};
+
+// Public C entry point: populates `runtime` with callbacks and freshly allocated
+// private data. The runtime still has to be initialized through its init callback.
+void GpuSpatialRuntimeCreate(struct GpuSpatialRuntime* runtime) {
+  GpuSpatialRuntimeExporter::Export(runtime);
+}
+
+using runtime_data_t = GpuSpatialRuntimeExporter::private_data_t;
+
+struct GpuSpatialIndexFloat2DExporter {
+  using scalar_t = float;
+  static constexpr int n_dim = 2;
+  using self_t = SedonaFloatIndex2D;
+  using spatial_index_t = gpuspatial::SpatialIndex<scalar_t, n_dim>;
+
+  struct Payload {
+    std::unique_ptr<spatial_index_t> index;
+    runtime_data_t* rdata;
+  };
+
+  struct ResultBuffer {
+    std::vector<uint32_t> build_indices;
+    std::vector<uint32_t> probe_indices;
+    ResultBuffer() = default;
+
+    ResultBuffer(const ResultBuffer&) = delete;
+    ResultBuffer& operator=(const ResultBuffer&) = delete;
+
+    ResultBuffer(ResultBuffer&&) = default;
+    ResultBuffer& operator=(ResultBuffer&&) = default;
+  };
+
+  using private_data_t = GpuSpatialWrapper<Payload>;
+  using context_t = GpuSpatialWrapper<ResultBuffer>;
+
+  static void Export(const struct GpuSpatialIndexConfig* config,
+                     struct SedonaFloatIndex2D* out) {
+    auto* rdata = static_cast<runtime_data_t*>(config->runtime->private_data);
+
+    gpuspatial::RTSpatialIndexConfig index_config;
+
+    index_config.rt_engine = rdata->payload.rt_engine;
+    index_config.concurrency = config->concurrency;
+
+    // Create SpatialIndex may involve GPU operations, set device here
+    CUDA_CHECK(cudaSetDevice(rdata->payload.device_id));
+
+    auto uniq_index = gpuspatial::CreateRTSpatialIndex<float, 2>(index_config);
+
     out->clear = &CClear;
-    out->push_build = &CPushBuild;
-    out->finish_building = &CFinishBuilding;
     out->create_context = &CCreateContext;
     out->destroy_context = &CDestroyContext;
-    out->push_stream = &CPushStream;
+    out->push_build = &CPushBuild;
+    out->finish_building = &CFinishBuilding;
+    out->probe = &CProbe;
     out->get_build_indices_buffer = &CGetBuildIndicesBuffer;
-    out->get_stream_indices_buffer = &CGetStreamIndicesBuffer;
+    out->get_probe_indices_buffer = &CGetProbeIndicesBuffer;
+    out->get_last_error = &CGetLastError;
+    out->context_get_last_error = &CContextGetLastError;
     out->release = &CRelease;
-    out->last_error = new char[GPUSPATIAL_ERROR_MSG_BUFFER_SIZE];
-  }
-
-  static int CInit(struct GpuSpatialJoiner* self, struct GpuSpatialJoinerConfig* config) {
-    int err = 0;
-    auto* joiner = static_cast<gpuspatial::StreamingJoiner*>(self->private_data);
-    try {
-      gpuspatial::InitSpatialJoiner(joiner, config->ptx_root, config->concurrency);
-    } catch (const std::exception& e) {
-      int len =
-          std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1));
-      auto* last_error = const_cast<char*>(self->last_error);
-      strncpy(last_error, e.what(), len);
-      last_error[len] = '\0';
-      err = EINVAL;
-    }
-    return err;
-  }
-
-  static void CCreateContext(struct GpuSpatialJoiner* self,
-                             struct GpuSpatialJoinerContext* context) {
-    auto* joiner = static_cast<gpuspatial::StreamingJoiner*>(self->private_data);
-    context->private_data = new std::shared_ptr(joiner->CreateContext());
-    context->last_error = new char[GPUSPATIAL_ERROR_MSG_BUFFER_SIZE];
-    context->build_indices = new std::vector<uint32_t>();
-    context->stream_indices = new std::vector<uint32_t>();
-  }
-
-  static void CDestroyContext(struct GpuSpatialJoinerContext* context) {
-    delete (std::shared_ptr<gpuspatial::StreamingJoiner::Context>*)context->private_data;
-    delete[] context->last_error;
-    delete (std::vector<uint32_t>*)context->build_indices;
-    delete (std::vector<uint32_t>*)context->stream_indices;
+    out->private_data = new private_data_t{Payload{std::move(uniq_index), rdata}, ""};
+  }
+
+  // Allocates a per-caller context holding the result vectors and its own
+  // last_error string; it is freed by CDestroyContext.
+  static void CCreateContext(struct SedonaSpatialIndexContext* context) {
+    context->private_data = new context_t();
+  }
+
+  static void CDestroyContext(struct SedonaSpatialIndexContext* context) {
+    delete static_cast<context_t*>(context->private_data);
     context->private_data = nullptr;
-    context->last_error = nullptr;
-    context->build_indices = nullptr;
-    context->stream_indices = nullptr;
-  }
-
-  static void CClear(struct GpuSpatialJoiner* self) {
-    auto* joiner = static_cast<gpuspatial::StreamingJoiner*>(self->private_data);
-    joiner->Clear();
-  }
-
-  static int CPushBuild(struct GpuSpatialJoiner* self, const struct ArrowSchema* schema,
-                        const struct ArrowArray* array, int64_t offset, int64_t length) {
-    auto* joiner = static_cast<gpuspatial::StreamingJoiner*>(self->private_data);
-    int err = 0;
-    try {
-      joiner->PushBuild(schema, array, offset, length);
-    } catch (const std::exception& e) {
-      int len =
-          std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1));
-      auto* last_error = const_cast<char*>(self->last_error);
-      strncpy(last_error, e.what(), len);
-      last_error[len] = '\0';
-      err = EINVAL;
-    }
-    return err;
-  }
-
-  static int CFinishBuilding(struct GpuSpatialJoiner* self) {
-    auto* joiner = static_cast<gpuspatial::StreamingJoiner*>(self->private_data);
-    int err = 0;
-    try {
-      joiner->FinishBuilding();
-    } catch (const std::exception& e) {
-      int len =
-          std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1));
-      auto* last_error = const_cast<char*>(self->last_error);
-      strncpy(last_error, e.what(), len);
-      last_error[len] = '\0';
-      err = EINVAL;
-    }
-    return err;
-  }
-
-  static int CPushStream(struct GpuSpatialJoiner* self,
-                         struct GpuSpatialJoinerContext* context,
-                         const struct ArrowSchema* schema, const struct ArrowArray* array,
-                         int64_t offset, int64_t length,
-                         enum GpuSpatialPredicate predicate, int32_t array_index_offset) {
-    auto* joiner = static_cast<gpuspatial::StreamingJoiner*>(self->private_data);
-    auto* private_data =
-        (std::shared_ptr<gpuspatial::StreamingJoiner::Context>*)context->private_data;
-    int err = 0;
-    try {
-      joiner->PushStream(private_data->get(), schema, array, offset, length,
-                         static_cast<gpuspatial::Predicate>(predicate),
-                         static_cast<std::vector<uint32_t>*>(context->build_indices),
-                         static_cast<std::vector<uint32_t>*>(context->stream_indices),
-                         array_index_offset);
-    } catch (const std::exception& e) {
-      int len =
-          std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1));
-      strncpy((char*)context->last_error, e.what(), len);
-      ((char*)context->last_error)[len] = '\0';
-      err = EINVAL;
-    }
-    return err;
-  }
-
-  static void CGetBuildIndicesBuffer(struct GpuSpatialJoinerContext* context,
-                                     void** build_indices,
+  }
+
+  // Forwards to SpatialIndex::Clear(); returns 0, or EINVAL with the message
+  // available through get_last_error.
+  static int CClear(self_t* self) {
+    auto* wrapper = static_cast<private_data_t*>(self->private_data);
+    return SafeExecute(wrapper, [self] { use_index(self).Clear(); });
+  }
+
+  // Reinterprets `buf` as an array of `n_rects` boxes and forwards them to
+  // SpatialIndex::PushBuild(). Returns 0, or EINVAL (see get_last_error).
+  static int CPushBuild(self_t* self, const float* buf, uint32_t n_rects) {
+    auto* wrapper = static_cast<private_data_t*>(self->private_data);
+    return SafeExecute(wrapper, [&] {
+      const auto* boxes = reinterpret_cast<const spatial_index_t::box_t*>(buf);
+      use_index(self).PushBuild(boxes, n_rects);
+    });
+  }
+
+  // Forwards to SpatialIndex::FinishBuilding(); returns 0, or EINVAL with the
+  // message available through get_last_error.
+  static int CFinishBuilding(self_t* self) {
+    auto* wrapper = static_cast<private_data_t*>(self->private_data);
+    return SafeExecute(wrapper, [&] { use_index(self).FinishBuilding(); });
+  }
+
+  // Reinterprets `buf` as `n_rects` boxes and calls SpatialIndex::Probe(), passing
+  // the context's build/probe index vectors as outputs. Failures are recorded on
+  // the context (context_get_last_error), not on the index.
+  static int CProbe(self_t* self, SedonaSpatialIndexContext* context, const float* buf,
+                    uint32_t n_rects) {
+    auto* ctx = static_cast<context_t*>(context->private_data);
+    return SafeExecute(ctx, [&] {
+      const auto* boxes = reinterpret_cast<const spatial_index_t::box_t*>(buf);
+      auto& results = static_cast<context_t*>(context->private_data)->payload;
+      use_index(self).Probe(boxes, n_rects, &results.build_indices,
+                            &results.probe_indices);
+    });
+  }
+
+  static void CGetBuildIndicesBuffer(struct SedonaSpatialIndexContext* context,
+                                     uint32_t** build_indices,
                                      uint32_t* build_indices_length) {
-    auto* vec = static_cast<std::vector<uint32_t>*>(context->build_indices);
+    auto* ctx = static_cast<context_t*>(context->private_data);
+    *build_indices = ctx->payload.build_indices.data();
+    *build_indices_length = ctx->payload.build_indices.size();
+  }
 
-    *build_indices = vec->data();
-    *build_indices_length = vec->size();
+  // Exposes the context's probe-index vector as a raw pointer plus element count.
+  // The pointer refers to storage owned by the context.
+  static void CGetProbeIndicesBuffer(struct SedonaSpatialIndexContext* context,
+                                     uint32_t** probe_indices,
+                                     uint32_t* probe_indices_length) {
+    auto& indices = static_cast<context_t*>(context->private_data)->payload.probe_indices;
+    *probe_indices = indices.data();
+    *probe_indices_length = indices.size();
   }
 
-  static void CGetStreamIndicesBuffer(struct GpuSpatialJoinerContext* context,
-                                      void** stream_indices,
-                                      uint32_t* stream_indices_length) {
-    auto* vec = static_cast<std::vector<uint32_t>*>(context->stream_indices);
+  // Returns the message of the last failing index call (empty if none). The
+  // pointer refers to storage inside the index's private data.
+  static const char* CGetLastError(self_t* self) {
+    const auto* wrapper = static_cast<private_data_t*>(self->private_data);
+    return wrapper->last_error.c_str();
+  }
 
-    *stream_indices = vec->data();
-    *stream_indices_length = vec->size();
+  // Returns the message of the last failing call recorded on this context.
+  static const char* CContextGetLastError(SedonaSpatialIndexContext* self) {
+    const auto* ctx = static_cast<context_t*>(self->private_data);
+    return ctx->last_error.c_str();
   }
 
-  static void CRelease(struct GpuSpatialJoiner* self) {
-    delete[] self->last_error;
-    auto* joiner = static_cast<gpuspatial::StreamingJoiner*>(self->private_data);
-    delete joiner;
+  static void CRelease(self_t* self) {
+    delete static_cast<private_data_t*>(self->private_data);
     self->private_data = nullptr;
-    self->last_error = nullptr;
+  }
+
+  // Selects the runtime's CUDA device on the calling thread, then returns the
+  // wrapped index. CUDA_CHECK reports a failing cudaSetDevice.
+  static spatial_index_t& use_index(self_t* self) {
+    auto* wrapper = static_cast<private_data_t*>(self->private_data);
+    auto& payload = wrapper->payload;
+
+    CUDA_CHECK(cudaSetDevice(payload.rdata->payload.device_id));
+    return *(payload.index);
   }
 };
 
-void GpuSpatialJoinerCreate(struct GpuSpatialJoiner* joiner) {
-  auto idx = gpuspatial::CreateSpatialJoiner();
-  GpuSpatialJoinerExporter::Export(idx, joiner);
+// Public C entry point: creates a float 2D spatial index bound to the runtime
+// referenced by `config` and fills `index` with its callbacks.
+// Returns 0 on success, EINVAL on failure (the reason is logged). No C++
+// exception is allowed to propagate to the C caller.
+int GpuSpatialIndexFloat2DCreate(struct SedonaFloatIndex2D* index,
+                                 const struct GpuSpatialIndexConfig* config) {
+  try {
+    GpuSpatialIndexFloat2DExporter::Export(config, index);
+  } catch (const std::exception& e) {
+    GPUSPATIAL_LOG_ERROR("Failed to create GpuSpatialIndexFloat2D: %s", e.what());
+    return EINVAL;
+  } catch (...) {
+    // Anything not derived from std::exception must also stop at the C boundary.
+    GPUSPATIAL_LOG_ERROR("Failed to create GpuSpatialIndexFloat2D: %s",
+                         "Unknown internal error");
+    return EINVAL;
+  }
+  return 0;
+}
+
+// Builds the C-facing SedonaSpatialRefiner on top of gpuspatial::SpatialRefiner.
+// Each callback returns 0 on success or EINVAL on failure; the failure text is
+// available through get_last_error.
+struct GpuSpatialRefinerExporter {
+  struct Payload {
+    std::unique_ptr<gpuspatial::SpatialRefiner> refiner;
+    // Reusable nanoarrow views; CInitSchema initializes one per side.
+    nanoarrow::UniqueArrayView build_array_view;
+    nanoarrow::UniqueArrayView probe_array_view;
+    // Private data of the runtime this refiner was created from (not freed here).
+    runtime_data_t* rdata;
+  };
+  using private_data_t = GpuSpatialWrapper<Payload>;
+
+  // Creates the refiner from `config` and fills `out` with callbacks and private
+  // data. May throw; the caller is expected to catch.
+  static void Export(const GpuSpatialRefinerConfig* config,
+                     struct SedonaSpatialRefiner* out) {
+    auto* rdata = static_cast<runtime_data_t*>(config->runtime->private_data);
+
+    gpuspatial::RTSpatialRefinerConfig refiner_config;
+
+    refiner_config.rt_engine = rdata->payload.rt_engine;
+    refiner_config.concurrency = config->concurrency;
+    refiner_config.compact = config->compress_bvh;
+    refiner_config.pipeline_batches = config->pipeline_batches;
+
+    // Creating the refiner may involve GPU operations, so set the device here
+    CUDA_CHECK(cudaSetDevice(rdata->payload.device_id));
+
+    auto refiner = gpuspatial::CreateRTSpatialRefiner(refiner_config);
+
+    out->clear = &CClear;
+    out->init_schema = &CInitSchema;
+    out->push_build = &CPushBuild;
+    out->finish_building = &CFinishBuilding;
+    out->refine = &CRefine;
+    out->get_last_error = &CGetLastError;
+    out->release = &CRelease;
+    out->private_data =
+        new private_data_t{Payload{std::move(refiner), nanoarrow::UniqueArrayView(),
+                                   nanoarrow::UniqueArrayView(), rdata},
+                           ""};
+  }
+
+  // Forwards to SpatialRefiner::Clear().
+  static int CClear(SedonaSpatialRefiner* self) {
+    return SafeExecute(static_cast<private_data_t*>(self->private_data),
+                       [&] { use_refiner(self).Clear(); });
+  }
+
+  // Initializes the build-side and probe-side array views from their schemas.
+  static int CInitSchema(SedonaSpatialRefiner* self, const ArrowSchema* build_schema,
+                         const ArrowSchema* probe_schema) {
+    return SafeExecute(static_cast<private_data_t*>(self->private_data), [&] {
+      auto* private_data = static_cast<private_data_t*>(self->private_data);
+      ArrowError arrow_error;
+      if (ArrowArrayViewInitFromSchema(private_data->payload.build_array_view.get(),
+                                       build_schema, &arrow_error) != NANOARROW_OK) {
+        throw std::runtime_error("ArrowArrayViewInitFromSchema error " +
+                                 std::string(arrow_error.message));
+      }
+      if (ArrowArrayViewInitFromSchema(private_data->payload.probe_array_view.get(),
+                                       probe_schema, &arrow_error) != NANOARROW_OK) {
+        throw std::runtime_error("ArrowArrayViewInitFromSchema error " +
+                                 std::string(arrow_error.message));
+      }
+    });
+  }
+
+  // Binds `build_array` to the build-side view and pushes it to the refiner.
+  static int CPushBuild(SedonaSpatialRefiner* self, const ArrowArray* build_array) {
+    return SafeExecute(static_cast<private_data_t*>(self->private_data), [&] {
+      auto* private_data = static_cast<private_data_t*>(self->private_data);
+      auto* array_view = private_data->payload.build_array_view.get();
+      ArrowError arrow_error;
+
+      if (ArrowArrayViewSetArray(array_view, build_array, &arrow_error) != NANOARROW_OK) {
+        throw std::runtime_error("ArrowArrayViewSetArray error " +
+                                 std::string(arrow_error.message));
+      }
+
+      use_refiner(self).PushBuild(array_view);
+    });
+  }
+
+  // Forwards to SpatialRefiner::FinishBuilding().
+  static int CFinishBuilding(SedonaSpatialRefiner* self) {
+    return SafeExecute(static_cast<private_data_t*>(self->private_data),
+                       [&] { use_refiner(self).FinishBuilding(); });
+  }
+
+  // Binds `probe_array` to the probe-side view and calls SpatialRefiner::Refine()
+  // with the `indices_size` candidate pairs in build_indices / probe_indices.
+  // The value returned by Refine() is stored in *new_indices_size.
+  static int CRefine(SedonaSpatialRefiner* self, const ArrowArray* probe_array,
+                     SedonaSpatialRelationPredicate predicate, uint32_t* build_indices,
+                     uint32_t* probe_indices, uint32_t indices_size,
+                     uint32_t* new_indices_size) {
+    return SafeExecute(static_cast<private_data_t*>(self->private_data), [&] {
+      auto* private_data = static_cast<private_data_t*>(self->private_data);
+      // The probe array must go through the view initialized from the probe
+      // schema; the build-side view describes the build schema, which may differ.
+      auto* array_view = private_data->payload.probe_array_view.get();
+      ArrowError arrow_error;
+
+      if (ArrowArrayViewSetArray(array_view, probe_array, &arrow_error) != NANOARROW_OK) {
+        throw std::runtime_error("ArrowArrayViewSetArray error " +
+                                 std::string(arrow_error.message));
+      }
+
+      *new_indices_size = use_refiner(self).Refine(
+          array_view, static_cast<gpuspatial::Predicate>(predicate), build_indices,
+          probe_indices, indices_size);
+    });
+  }
+
+  // Returns the message of the last failing call (empty if none). The pointer
+  // refers to storage inside the refiner's private data.
+  static const char* CGetLastError(SedonaSpatialRefiner* self) {
+    auto* private_data = static_cast<private_data_t*>(self->private_data);
+    return private_data->last_error.c_str();
+  }
+
+  // Destroys the refiner and its private data.
+  static void CRelease(SedonaSpatialRefiner* self) {
+    delete static_cast<private_data_t*>(self->private_data);
+    self->private_data = nullptr;
+  }
+
+  // Selects the runtime's CUDA device on the calling thread, then returns the
+  // wrapped refiner.
+  static gpuspatial::SpatialRefiner& use_refiner(SedonaSpatialRefiner* self) {
+    auto* private_data = static_cast<private_data_t*>(self->private_data);
+    auto* r_data = private_data->payload.rdata;
+
+    CUDA_CHECK(cudaSetDevice(r_data->payload.device_id));
+    return *(private_data->payload.refiner);
+  }
+};
+
+// Public C entry point: creates a spatial refiner bound to the runtime referenced
+// by `config` and fills `refiner` with its callbacks.
+// Returns 0 on success, EINVAL on failure (the reason is logged). No C++
+// exception is allowed to propagate to the C caller.
+int GpuSpatialRefinerCreate(SedonaSpatialRefiner* refiner,
+                            const GpuSpatialRefinerConfig* config) {
+  try {
+    GpuSpatialRefinerExporter::Export(config, refiner);
+  } catch (const std::exception& e) {
+    GPUSPATIAL_LOG_ERROR("Failed to create GpuSpatialRefiner: %s", e.what());
+    return EINVAL;
+  } catch (...) {
+    // Anything not derived from std::exception must also stop at the C boundary.
+    GPUSPATIAL_LOG_ERROR("Failed to create GpuSpatialRefiner: %s",
+                         "Unknown internal error");
+    return EINVAL;
+  }
+  return 0;
 }
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc b/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc
new file mode 100644
index 000000000..fdf66e700
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gpuspatial/mem/memory_manager.hpp"
+#include "gpuspatial/utils/logger.hpp"
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(__linux__)
+#include <sys/sysinfo.h>
+#else  // POSIX (BSD, Solaris, etc.)
+#include <unistd.h>
+#endif
+namespace gpuspatial {
+namespace detail {
+// Returns the amount of currently free physical host memory in bytes, using the
+// platform API selected at compile time. Returns 0 when the query fails.
+inline long long get_free_physical_memory() {
+#if defined(_WIN32)
+  // --- Windows ---
+  MEMORYSTATUSEX status;
+  // dwLength must be set before calling GlobalMemoryStatusEx.
+  status.dwLength = sizeof(status);
+  if (GlobalMemoryStatusEx(&status)) {
+    return (long long)status.ullAvailPhys;
+  }
+  return 0;
+
+#elif defined(__linux__)
+  // --- Linux (sysinfo) ---
+  struct sysinfo info;
+  if (sysinfo(&info) == 0) {
+    // freeram is expressed in units of mem_unit bytes.
+    return (long long)info.freeram * (long long)info.mem_unit;
+  }
+  return 0;
+
+#else
+  // --- Generic POSIX ---
+  // _SC_AVPHYS_PAGES: The number of physical memory pages not currently in use.
+  long pages = sysconf(_SC_AVPHYS_PAGES);
+  long page_size = sysconf(_SC_PAGESIZE);
+
+  // sysconf returns -1 on error; treat non-positive values as "unknown".
+  if (pages > 0 && page_size > 0) {
+    return (long long)pages * (long long)page_size;
+  }
+  return 0;
+#endif
+}
+}  // namespace detail
+
+// Returns the process-wide MemoryManager, constructed on first use as a
+// function-local static.
+MemoryManager& MemoryManager::instance() {
+  static MemoryManager instance;
+  return instance;
+}
+
+// Releases the memory resources if Shutdown() was not already called.
+MemoryManager::~MemoryManager() { Shutdown(); }
+
+// Tears down the resources created by Init(). Does nothing when the manager is
+// not initialized, so it may be called repeatedly.
+void MemoryManager::Shutdown() {
+  if (!is_initialized_) {
+    return;
+  }
+
+  // Detach from RMM first, then destroy the adaptor before the resources it wraps.
+  rmm::mr::set_current_device_resource(nullptr);
+  active_resource_.reset();
+  pool_mr_.reset();
+  cuda_mr_.reset();
+  raw_tracker_ptr_ = nullptr;
+  is_initialized_ = false;
+}
+
+// Creates the device memory resource chain and installs it as RMM's current
+// device resource.
+//   use_pool          - wrap the CUDA resource in a pool resource when true.
+//   init_pool_precent - initial pool size as a percentage of free device memory;
+//                       clamped to [0, 100]. Ignored when use_pool is false.
+// A second call while initialized only logs a warning.
+void MemoryManager::Init(bool use_pool, int init_pool_precent) {
+  if (is_initialized_) {
+    GPUSPATIAL_LOG_WARN(
+        "MemoryManager is already initialized. Skipping re-initialization.");
+    return;
+  }
+
+  cuda_mr_ = std::make_unique<CudaMR>();
+  use_pool_ = use_pool;
+
+  if (!use_pool_) {
+    // No pool: track allocations made directly through the CUDA resource.
+    active_resource_ = std::make_unique<CudaTracker>(cuda_mr_.get());
+  } else {
+    const auto clamped_percent = std::min(std::max(init_pool_precent, 0), 100);
+    const auto initial_pool_bytes = rmm::percent_of_free_device_memory(clamped_percent);
+
+    GPUSPATIAL_LOG_INFO("Creating RMM pool memory resource with size %zu MB",
+                        initial_pool_bytes / 1024 / 1024);
+
+    pool_mr_ = std::make_unique<PoolMR>(cuda_mr_.get(), initial_pool_bytes);
+    active_resource_ = std::make_unique<PoolTracker>(pool_mr_.get());
+  }
+
+  // Untyped alias of the tracker, cast back in get_available_device_memory().
+  raw_tracker_ptr_ = active_resource_.get();
+
+  rmm::mr::set_current_device_resource(active_resource_.get());
+  is_initialized_ = true;
+}
+
+// Estimates how many device bytes can still be allocated.
+// Without a pool (or before Init) this is the free device memory reported by RMM.
+// With a pool it is 95% of that free figure minus the bytes currently tracked as
+// allocated, floored at 0.
+// NOTE(review): the previous comment described the 5% margin as being taken from
+// TOTAL capacity, but `.first` of rmm::available_device_memory() is the free
+// amount (`.second` is the total). Confirm which one is intended for pool mode.
+size_t MemoryManager::get_available_device_memory() const {
+  const auto free_bytes = rmm::available_device_memory().first;
+  const bool pool_active = is_initialized_ && use_pool_;
+  if (!pool_active) {
+    return free_bytes;
+  }
+
+  // --- POOL STRATEGY ---
+  auto* pool_tracker = static_cast<PoolTracker*>(raw_tracker_ptr_);
+  const size_t in_use = pool_tracker->get_allocated_bytes();
+
+  // Keep a 5% safety margin below the reported figure.
+  const size_t budget = static_cast<size_t>(free_bytes * 0.95);
+
+  if (in_use >= budget) {
+    return 0;
+  }
+  return budget - in_use;
+}
+
+// Returns the free physical host memory in bytes as reported by
+// detail::get_free_physical_memory() (0 when the platform query fails).
+size_t MemoryManager::get_available_host_memory() {
+  return detail::get_free_physical_memory();
+}
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
index da978012c..db081da22 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
@@ -14,19 +14,21 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#include "gpuspatial/index/detail/launch_parameters.h"
-#include "gpuspatial/index/geometry_grouper.hpp"
-#include "gpuspatial/index/relate_engine.cuh"
-#include "gpuspatial/relate/predicate.cuh"
-#include "gpuspatial/relate/relate.cuh"
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/helpers.h"
-#include "gpuspatial/utils/launcher.h"
+#include "gpuspatial/mem/memory_manager.hpp"
+#include "gpuspatial/relate/predicate.hpp"
+#include "gpuspatial/relate/relate.hpp"
+#include "gpuspatial/relate/relate_engine.cuh"
+#include "gpuspatial/rt/launch_parameters.cuh"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/helpers.cuh"
+#include "gpuspatial/utils/launcher.hpp"
 #include "gpuspatial/utils/logger.hpp"
-#include "gpuspatial/utils/queue.h"
 #include "rt/shaders/shader_id.hpp"
 
+#include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/mr/device/tracking_resource_adaptor.hpp>
 #include "rmm/cuda_stream_view.hpp"
+#include "rmm/device_scalar.hpp"
 #include "rmm/exec_policy.hpp"
 
 #include <thrust/remove.h>
@@ -93,6 +95,92 @@ DEV_HOST_INLINE bool EvaluatePredicate(Predicate p, int32_t im) {
   }
   return false;
 }
+
+// Counts the AABBs needed for the selected polygons when every ring is split
+// into groups of at most `segs_per_aabb` segments.
+//   stream        - CUDA stream used for the temporary buffer, kernel and reduction.
+//   polygons      - polygon array indexed by the values in `polygon_ids`.
+//   polygon_ids   - ids of the polygons to count.
+//   segs_per_aabb - segments covered by one AABB; used as a divisor.
+// Returns the sum over all selected polygons of ceil(ring segments / segs_per_aabb),
+// accumulated in uint32_t.
+template <typename POINT_T, typename INDEX_T>
+uint32_t ComputeNumAabbs(const rmm::cuda_stream_view& stream,
+                         const PolygonArrayView<POINT_T, INDEX_T>& polygons,
+                         ArrayView<uint32_t> polygon_ids, int segs_per_aabb) {
+  auto n_polygons = polygon_ids.size();
+
+  // One counter per selected polygon, filled by the kernel below.
+  rmm::device_uvector<uint32_t> n_aabbs(n_polygons, stream);
+  auto* p_n_aabbs = n_aabbs.data();
+
+  LaunchKernel(stream, [=] __device__() {
+    using WarpReduce = cub::WarpReduce<uint32_t>;
+    // One reduction scratch slot per warp of the block.
+    __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32];
+    auto lane = threadIdx.x % 32;
+    auto warp_id = threadIdx.x / 32;
+    // presumably TID_1D / TOTAL_THREADS_1D are the global thread index and the
+    // total launched thread count -- confirm in the launcher header
+    auto global_warp_id = TID_1D / 32;
+    auto n_warps = TOTAL_THREADS_1D / 32;
+
+    // Each warp handles polygons global_warp_id, global_warp_id + n_warps, ...
+    for (auto i = global_warp_id; i < n_polygons; i += n_warps) {
+      auto id = polygon_ids[i];
+      const auto& polygon = polygons[id];
+      uint32_t total_segs = 0;
+
+      // Lanes split the rings among themselves; each ring contributes
+      // ceil(num_segments / segs_per_aabb).
+      for (auto ring = lane; ring < polygon.num_rings(); ring += 32) {
+        total_segs +=
+            (polygon.get_ring(ring).num_segments() + segs_per_aabb - 1) / segs_per_aabb;
+      }
+      total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs);
+      // The warp-wide sum is taken from lane 0.
+      if (lane == 0) {
+        p_n_aabbs[i] = total_segs;
+      }
+    }
+  });
+  return thrust::reduce(rmm::exec_policy_nosync(stream), n_aabbs.begin(), n_aabbs.end());
+}
+
+// Multi-polygon overload: counts the AABBs needed for the selected multi-polygons
+// when every ring of every part is split into groups of at most `segs_per_aabb`
+// segments.
+//   stream            - CUDA stream used for the temporary buffer, kernel and
+//                       reduction.
+//   multi_polygons    - multi-polygon array indexed by the values in
+//                       `multi_polygon_ids`.
+//   multi_polygon_ids - ids of the multi-polygons to count.
+//   segs_per_aabb     - segments covered by one AABB; used as a divisor.
+// Returns the total AABB count, accumulated in uint32_t.
+template <typename POINT_T, typename INDEX_T>
+uint32_t ComputeNumAabbs(const rmm::cuda_stream_view& stream,
+                         const MultiPolygonArrayView<POINT_T, INDEX_T>& multi_polygons,
+                         ArrayView<uint32_t> multi_polygon_ids, int segs_per_aabb) {
+  auto n_multi_polygons = multi_polygon_ids.size();
+  // One counter per selected multi-polygon, filled by the kernel below.
+  rmm::device_uvector<uint32_t> n_aabbs(n_multi_polygons, stream);
+  auto* p_n_aabbs = n_aabbs.data();
+
+  LaunchKernel(stream, [=] __device__() {
+    using WarpReduce = cub::WarpReduce<uint32_t>;
+    // One reduction scratch slot per warp of the block.
+    __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32];
+    auto lane = threadIdx.x % 32;
+    auto warp_id = threadIdx.x / 32;
+    // presumably TID_1D / TOTAL_THREADS_1D are the global thread index and the
+    // total launched thread count -- confirm in the launcher header
+    auto global_warp_id = TID_1D / 32;
+    auto n_warps = TOTAL_THREADS_1D / 32;
+
+    // Each warp handles multi-polygons global_warp_id, global_warp_id + n_warps, ...
+    for (auto i = global_warp_id; i < n_multi_polygons; i += n_warps) {
+      auto id = multi_polygon_ids[i];
+      const auto& multi_polygon = multi_polygons[id];
+
+      // Only lane 0's copy of this accumulator is meaningful.
+      uint32_t multipoly_aabb_count = 0;
+
+      for (int part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) {
+        auto polygon = multi_polygon.get_polygon(part_idx);
+
+        // Local accumulator for this thread
+        uint32_t thread_aabb_count = 0;
+
+        for (auto ring = lane; ring < polygon.num_rings(); ring += 32) {
+          auto n_segs = polygon.get_ring(ring).num_segments();
+
+          thread_aabb_count += (n_segs + segs_per_aabb - 1) / segs_per_aabb;
+        }
+
+        // Reduce across the warp to get total AABBs for this polygon (part)
+        // NOTE(review): temp_storage[warp_id] is reused on every part iteration
+        // with no barrier in between; check CUB's guidance on reusing WarpReduce
+        // temporary storage.
+        uint32_t part_total = WarpReduce(temp_storage[warp_id]).Sum(thread_aabb_count);
+
+        // Add this part's total to the multi-polygon accumulator
+        if (lane == 0) {
+          multipoly_aabb_count += part_total;
+        }
+      }
+
+      if (lane == 0) {
+        p_n_aabbs[i] = multipoly_aabb_count;
+      }
+    }
+  });
+  return thrust::reduce(rmm::exec_policy_nosync(stream), n_aabbs.begin(), n_aabbs.end());
+}
 }  // namespace detail
 
 template <typename POINT_T, typename INDEX_T>
@@ -102,48 +190,49 @@ RelateEngine<POINT_T, INDEX_T>::RelateEngine(
 
 template <typename POINT_T, typename INDEX_T>
 RelateEngine<POINT_T, INDEX_T>::RelateEngine(
-    const DeviceGeometries<POINT_T, INDEX_T>* geoms1, const details::RTEngine* rt_engine)
+    const DeviceGeometries<POINT_T, INDEX_T>* geoms1, const RTEngine* rt_engine)
     : geoms1_(geoms1), rt_engine_(rt_engine) {}
 
 template <typename POINT_T, typename INDEX_T>
 void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     const rmm::cuda_stream_view& stream, const DeviceGeometries<POINT_T, INDEX_T>& geoms2,
-    Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
+    Predicate predicate, rmm::device_uvector<INDEX_T>& ids1,
+    rmm::device_uvector<INDEX_T>& ids2) {
   switch (geoms2.get_geometry_type()) {
     case GeometryType::kPoint: {
       using geom2_array_view_t = PointArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms2.template GetGeometryArrayView<geom2_array_view_t>(),
-               predicate, ids);
+               predicate, ids1, ids2);
       break;
     }
     case GeometryType::kMultiPoint: {
       using geom2_array_view_t = MultiPointArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms2.template GetGeometryArrayView<geom2_array_view_t>(),
-               predicate, ids);
+               predicate, ids1, ids2);
       break;
     }
     case GeometryType::kLineString: {
       using geom2_array_view_t = LineStringArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms2.template GetGeometryArrayView<geom2_array_view_t>(),
-               predicate, ids);
+               predicate, ids1, ids2);
       break;
     }
     case GeometryType::kMultiLineString: {
       using geom2_array_view_t = MultiLineStringArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms2.template GetGeometryArrayView<geom2_array_view_t>(),
-               predicate, ids);
+               predicate, ids1, ids2);
       break;
     }
     case GeometryType::kPolygon: {
       using geom2_array_view_t = PolygonArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms2.template GetGeometryArrayView<geom2_array_view_t>(),
-               predicate, ids);
+               predicate, ids1, ids2);
       break;
     }
     case GeometryType::kMultiPolygon: {
       using geom2_array_view_t = MultiPolygonArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms2.template GetGeometryArrayView<geom2_array_view_t>(),
-               predicate, ids);
+               predicate, ids1, ids2);
       break;
     }
     default:
@@ -153,44 +242,46 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
 
 template <typename POINT_T, typename INDEX_T>
 template <typename GEOM2_ARRAY_VIEW_T>
-void RelateEngine<POINT_T, INDEX_T>::Evaluate(
-    const rmm::cuda_stream_view& stream, const GEOM2_ARRAY_VIEW_T& geom_array2,
-    Predicate predicate, Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
+void RelateEngine<POINT_T, INDEX_T>::Evaluate(const rmm::cuda_stream_view& stream,
+                                              const GEOM2_ARRAY_VIEW_T& geom_array2,
+                                              Predicate predicate,
+                                              rmm::device_uvector<INDEX_T>& ids1,
+                                              rmm::device_uvector<INDEX_T>& ids2) {
   switch (geoms1_->get_geometry_type()) {
     case GeometryType::kPoint: {
       using geom1_array_view_t = PointArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms1_->template GetGeometryArrayView<geom1_array_view_t>(),
-               geom_array2, predicate, ids);
+               geom_array2, predicate, ids1, ids2);
       break;
     }
     case GeometryType::kMultiPoint: {
       using geom1_array_view_t = MultiPointArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms1_->template GetGeometryArrayView<geom1_array_view_t>(),
-               geom_array2, predicate, ids);
+               geom_array2, predicate, ids1, ids2);
       break;
     }
     case GeometryType::kLineString: {
       using geom1_array_view_t = LineStringArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms1_->template GetGeometryArrayView<geom1_array_view_t>(),
-               geom_array2, predicate, ids);
+               geom_array2, predicate, ids1, ids2);
       break;
     }
     case GeometryType::kMultiLineString: {
       using geom1_array_view_t = MultiLineStringArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms1_->template GetGeometryArrayView<geom1_array_view_t>(),
-               geom_array2, predicate, ids);
+               geom_array2, predicate, ids1, ids2);
       break;
     }
     case GeometryType::kPolygon: {
       using geom1_array_view_t = PolygonArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms1_->template GetGeometryArrayView<geom1_array_view_t>(),
-               geom_array2, predicate, ids);
+               geom_array2, predicate, ids1, ids2);
       break;
     }
     case GeometryType::kMultiPolygon: {
       using geom1_array_view_t = MultiPolygonArrayView<POINT_T, INDEX_T>;
       Evaluate(stream, geoms1_->template GetGeometryArrayView<geom1_array_view_t>(),
-               geom_array2, predicate, ids);
+               geom_array2, predicate, ids1, ids2);
       break;
     }
     default:
@@ -200,11 +291,14 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
 
 template <typename POINT_T, typename INDEX_T>
 template <typename GEOM1_ARRAY_VIEW_T, typename GEOM2_ARRAY_VIEW_T>
-void RelateEngine<POINT_T, INDEX_T>::Evaluate(
-    const rmm::cuda_stream_view& stream, const GEOM1_ARRAY_VIEW_T& geom_array1,
-    const GEOM2_ARRAY_VIEW_T& geom_array2, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
-  size_t ids_size = ids.size(stream);
+void RelateEngine<POINT_T, INDEX_T>::Evaluate(const rmm::cuda_stream_view& stream,
+                                              const GEOM1_ARRAY_VIEW_T& geom_array1,
+                                              const GEOM2_ARRAY_VIEW_T& geom_array2,
+                                              Predicate predicate,
+                                              rmm::device_uvector<INDEX_T>& ids1,
+                                              rmm::device_uvector<INDEX_T>& ids2) {
+  assert(ids1.size() == ids2.size());
+  size_t ids_size = ids1.size();
   GPUSPATIAL_LOG_INFO(
       "Refine with generic kernel, geom1 %zu, geom2 %zu, predicate %s, result size %zu",
       geom_array1.size(), geom_array2.size(), PredicateToString(predicate), ids_size);
@@ -219,20 +313,24 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     GPUSPATIAL_LOG_WARN(
         "Evaluate Polygon-Polygon relate with the GPU, which is not well-tested and the performance may be poor.");
   }
-  auto end = thrust::remove_if(
-      rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size,
-      [=] __device__(const thrust::pair<uint32_t, uint32_t>& pair) {
-        auto geom1_id = pair.first;
-        auto geom2_id = pair.second;
-        const auto& geom1 = geom_array1[geom1_id];
-        const auto& geom2 = geom_array2[geom2_id];
-
-        auto IM = relate(geom1, geom2);
-        return !detail::EvaluatePredicate(predicate, IM);
-      });
-  size_t new_size = thrust::distance(ids.data(), end);
-  GPUSPATIAL_LOG_INFO("Refined, result size %zu", new_size);
-  ids.set_size(stream, new_size);
+  auto zip_begin =
+      thrust::make_zip_iterator(thrust::make_tuple(ids1.begin(), ids2.begin()));
+  auto zip_end = thrust::make_zip_iterator(thrust::make_tuple(ids1.end(), ids2.end()));
+
+  auto end =
+      thrust::remove_if(rmm::exec_policy_nosync(stream), zip_begin, zip_end,
+                        [=] __device__(const thrust::tuple<INDEX_T, INDEX_T>& tuple) {
+                          auto geom1_id = thrust::get<0>(tuple);
+                          auto geom2_id = thrust::get<1>(tuple);
+                          const auto& geom1 = geom_array1[geom1_id];
+                          const auto& geom2 = geom_array2[geom2_id];
+
+                          auto IM = relate(geom1, geom2);
+                          return !detail::EvaluatePredicate(predicate, IM);
+                        });
+  size_t new_size = thrust::distance(zip_begin, end);
+  ids1.resize(new_size, stream);
+  ids2.resize(new_size, stream);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -240,9 +338,9 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     const rmm::cuda_stream_view& stream,
     const PointArrayView<POINT_T, INDEX_T>& geom_array1,
     const PolygonArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
+    rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2) {
   EvaluateImpl(stream, geom_array1, MultiPointArrayView<POINT_T, INDEX_T>(), geom_array2,
-               predicate, ids, false /*inverse IM*/);
+               predicate, ids1, ids2, false /*inverse IM*/);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -250,9 +348,9 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     const rmm::cuda_stream_view& stream,
     const MultiPointArrayView<POINT_T, INDEX_T>& geom_array1,
     const PolygonArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
+    rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2) {
   EvaluateImpl(stream, PointArrayView<POINT_T, INDEX_T>(), geom_array1, geom_array2,
-               predicate, ids, false /*inverse IM*/);
+               predicate, ids1, ids2, false /*inverse IM*/);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -260,19 +358,9 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     const rmm::cuda_stream_view& stream,
     const PolygonArrayView<POINT_T, INDEX_T>& geom_array1,
     const PointArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
-  thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(),
-                   ids.data() + ids.size(stream),
-                   [] __device__(thrust::pair<uint32_t, uint32_t> & pair) {
-                     thrust::swap(pair.first, pair.second);
-                   });
+    rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2) {
   EvaluateImpl(stream, geom_array2, MultiPointArrayView<POINT_T, INDEX_T>(), geom_array1,
-               predicate, ids, true /*inverse IM*/);
-  thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(),
-                   ids.data() + ids.size(stream),
-                   [] __device__(thrust::pair<uint32_t, uint32_t> & pair) {
-                     thrust::swap(pair.first, pair.second);
-                   });
+               predicate, ids2, ids1, true /*inverse IM*/);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -280,19 +368,9 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     const rmm::cuda_stream_view& stream,
     const PolygonArrayView<POINT_T, INDEX_T>& geom_array1,
     const MultiPointArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
-  thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(),
-                   ids.data() + ids.size(stream),
-                   [] __device__(thrust::pair<uint32_t, uint32_t> & pair) {
-                     thrust::swap(pair.first, pair.second);
-                   });
+    rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2) {
   EvaluateImpl(stream, PointArrayView<POINT_T, INDEX_T>(), geom_array2, geom_array1,
-               predicate, ids, true /*inverse IM*/);
-  thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(),
-                   ids.data() + ids.size(stream),
-                   [] __device__(thrust::pair<uint32_t, uint32_t> & pair) {
-                     thrust::swap(pair.first, pair.second);
-                   });
+               predicate, ids2, ids1, true /*inverse IM*/);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -300,9 +378,9 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     const rmm::cuda_stream_view& stream,
     const PointArrayView<POINT_T, INDEX_T>& geom_array1,
     const MultiPolygonArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
+    rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2) {
   EvaluateImpl(stream, geom_array1, MultiPointArrayView<POINT_T, INDEX_T>(), geom_array2,
-               predicate, ids, false /*inverse IM*/);
+               predicate, ids1, ids2, false /*inverse IM*/);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -310,9 +388,9 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     const rmm::cuda_stream_view& stream,
     const MultiPointArrayView<POINT_T, INDEX_T>& geom_array1,
     const MultiPolygonArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
+    rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2) {
   EvaluateImpl(stream, PointArrayView<POINT_T, INDEX_T>(), geom_array1, geom_array2,
-               predicate, ids, false /*inverse IM*/);
+               predicate, ids1, ids2, false /*inverse IM*/);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -320,19 +398,9 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     const rmm::cuda_stream_view& stream,
     const MultiPolygonArrayView<POINT_T, INDEX_T>& geom_array1,
     const PointArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
-  thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(),
-                   ids.data() + ids.size(stream),
-                   [] __device__(thrust::pair<uint32_t, uint32_t> & pair) {
-                     thrust::swap(pair.first, pair.second);
-                   });
+    rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2) {
   EvaluateImpl(stream, geom_array2, MultiPointArrayView<POINT_T, INDEX_T>(), geom_array1,
-               predicate, ids, true /*inverse IM*/);
-  thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(),
-                   ids.data() + ids.size(stream),
-                   [] __device__(thrust::pair<uint32_t, uint32_t> & pair) {
-                     thrust::swap(pair.first, pair.second);
-                   });
+               predicate, ids2, ids1, true /*inverse IM*/);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -340,19 +408,9 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(
     const rmm::cuda_stream_view& stream,
     const MultiPolygonArrayView<POINT_T, INDEX_T>& geom_array1,
     const MultiPointArrayView<POINT_T, INDEX_T>& geom_array2, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids) {
-  thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(),
-                   ids.data() + ids.size(stream),
-                   [] __device__(thrust::pair<uint32_t, uint32_t> & pair) {
-                     thrust::swap(pair.first, pair.second);
-                   });
+    rmm::device_uvector<INDEX_T>& ids1, rmm::device_uvector<INDEX_T>& ids2) {
   EvaluateImpl(stream, PointArrayView<POINT_T, INDEX_T>(), geom_array2, geom_array1,
-               predicate, ids, true /*inverse IM*/);
-  thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(),
-                   ids.data() + ids.size(stream),
-                   [] __device__(thrust::pair<uint32_t, uint32_t> & pair) {
-                     thrust::swap(pair.first, pair.second);
-                   });
+               predicate, ids2, ids1, true /*inverse IM*/);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -361,10 +419,15 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
     const PointArrayView<POINT_T, INDEX_T>& point_array,
     const MultiPointArrayView<POINT_T, INDEX_T>& multi_point_array,
     const PolygonArrayView<POINT_T, INDEX_T>& poly_array, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids, bool inverse) {
+    rmm::device_uvector<INDEX_T>& point_ids, rmm::device_uvector<INDEX_T>& poly_ids,
+    bool inverse) {
+  // Cast short rays from each point to perform a precise point-in-polygon test.
+  // Reference: Geng L, Lee R, Zhang X. "RayJoin: Fast and precise spatial join."
+  // In Proceedings of the 38th ACM International Conference on Supercomputing, 2024.
   using params_t = detail::LaunchParamsPolygonPointQuery<POINT_T, INDEX_T>;
-
-  size_t ids_size = ids.size(stream);
+  assert(point_array.empty() || multi_point_array.empty());
+  assert(point_ids.size() == poly_ids.size());
+  size_t ids_size = point_ids.size();
   GPUSPATIAL_LOG_INFO(
       "Refine with ray-tracing, (multi-)point %zu, polygon %zu, predicate %s, result size %zu, inverse %d",
       !point_array.empty() ? point_array.size() : multi_point_array.size(),
@@ -373,79 +436,88 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
   if (ids_size == 0) {
     return;
   }
-  // pair.first is point id; pair.second is polygon id
-  // Sort by multi polygon id
-  thrust::sort(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size,
-               [] __device__(const thrust::pair<uint32_t, uint32_t>& pair1,
-                             const thrust::pair<uint32_t, uint32_t>& pair2) {
-                 return pair1.second < pair2.second;
+
+  auto zip_begin =
+      thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin(), poly_ids.begin()));
+  auto zip_end =
+      thrust::make_zip_iterator(thrust::make_tuple(point_ids.end(), poly_ids.end()));
+  auto invalid_tuple = thrust::make_tuple(std::numeric_limits<INDEX_T>::max(),
+                                          std::numeric_limits<INDEX_T>::max());
+
+  // Sort by polygon id
+  thrust::sort(rmm::exec_policy_nosync(stream), zip_begin, zip_end,
+               [] __device__(const thrust::tuple<INDEX_T, INDEX_T>& tu1,
+                             const thrust::tuple<INDEX_T, INDEX_T>& tu2) {
+                 return thrust::get<1>(tu1) < thrust::get<1>(tu2);
                });
 
-  rmm::device_uvector<uint32_t> poly_ids(ids_size, stream);
+  rmm::device_uvector<INDEX_T> uniq_poly_ids(ids_size, stream);
 
-  thrust::transform(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size,
-                    poly_ids.data(),
-                    [] __device__(const thrust::pair<uint32_t, uint32_t>& pair) {
-                      return pair.second;
-                    });
-  auto poly_ids_end =
-      thrust::unique(rmm::exec_policy_nosync(stream), poly_ids.begin(), poly_ids.end());
-  poly_ids.resize(thrust::distance(poly_ids.begin(), poly_ids_end), stream);
-  poly_ids.shrink_to_fit(stream);
+  thrust::copy(rmm::exec_policy_nosync(stream), poly_ids.begin(), poly_ids.end(),
+               uniq_poly_ids.begin());
 
-  auto bvh_bytes = EstimateBVHSize(stream, poly_array, ArrayView<uint32_t>(poly_ids));
-  size_t avail_bytes = rmm::available_device_memory().first * config_.memory_quota;
+  // Collect unique polygon ids to estimate total BVH memory usage
+  auto uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream),
+                                          uniq_poly_ids.begin(), uniq_poly_ids.end());
+  uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(), uniq_poly_ids_end),
+                       stream);
+  uniq_poly_ids.shrink_to_fit(stream);
+
+  auto bvh_bytes = EstimateBVHSize(stream, poly_array, ArrayView<uint32_t>(uniq_poly_ids),
+                                   config_.segs_per_aabb);
+  size_t avail_bytes =
+      MemoryManager::instance().get_available_device_memory() * config_.memory_quota;
   auto n_batches = bvh_bytes / avail_bytes + 1;
   auto batch_size = (ids_size + n_batches - 1) / n_batches;
-  auto invalid_pair = thrust::make_pair(std::numeric_limits<uint32_t>::max(),
-                                        std::numeric_limits<uint32_t>::max());
 
   GPUSPATIAL_LOG_INFO(
       "Unique polygons %zu, memory quota %zu MB, estimated BVH size %zu MB",
-      poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024));
+      uniq_poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024));
 
   for (int batch = 0; batch < n_batches; batch++) {
     auto ids_begin = batch * batch_size;
     auto ids_end = std::min(ids_begin + batch_size, ids_size);
     auto ids_size_batch = ids_end - ids_begin;
 
-    poly_ids.resize(ids_size_batch, stream);
-    thrust::transform(rmm::exec_policy_nosync(stream), ids.data() + ids_begin,
-                      ids.data() + ids_end, poly_ids.data(),
-                      [] __device__(const thrust::pair<uint32_t, uint32_t>& pair) {
-                        return pair.second;
-                      });
+    // Extract unique polygon IDs in this batch
+    uniq_poly_ids.resize(ids_size_batch, stream);
+    thrust::copy(rmm::exec_policy_nosync(stream), poly_ids.begin() + ids_begin,
+                 poly_ids.begin() + ids_end, uniq_poly_ids.begin());
 
-    // ids is sorted
-    poly_ids_end =
-        thrust::unique(rmm::exec_policy_nosync(stream), poly_ids.begin(), poly_ids.end());
+    // poly ids are sorted
+    uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream),
+                                       uniq_poly_ids.begin(), uniq_poly_ids.end());
 
-    poly_ids.resize(thrust::distance(poly_ids.begin(), poly_ids_end), stream);
-    poly_ids.shrink_to_fit(stream);
+    uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(), uniq_poly_ids_end),
+                         stream);
+    uniq_poly_ids.shrink_to_fit(stream);
 
     rmm::device_uvector<int> IMs(ids_size_batch, stream);
-    rmm::device_uvector<INDEX_T> seg_begins(0, stream);
     rmm::device_uvector<PointLocation> locations(ids_size_batch, stream);
     rmm::device_buffer bvh_buffer(0, stream);
     rmm::device_uvector<INDEX_T> aabb_poly_ids(0, stream), aabb_ring_ids(0, stream);
+    rmm::device_uvector<thrust::pair<INDEX_T, INDEX_T>> aabb_vertex_offsets(0, stream);
 
     // aabb id -> vertex begin[polygon] + ith point in this polygon
-    auto handle = BuildBVH(stream, poly_array, ArrayView<INDEX_T>(poly_ids), seg_begins,
-                           bvh_buffer, aabb_poly_ids, aabb_ring_ids);
+    auto handle = BuildBVH(stream, poly_array, ArrayView<INDEX_T>(uniq_poly_ids),
+                           config_.segs_per_aabb, bvh_buffer, aabb_poly_ids,
+                           aabb_ring_ids, aabb_vertex_offsets);
 
     params_t params;
 
     params.points = point_array;
     params.multi_points = multi_point_array;
     params.polygons = poly_array;
-    params.polygon_ids = ArrayView<INDEX_T>(poly_ids);
-    params.ids = ArrayView<thrust::pair<uint32_t, uint32_t>>(ids.data() + ids_begin,
-                                                             ids_size_batch);
-    params.seg_begins = ArrayView<INDEX_T>(seg_begins);
+    params.uniq_polygon_ids = ArrayView<INDEX_T>(uniq_poly_ids);
+    params.query_point_ids = point_ids.data() + ids_begin;
+    params.query_polygon_ids = poly_ids.data() + ids_begin;
+    params.query_size = ids_size_batch;
     params.IMs = ArrayView<int>(IMs);
     params.handle = handle;
     params.aabb_poly_ids = ArrayView<INDEX_T>(aabb_poly_ids);
     params.aabb_ring_ids = ArrayView<INDEX_T>(aabb_ring_ids);
+    params.aabb_vertex_offsets =
+        ArrayView<thrust::pair<INDEX_T, INDEX_T>>(aabb_vertex_offsets);
 
     rmm::device_buffer params_buffer(sizeof(params_t), stream);
 
@@ -457,34 +529,32 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
         dim3{static_cast<unsigned int>(ids_size_batch), 1, 1},
         ArrayView<char>((char*)params_buffer.data(), params_buffer.size()));
 
-    auto* p_IMs = IMs.data();
-    auto* p_ids = ids.data();
-
-    thrust::transform(rmm::exec_policy_nosync(stream),
-                      thrust::make_counting_iterator<uint32_t>(0),
-                      thrust::make_counting_iterator<uint32_t>(ids_size_batch),
-                      ids.data() + ids_begin, [=] __device__(uint32_t i) {
-                        const auto& pair = p_ids[ids_begin + i];
-
-                        auto IM = p_IMs[i];
-                        if (inverse) {
-                          IM = IntersectionMatrix::Transpose(IM);
-                        }
-                        if (detail::EvaluatePredicate(predicate, IM)) {
-                          return pair;
-                        } else {
-                          return invalid_pair;
-                        }
-                      });
+    thrust::transform(
+        rmm::exec_policy_nosync(stream),
+        thrust::make_zip_iterator(thrust::make_tuple(
+            point_ids.begin() + ids_begin, poly_ids.begin() + ids_begin, IMs.begin())),
+        thrust::make_zip_iterator(thrust::make_tuple(
+            point_ids.begin() + ids_end, poly_ids.begin() + ids_end, IMs.end())),
+        thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin() + ids_begin,
+                                                     poly_ids.begin() + ids_begin)),
+        [=] __device__(const thrust::tuple<INDEX_T, INDEX_T, int>& t) {
+          auto res = thrust::make_tuple(thrust::get<0>(t), thrust::get<1>(t));
+          auto IM = thrust::get<2>(t);
+
+          if (inverse) {
+            IM = IntersectionMatrix::Transpose(IM);
+          }
+
+          return detail::EvaluatePredicate(predicate, IM) ? res : invalid_tuple;
+        });
   }
-  auto end = thrust::remove_if(
-      rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size,
-      [=] __device__(const thrust::pair<uint32_t, uint32_t>& pair) {
-        return pair == invalid_pair;
-      });
-  size_t new_size = thrust::distance(ids.data(), end);
-  GPUSPATIAL_LOG_INFO("Refined, result size %zu", new_size);
-  ids.set_size(stream, new_size);
+  auto end = thrust::remove_if(rmm::exec_policy_nosync(stream), zip_begin, zip_end,
+                               [=] __device__(const thrust::tuple<INDEX_T, INDEX_T>& tu) {
+                                 return tu == invalid_tuple;
+                               });
+  size_t new_size = thrust::distance(zip_begin, end);
+  point_ids.resize(new_size, stream);
+  poly_ids.resize(new_size, stream);
 }
 
 template <typename POINT_T, typename INDEX_T>
@@ -493,11 +563,12 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
     const PointArrayView<POINT_T, INDEX_T>& point_array,
     const MultiPointArrayView<POINT_T, INDEX_T>& multi_point_array,
     const MultiPolygonArrayView<POINT_T, INDEX_T>& multi_poly_array, Predicate predicate,
-    Queue<thrust::pair<uint32_t, uint32_t>>& ids, bool inverse) {
+    rmm::device_uvector<INDEX_T>& point_ids, rmm::device_uvector<INDEX_T>& multi_poly_ids,
+    bool inverse) {
   using params_t = detail::LaunchParamsPointMultiPolygonQuery<POINT_T, INDEX_T>;
-
   assert(point_array.empty() || multi_point_array.empty());
-  size_t ids_size = ids.size(stream);
+  assert(point_ids.size() == multi_poly_ids.size());
+  size_t ids_size = point_ids.size();
   GPUSPATIAL_LOG_INFO(
       "Refine with ray-tracing, (multi-)point %zu, multi-polygon %zu, predicate %s, result size %zu, inverse %d",
       !point_array.empty() ? point_array.size() : multi_point_array.size(),
@@ -506,37 +577,44 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
   if (ids_size == 0) {
     return;
   }
-  // pair.first is point id; pair.second is multi polygon id
-  // Sort by multi polygon id
-  thrust::sort(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size,
-               [] __device__(const thrust::pair<uint32_t, uint32_t>& pair1,
-                             const thrust::pair<uint32_t, uint32_t>& pair2) {
-                 return pair1.second < pair2.second;
+  auto zip_begin = thrust::make_zip_iterator(
+      thrust::make_tuple(point_ids.begin(), multi_poly_ids.begin()));
+  auto zip_end = thrust::make_zip_iterator(
+      thrust::make_tuple(point_ids.end(), multi_poly_ids.end()));
+  auto invalid_tuple = thrust::make_tuple(std::numeric_limits<INDEX_T>::max(),
+                                          std::numeric_limits<INDEX_T>::max());
+
+  // Sort by multi-polygon id
+  thrust::sort(rmm::exec_policy_nosync(stream), zip_begin, zip_end,
+               [] __device__(const thrust::tuple<INDEX_T, INDEX_T>& tu1,
+                             const thrust::tuple<INDEX_T, INDEX_T>& tu2) {
+                 return thrust::get<1>(tu1) < thrust::get<1>(tu2);
                });
 
-  rmm::device_uvector<uint32_t> multi_poly_ids(ids_size, stream);
+  rmm::device_uvector<uint32_t> uniq_multi_poly_ids(ids_size, stream);
 
-  thrust::transform(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size,
-                    multi_poly_ids.data(),
-                    [] __device__(const thrust::pair<uint32_t, uint32_t>& pair) {
-                      return pair.second;
-                    });
-  auto multi_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream),
-                                           multi_poly_ids.begin(), multi_poly_ids.end());
-  multi_poly_ids.resize(thrust::distance(multi_poly_ids.begin(), multi_poly_ids_end),
-                        stream);
-  multi_poly_ids.shrink_to_fit(stream);
+  thrust::copy(rmm::exec_policy_nosync(stream), multi_poly_ids.begin(),
+               multi_poly_ids.end(), uniq_multi_poly_ids.begin());
+
+  // Collect unique multi-polygon ids to estimate total BVH memory usage
+  auto uniq_multi_poly_ids_end =
+      thrust::unique(rmm::exec_policy_nosync(stream), uniq_multi_poly_ids.begin(),
+                     uniq_multi_poly_ids.end());
+  uniq_multi_poly_ids.resize(
+      thrust::distance(uniq_multi_poly_ids.begin(), uniq_multi_poly_ids_end), stream);
+  uniq_multi_poly_ids.shrink_to_fit(stream);
 
   auto bvh_bytes =
-      EstimateBVHSize(stream, multi_poly_array, ArrayView<uint32_t>(multi_poly_ids));
-  size_t avail_bytes = rmm::available_device_memory().first * config_.memory_quota;
+      EstimateBVHSize(stream, multi_poly_array, ArrayView<uint32_t>(uniq_multi_poly_ids),
+                      config_.segs_per_aabb);
+  size_t avail_bytes =
+      MemoryManager::instance().get_available_device_memory() * config_.memory_quota;
   auto n_batches = bvh_bytes / avail_bytes + 1;
   auto batch_size = (ids_size + n_batches - 1) / n_batches;
-  auto invalid_pair = thrust::make_pair(std::numeric_limits<uint32_t>::max(),
-                                        std::numeric_limits<uint32_t>::max());
+
   GPUSPATIAL_LOG_INFO(
       "Unique multi-polygons %zu, memory quota %zu MB, estimated BVH size %zu MB",
-      multi_poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024));
+      uniq_multi_poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024));
 
   for (int batch = 0; batch < n_batches; batch++) {
     auto ids_begin = batch * batch_size;
@@ -544,47 +622,48 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
     auto ids_size_batch = ids_end - ids_begin;
 
     // Extract multi polygon IDs in this batch
-    multi_poly_ids.resize(ids_size_batch, stream);
+    uniq_multi_poly_ids.resize(ids_size_batch, stream);
 
-    thrust::transform(rmm::exec_policy_nosync(stream), ids.data() + ids_begin,
-                      ids.data() + ids_end, multi_poly_ids.data(),
-                      [] __device__(const thrust::pair<uint32_t, uint32_t>& pair) {
-                        return pair.second;
-                      });
+    thrust::copy(rmm::exec_policy_nosync(stream), multi_poly_ids.begin() + ids_begin,
+                 multi_poly_ids.begin() + ids_end, uniq_multi_poly_ids.begin());
 
     // multi polygon ids have been sorted before
-    multi_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream),
-                                        multi_poly_ids.begin(), multi_poly_ids.end());
-    multi_poly_ids.resize(thrust::distance(multi_poly_ids.begin(), multi_poly_ids_end),
-                          stream);
-    multi_poly_ids.shrink_to_fit(stream);
+    uniq_multi_poly_ids_end =
+        thrust::unique(rmm::exec_policy_nosync(stream), uniq_multi_poly_ids.begin(),
+                       uniq_multi_poly_ids.end());
+    uniq_multi_poly_ids.resize(
+        thrust::distance(uniq_multi_poly_ids.begin(), uniq_multi_poly_ids_end), stream);
+    uniq_multi_poly_ids.shrink_to_fit(stream);
 
     rmm::device_uvector<int> IMs(ids_size_batch, stream);
-    rmm::device_uvector<INDEX_T> seg_begins(0, stream);
-    rmm::device_uvector<INDEX_T> uniq_part_begins(0, stream);
     rmm::device_buffer bvh_buffer(0, stream);
     rmm::device_uvector<INDEX_T> aabb_multi_poly_ids(0, stream), aabb_part_ids(0, stream),
         aabb_ring_ids(0, stream);
+    rmm::device_uvector<thrust::pair<INDEX_T, INDEX_T>> aabb_vertex_offsets(0, stream);
+    rmm::device_uvector<INDEX_T> uniq_part_begins(0, stream);
 
-    auto handle = BuildBVH(stream, multi_poly_array, ArrayView<INDEX_T>(multi_poly_ids),
-                           seg_begins, uniq_part_begins, bvh_buffer, aabb_multi_poly_ids,
-                           aabb_part_ids, aabb_ring_ids);
+    auto handle =
+        BuildBVH(stream, multi_poly_array, ArrayView<INDEX_T>(uniq_multi_poly_ids),
+                 config_.segs_per_aabb, bvh_buffer, aabb_multi_poly_ids, aabb_part_ids,
+                 aabb_ring_ids, aabb_vertex_offsets, uniq_part_begins);
 
     params_t params;
 
     params.points = point_array;
     params.multi_points = multi_point_array;
     params.multi_polygons = multi_poly_array;
-    params.multi_polygon_ids = ArrayView<INDEX_T>(multi_poly_ids);
-    params.ids = ArrayView<thrust::pair<uint32_t, uint32_t>>(ids.data() + ids_begin,
-                                                             ids_size_batch);
-    params.seg_begins = ArrayView<INDEX_T>(seg_begins);
+    params.uniq_multi_polygon_ids = ArrayView<INDEX_T>(uniq_multi_poly_ids);
+    params.query_point_ids = point_ids.data() + ids_begin;
+    params.query_multi_polygon_ids = multi_poly_ids.data() + ids_begin;
+    params.query_size = ids_size_batch;
     params.uniq_part_begins = ArrayView<INDEX_T>(uniq_part_begins);
     params.IMs = ArrayView<int>(IMs);
     params.handle = handle;
     params.aabb_multi_poly_ids = ArrayView<INDEX_T>(aabb_multi_poly_ids);
     params.aabb_part_ids = ArrayView<INDEX_T>(aabb_part_ids);
     params.aabb_ring_ids = ArrayView<INDEX_T>(aabb_ring_ids);
+    params.aabb_vertex_offsets =
+        ArrayView<thrust::pair<INDEX_T, INDEX_T>>(aabb_vertex_offsets);
 
     rmm::device_buffer params_buffer(sizeof(params_t), stream);
 
@@ -596,166 +675,90 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
         dim3{static_cast<unsigned int>(ids_size_batch), 1, 1},
         ArrayView<char>((char*)params_buffer.data(), params_buffer.size()));
 
-    auto* p_IMs = IMs.data();
-    auto* p_ids = ids.data();
-
-    thrust::transform(rmm::exec_policy_nosync(stream),
-                      thrust::make_counting_iterator<uint32_t>(0),
-                      thrust::make_counting_iterator<uint32_t>(ids_size_batch),
-                      ids.data() + ids_begin, [=] __device__(uint32_t i) {
-                        const auto& pair = p_ids[ids_begin + i];
-
-                        auto IM = p_IMs[i];
-                        if (inverse) {
-                          IM = IntersectionMatrix::Transpose(IM);
-                        }
-                        if (detail::EvaluatePredicate(predicate, IM)) {
-                          return pair;
-                        } else {
-                          return invalid_pair;
-                        }
-                      });
+    thrust::transform(
+        rmm::exec_policy_nosync(stream),
+        thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin() + ids_begin,
+                                                     multi_poly_ids.begin() + ids_begin,
+                                                     IMs.begin())),
+        thrust::make_zip_iterator(thrust::make_tuple(
+            point_ids.begin() + ids_end, multi_poly_ids.begin() + ids_end, IMs.end())),
+        thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin() + ids_begin,
+                                                     multi_poly_ids.begin() + ids_begin)),
+        [=] __device__(const thrust::tuple<INDEX_T, INDEX_T, int>& t) {
+          auto res = thrust::make_tuple(thrust::get<0>(t), thrust::get<1>(t));
+          auto IM = thrust::get<2>(t);
+
+          if (inverse) {
+            IM = IntersectionMatrix::Transpose(IM);
+          }
+
+          return detail::EvaluatePredicate(predicate, IM) ? res : invalid_tuple;
+        });
   }
-  auto end = thrust::remove_if(
-      rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size,
-      [=] __device__(const thrust::pair<uint32_t, uint32_t>& pair) {
-        return pair == invalid_pair;
-      });
-  size_t new_size = thrust::distance(ids.data(), end);
-  GPUSPATIAL_LOG_INFO("Refined, result size %zu", new_size);
-  ids.set_size(stream, new_size);
+  auto end = thrust::remove_if(rmm::exec_policy_nosync(stream), zip_begin, zip_end,
+                               [=] __device__(const thrust::tuple<INDEX_T, INDEX_T>& tu) {
+                                 return tu == invalid_tuple;
+                               });
+  size_t new_size = thrust::distance(zip_begin, end);
+  point_ids.resize(new_size, stream);
+  multi_poly_ids.resize(new_size, stream);
 }
 
 template <typename POINT_T, typename INDEX_T>
 size_t RelateEngine<POINT_T, INDEX_T>::EstimateBVHSize(
     const rmm::cuda_stream_view& stream, const PolygonArrayView<POINT_T, INDEX_T>& polys,
-    ArrayView<uint32_t> poly_ids) {
-  auto n_polygons = poly_ids.size();
-  rmm::device_uvector<uint32_t> n_segs(n_polygons, stream);
-  auto* p_nsegs = n_segs.data();
-
-  LaunchKernel(stream, [=] __device__() {
-    using WarpReduce = cub::WarpReduce<uint32_t>;
-    __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32];
-    auto lane = threadIdx.x % 32;
-    auto warp_id = threadIdx.x / 32;
-    auto global_warp_id = TID_1D / 32;
-    auto n_warps = TOTAL_THREADS_1D / 32;
-
-    for (auto i = global_warp_id; i < n_polygons; i += n_warps) {
-      auto id = poly_ids[i];
-      const auto& polygon = polys[id];
-      uint32_t total_segs = 0;
-
-      for (auto ring = lane; ring < polygon.num_rings(); ring += 32) {
-        total_segs += polygon.get_ring(ring).num_points();
-      }
-      total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs);
-      if (lane == 0) {
-        p_nsegs[i] = total_segs;
-      }
-    }
-  });
-  auto total_segs =
-      thrust::reduce(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end());
-  if (total_segs == 0) {
+    ArrayView<uint32_t> poly_ids, int segs_per_aabb) {
+  auto num_aabbs = detail::ComputeNumAabbs(stream, polys, poly_ids, segs_per_aabb);
+  if (num_aabbs == 0) {
     return 0;
   }
+
   // temporary but still needed to consider this part of memory
-  auto aabb_size = total_segs * sizeof(OptixAabb);
+  auto aabb_size = num_aabbs * sizeof(OptixAabb);
   auto bvh_bytes = rt_engine_->EstimateMemoryUsageForAABB(
-      total_segs, config_.bvh_fast_build, config_.bvh_fast_compact);
-  // BVH size and aabb_poly_ids, aabb_ring_ids
-  return aabb_size + bvh_bytes + 2 * sizeof(INDEX_T) * total_segs;
+      num_aabbs, config_.bvh_fast_build, config_.bvh_compact);
+  // BVH size, aabb_poly_ids, aabb_ring_ids and aabb_vertex_offsets (a pair: 2x INDEX_T)
+  return aabb_size + bvh_bytes + 4 * sizeof(INDEX_T) * num_aabbs;
 }
 
 template <typename POINT_T, typename INDEX_T>
 size_t RelateEngine<POINT_T, INDEX_T>::EstimateBVHSize(
     const rmm::cuda_stream_view& stream,
     const MultiPolygonArrayView<POINT_T, INDEX_T>& multi_polys,
-    ArrayView<uint32_t> multi_poly_ids) {
-  auto n_mult_polygons = multi_poly_ids.size();
-  rmm::device_uvector<uint32_t> n_segs(n_mult_polygons, stream);
-  auto* p_nsegs = n_segs.data();
-
-  LaunchKernel(stream, [=] __device__() {
-    using WarpReduce = cub::WarpReduce<uint32_t>;
-    __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32];
-    auto lane = threadIdx.x % 32;
-    auto warp_id = threadIdx.x / 32;
-    auto global_warp_id = TID_1D / 32;
-    auto n_warps = TOTAL_THREADS_1D / 32;
-
-    for (auto i = global_warp_id; i < n_mult_polygons; i += n_warps) {
-      auto id = multi_poly_ids[i];
-      const auto& multi_polygon = multi_polys[id];
-      uint32_t total_segs = 0;
+    ArrayView<uint32_t> multi_poly_ids, int segs_per_aabb) {
+  auto num_aabbs =
+      detail::ComputeNumAabbs(stream, multi_polys, multi_poly_ids, segs_per_aabb);
 
-      for (int part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) {
-        auto polygon = multi_polygon.get_polygon(part_idx);
-        for (auto ring = lane; ring < polygon.num_rings(); ring += 32) {
-          total_segs += polygon.get_ring(ring).num_points();
-        }
-      }
-      total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs);
-      if (lane == 0) {
-        p_nsegs[i] = total_segs;
-      }
-    }
-  });
-  auto total_segs =
-      thrust::reduce(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end());
-  if (total_segs == 0) {
-    return 0;
-  }
   // temporary but still needed to consider this part of memory
-  auto aabb_size = total_segs * sizeof(OptixAabb);
+  auto aabb_size = num_aabbs * sizeof(OptixAabb);
   auto bvh_bytes = rt_engine_->EstimateMemoryUsageForAABB(
-      total_segs, config_.bvh_fast_build, config_.bvh_fast_compact);
-  // BVH size and aabb_multi_poly_ids, aabb_part_ids, aabb_ring_ids
-  return aabb_size + bvh_bytes + 3 * sizeof(INDEX_T) * total_segs;
+      num_aabbs, config_.bvh_fast_build, config_.bvh_compact);
+  // BVH size plus per-AABB ids: multi_poly, part, ring, temp seq, vertex offset pair
+  return aabb_size + bvh_bytes + 6 * sizeof(INDEX_T) * num_aabbs;
 }
 
 template <typename POINT_T, typename INDEX_T>
 OptixTraversableHandle RelateEngine<POINT_T, INDEX_T>::BuildBVH(
     const rmm::cuda_stream_view& stream,
     const PolygonArrayView<POINT_T, INDEX_T>& polygons, ArrayView<uint32_t> polygon_ids,
-    rmm::device_uvector<INDEX_T>& seg_begins, rmm::device_buffer& buffer,
+    int segs_per_aabb, rmm::device_buffer& buffer,
     rmm::device_uvector<INDEX_T>& aabb_poly_ids,
-    rmm::device_uvector<INDEX_T>& aabb_ring_ids) {
+    rmm::device_uvector<INDEX_T>& aabb_ring_ids,
+    rmm::device_uvector<thrust::pair<INDEX_T, INDEX_T>>& aabb_vertex_offsets) {
   auto n_polygons = polygon_ids.size();
-  rmm::device_uvector<uint32_t> n_segs(n_polygons, stream);
-
-  // TODO: warp reduce
-  thrust::transform(rmm::exec_policy_nosync(stream), polygon_ids.begin(),
-                    polygon_ids.end(), n_segs.begin(),
-                    [=] __device__(const uint32_t& id) -> uint32_t {
-                      const auto& polygon = polygons[id];
-                      uint32_t total_segs = 0;
-
-                      for (int ring = 0; ring < polygon.num_rings(); ring++) {
-                        total_segs += polygon.get_ring(ring).num_points();
-                      }
-                      return total_segs;
-                    });
-
-  seg_begins = std::move(rmm::device_uvector<INDEX_T>(n_polygons + 1, stream));
-  auto* p_seg_begins = seg_begins.data();
-  seg_begins.set_element_to_zero_async(0, stream);
-
-  thrust::inclusive_scan(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end(),
-                         seg_begins.begin() + 1);
-
-  uint32_t num_aabbs = seg_begins.back_element(stream);
-
+  auto num_aabbs = detail::ComputeNumAabbs(stream, polygons, polygon_ids, segs_per_aabb);
   aabb_poly_ids = std::move(rmm::device_uvector<INDEX_T>(num_aabbs, stream));
   aabb_ring_ids = std::move(rmm::device_uvector<INDEX_T>(num_aabbs, stream));
+  aabb_vertex_offsets =
+      std::move(rmm::device_uvector<thrust::pair<INDEX_T, INDEX_T>>(num_aabbs, stream));
 
-  auto* p_poly_ids = aabb_poly_ids.data();
-  auto* p_ring_ids = aabb_ring_ids.data();
+  auto* p_aabb_poly_ids = aabb_poly_ids.data();
+  auto* p_aabb_ring_ids = aabb_ring_ids.data();
+  auto* p_aabb_vertex_offsets = aabb_vertex_offsets.data();
 
-  rmm::device_uvector<OptixAabb> aabbs(num_aabbs, stream);
-  auto* p_aabbs = aabbs.data();
+  rmm::device_scalar<uint32_t> d_tail(0, stream);
+
+  auto* p_tail = d_tail.data();
 
   LaunchKernel(stream.value(), [=] __device__() {
     auto lane = threadIdx.x % 32;
@@ -763,191 +766,222 @@ OptixTraversableHandle RelateEngine<POINT_T, INDEX_T>::BuildBVH(
     auto n_warps = TOTAL_THREADS_1D / 32;
 
     // each warp takes a polygon
-    // i is the renumbered polygon id starting from 0
     for (auto i = global_warp_id; i < n_polygons; i += n_warps) {
       auto poly_id = polygon_ids[i];
       const auto& polygon = polygons[poly_id];
-      auto tail = p_seg_begins[i];
 
       // entire warp sequentially visit each ring
       for (uint32_t ring_idx = 0; ring_idx < polygon.num_rings(); ring_idx++) {
         auto ring = polygon.get_ring(ring_idx);
-        // this is like a hash function, its okay to overflow
-        OptixAabb aabb;
-        aabb.minZ = aabb.maxZ = i;
-
-        // each lane takes a seg
-        for (auto seg_idx = lane; seg_idx < ring.num_segments(); seg_idx += 32) {
-          const auto& seg = ring.get_line_segment(seg_idx);
-          const auto& p1 = seg.get_p1();
-          const auto& p2 = seg.get_p2();
-
-          aabb.minX = std::min(p1.x(), p2.x());
-          aabb.maxX = std::max(p1.x(), p2.x());
-          aabb.minY = std::min(p1.y(), p2.y());
-          aabb.maxY = std::max(p1.y(), p2.y());
-
-          if (std::is_same_v<scalar_t, double>) {
-            aabb.minX = next_float_from_double(aabb.minX, -1, 2);
-            aabb.maxX = next_float_from_double(aabb.maxX, 1, 2);
-            aabb.minY = next_float_from_double(aabb.minY, -1, 2);
-            aabb.maxY = next_float_from_double(aabb.maxY, 1, 2);
-          }
-          p_aabbs[tail + seg_idx] = aabb;
-          p_poly_ids[tail + seg_idx] = poly_id;
-          p_ring_ids[tail + seg_idx] = ring_idx;
+        auto aabbs_per_ring = (ring.num_segments() + segs_per_aabb - 1) / segs_per_aabb;
+        // e.g., num segs = 3, segs_per_aabb = 2
+        // The first aabb covers seg 0,1, with vertex id (0,1,2)
+        // The second aabb covers seg 2, with vertex id (2,3)
+        // each lane takes an aabb
+        for (auto aabb_idx = lane; aabb_idx < aabbs_per_ring; aabb_idx += 32) {
+          INDEX_T local_vertex_begin = aabb_idx * segs_per_aabb;
+          INDEX_T local_vertex_end =
+              std::min((INDEX_T)(local_vertex_begin + segs_per_aabb),
+                       (INDEX_T)ring.num_segments());
+
+          auto tail = atomicAdd(p_tail, 1);
+
+          assert(tail < num_aabbs);
+          p_aabb_poly_ids[tail] = poly_id;
+          p_aabb_ring_ids[tail] = ring_idx;
+          p_aabb_vertex_offsets[tail] =
+              thrust::make_pair(local_vertex_begin, local_vertex_end);
         }
-        tail += ring.num_segments();
-        // fill a dummy AABB, so we have aabb-vertex one-to-one relationship
-        if (lane == 0) {
-          p_aabbs[tail] = OptixAabb{0, 0, 0, 0, 0, 0};
-        }
-        tail++;
       }
-      assert(p_seg_begins[i + 1] == tail);
     }
   });
+  rmm::device_uvector<OptixAabb> aabbs(num_aabbs, stream);
+
+  // Fill AABBs
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    thrust::make_counting_iterator<uint32_t>(0),
+                    thrust::make_counting_iterator<uint32_t>(num_aabbs), aabbs.begin(),
+                    [=] __device__(const uint32_t& aabb_idx) {
+                      OptixAabb aabb;
+                      // OptixAabb stores floats; a double scalar_t limit would overflow
+                      // when narrowed, so seed the bounds with float limits
+                      aabb.minX = aabb.minY = std::numeric_limits<float>::max();
+                      aabb.maxX = aabb.maxY = std::numeric_limits<float>::lowest();
+
+                      auto poly_id = p_aabb_poly_ids[aabb_idx];
+                      auto ring_id = p_aabb_ring_ids[aabb_idx];
+                      auto vertex_offset_pair = p_aabb_vertex_offsets[aabb_idx];
+                      const auto& polygon = polygons[poly_id];
+                      const auto& ring = polygon.get_ring(ring_id);
+
+                      for (auto vidx = vertex_offset_pair.first;
+                           vidx <= vertex_offset_pair.second; vidx++) {
+                        const auto& v = ring.get_point(vidx);
+                        float x = v.x();
+                        float y = v.y();
+
+                        aabb.minX = fminf(aabb.minX, x);
+                        aabb.maxX = fmaxf(aabb.maxX, x);
+                        aabb.minY = fminf(aabb.minY, y);
+                        aabb.maxY = fmaxf(aabb.maxY, y);
+                      }
+
+                      if (std::is_same_v<scalar_t, double>) {
+                        aabb.minX = next_float_from_double(aabb.minX, -1, 2);
+                        aabb.maxX = next_float_from_double(aabb.maxX, 1, 2);
+                        aabb.minY = next_float_from_double(aabb.minY, -1, 2);
+                        aabb.maxY = next_float_from_double(aabb.maxY, 1, 2);
+                      }
+                      // Using minZ/maxZ to store polygon id for better filtering
+                      // Refer to polygon_point_query.cu
+                      aabb.minZ = aabb.maxZ = poly_id;
+                      return aabb;
+                    });
+
   assert(rt_engine_ != nullptr);
   return rt_engine_->BuildAccelCustom(stream.value(), ArrayView<OptixAabb>(aabbs), buffer,
-                                      config_.bvh_fast_build, config_.bvh_fast_compact);
+                                      config_.bvh_fast_build, config_.bvh_compact);
 }
 
 template <typename POINT_T, typename INDEX_T>
 OptixTraversableHandle RelateEngine<POINT_T, INDEX_T>::BuildBVH(
     const rmm::cuda_stream_view& stream,
     const MultiPolygonArrayView<POINT_T, INDEX_T>& multi_polys,
-    ArrayView<uint32_t> multi_poly_ids, rmm::device_uvector<INDEX_T>& seg_begins,
-    rmm::device_uvector<INDEX_T>& part_begins, rmm::device_buffer& buffer,
+    ArrayView<uint32_t> multi_poly_ids, int segs_per_aabb, rmm::device_buffer& buffer,
     rmm::device_uvector<INDEX_T>& aabb_multi_poly_ids,
     rmm::device_uvector<INDEX_T>& aabb_part_ids,
-    rmm::device_uvector<INDEX_T>& aabb_ring_ids) {
+    rmm::device_uvector<INDEX_T>& aabb_ring_ids,
+    rmm::device_uvector<thrust::pair<INDEX_T, INDEX_T>>& aabb_vertex_offsets,
+    rmm::device_uvector<INDEX_T>& part_begins) {
   auto n_mult_polygons = multi_poly_ids.size();
-  rmm::device_uvector<uint32_t> n_segs(n_mult_polygons, stream);
-  auto* p_nsegs = n_segs.data();
-
-  LaunchKernel(stream, [=] __device__() {
-    using WarpReduce = cub::WarpReduce<uint32_t>;
-    __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32];
-    auto lane = threadIdx.x % 32;
-    auto warp_id = threadIdx.x / 32;
-    auto global_warp_id = TID_1D / 32;
-    auto n_warps = TOTAL_THREADS_1D / 32;
-
-    for (auto i = global_warp_id; i < n_mult_polygons; i += n_warps) {
-      auto id = multi_poly_ids[i];
-      const auto& multi_polygon = multi_polys[id];
-      uint32_t total_segs = 0;
-
-      for (int part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) {
-        auto polygon = multi_polygon.get_polygon(part_idx);
-        for (auto ring = lane; ring < polygon.num_rings(); ring += 32) {
-          total_segs += polygon.get_ring(ring).num_points();
-        }
-      }
-      total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs);
-      if (lane == 0) {
-        p_nsegs[i] = total_segs;
-      }
-    }
-  });
-
-  seg_begins = std::move(rmm::device_uvector<INDEX_T>(n_mult_polygons + 1, stream));
-  auto* p_seg_begins = seg_begins.data();
-  seg_begins.set_element_to_zero_async(0, stream);
-
-  thrust::inclusive_scan(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end(),
-                         seg_begins.begin() + 1);
 
-  // each line seg is corresponding to an AABB and each ring includes an empty AABB
-  uint32_t num_aabbs = seg_begins.back_element(stream);
+  auto num_aabbs =
+      detail::ComputeNumAabbs(stream, multi_polys, multi_poly_ids, segs_per_aabb);
+  if (num_aabbs == 0) {
+    return 0;
+  }
 
   aabb_multi_poly_ids = std::move(rmm::device_uvector<INDEX_T>(num_aabbs, stream));
   aabb_part_ids = std::move(rmm::device_uvector<uint32_t>(num_aabbs, stream));
   aabb_ring_ids = std::move(rmm::device_uvector<uint32_t>(num_aabbs, stream));
+  aabb_vertex_offsets =
+      std::move(rmm::device_uvector<thrust::pair<INDEX_T, INDEX_T>>(num_aabbs, stream));
+  rmm::device_uvector<INDEX_T> aabb_seq_ids(num_aabbs, stream);
 
-  auto* p_multi_poly_ids = aabb_multi_poly_ids.data();
-  auto* p_part_ids = aabb_part_ids.data();
-  auto* p_ring_ids = aabb_ring_ids.data();
-
-  rmm::device_uvector<OptixAabb> aabbs(num_aabbs, stream);
-  auto* p_aabbs = aabbs.data();
-
-  rmm::device_uvector<uint32_t> num_parts(n_mult_polygons, stream);
+  auto* p_aabb_multi_poly_ids = aabb_multi_poly_ids.data();
+  auto* p_aabb_part_ids = aabb_part_ids.data();
+  auto* p_aabb_ring_ids = aabb_ring_ids.data();
+  auto* p_aabb_vertex_offsets = aabb_vertex_offsets.data();
+  auto* p_aabb_seq_ids = aabb_seq_ids.data();
 
-  thrust::transform(rmm::exec_policy_nosync(stream), multi_poly_ids.begin(),
-                    multi_poly_ids.end(), num_parts.begin(), [=] __device__(uint32_t id) {
-                      const auto& multi_polygon = multi_polys[id];
-                      return multi_polygon.num_polygons();
-                    });
+  rmm::device_scalar<uint32_t> d_tail(0, stream);
 
-  part_begins = std::move(rmm::device_uvector<uint32_t>(n_mult_polygons + 1, stream));
-  auto* p_part_begins = part_begins.data();
-  part_begins.set_element_to_zero_async(0, stream);
-  thrust::inclusive_scan(rmm::exec_policy_nosync(stream), num_parts.begin(),
-                         num_parts.end(), part_begins.begin() + 1);
-  num_parts.resize(0, stream);
-  num_parts.shrink_to_fit(stream);
+  auto* p_tail = d_tail.data();
 
   LaunchKernel(stream.value(), [=] __device__() {
     auto lane = threadIdx.x % 32;
     auto global_warp_id = TID_1D / 32;
     auto n_warps = TOTAL_THREADS_1D / 32;
 
-    // each warp takes a multi polygon
-    // i is the renumbered polygon id starting from 0
+    // each warp takes a multi-polygon
     for (auto i = global_warp_id; i < n_mult_polygons; i += n_warps) {
       auto multi_poly_id = multi_poly_ids[i];
       const auto& multi_polygon = multi_polys[multi_poly_id];
-      auto tail = p_seg_begins[i];
 
-      // entire warp sequentially visit each part
       for (uint32_t part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) {
         auto polygon = multi_polygon.get_polygon(part_idx);
-
         // entire warp sequentially visit each ring
         for (uint32_t ring_idx = 0; ring_idx < polygon.num_rings(); ring_idx++) {
           auto ring = polygon.get_ring(ring_idx);
-          // this is like a hash function, its okay to overflow
-          OptixAabb aabb;
-          aabb.minZ = aabb.maxZ = p_part_begins[i] + part_idx;
-
-          // each lane takes a seg
-          for (auto seg_idx = lane; seg_idx < ring.num_segments(); seg_idx += 32) {
-            const auto& seg = ring.get_line_segment(seg_idx);
-            const auto& p1 = seg.get_p1();
-            const auto& p2 = seg.get_p2();
-
-            aabb.minX = std::min(p1.x(), p2.x());
-            aabb.maxX = std::max(p1.x(), p2.x());
-            aabb.minY = std::min(p1.y(), p2.y());
-            aabb.maxY = std::max(p1.y(), p2.y());
-
-            if (std::is_same_v<scalar_t, double>) {
-              aabb.minX = next_float_from_double(aabb.minX, -1, 2);
-              aabb.maxX = next_float_from_double(aabb.maxX, 1, 2);
-              aabb.minY = next_float_from_double(aabb.minY, -1, 2);
-              aabb.maxY = next_float_from_double(aabb.maxY, 1, 2);
-            }
-            p_aabbs[tail + seg_idx] = aabb;
-            p_multi_poly_ids[tail + seg_idx] = multi_poly_id;
-            p_part_ids[tail + seg_idx] = part_idx;
-            p_ring_ids[tail + seg_idx] = ring_idx;
-          }
-          tail += ring.num_segments();
-          // fill a dummy AABB, so we have aabb-vertex one-to-one relationship
-          if (lane == 0) {
-            p_aabbs[tail] = OptixAabb{0, 0, 0, 0, 0, 0};
+          auto aabbs_per_ring = (ring.num_segments() + segs_per_aabb - 1) / segs_per_aabb;
+          // e.g., num segs = 3, segs_per_aabb = 2
+          // The first aabb covers seg 0,1, with vertex id (0,1,2)
+          // The second aabb covers seg 2, with vertex id (2,3)
+          // each lane takes an aabb
+          for (auto aabb_idx = lane; aabb_idx < aabbs_per_ring; aabb_idx += 32) {
+            INDEX_T local_vertex_begin = aabb_idx * segs_per_aabb;
+            INDEX_T local_vertex_end =
+                std::min((INDEX_T)(local_vertex_begin + segs_per_aabb),
+                         (INDEX_T)ring.num_segments());
+
+            auto tail = atomicAdd(p_tail, 1);
+
+            assert(tail < num_aabbs);
+            p_aabb_multi_poly_ids[tail] = multi_poly_id;
+            p_aabb_part_ids[tail] = part_idx;
+            p_aabb_ring_ids[tail] = ring_idx;
+            p_aabb_vertex_offsets[tail] =
+                thrust::make_pair(local_vertex_begin, local_vertex_end);
+            p_aabb_seq_ids[tail] = i;
           }
-          tail++;
         }
       }
-      assert(p_seg_begins[i + 1] == tail);
     }
   });
 
+  rmm::device_uvector<OptixAabb> aabbs(num_aabbs, stream);
+  part_begins = std::move(rmm::device_uvector<uint32_t>(n_mult_polygons + 1, stream));
+  auto* p_part_begins = part_begins.data();
+  part_begins.set_element_to_zero_async(0, stream);
+  rmm::device_uvector<uint32_t> num_parts(n_mult_polygons, stream);
+
+  thrust::transform(rmm::exec_policy_nosync(stream), multi_poly_ids.begin(),
+                    multi_poly_ids.end(), num_parts.begin(), [=] __device__(uint32_t id) {
+                      const auto& multi_polygon = multi_polys[id];
+                      return multi_polygon.num_polygons();
+                    });
+
+  thrust::inclusive_scan(rmm::exec_policy_nosync(stream), num_parts.begin(),
+                         num_parts.end(), part_begins.begin() + 1);
+  num_parts.resize(0, stream);
+  num_parts.shrink_to_fit(stream);
+
+  // Fill AABBs
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    thrust::make_counting_iterator<uint32_t>(0),
+                    thrust::make_counting_iterator<uint32_t>(num_aabbs), aabbs.begin(),
+                    [=] __device__(const uint32_t& aabb_idx) {
+                      OptixAabb aabb;
+                      // OptixAabb stores floats; a double scalar_t limit would overflow
+                      // when narrowed, so seed the bounds with float limits
+                      aabb.minX = aabb.minY = std::numeric_limits<float>::max();
+                      aabb.maxX = aabb.maxY = std::numeric_limits<float>::lowest();
+
+                      auto multi_poly_id = p_aabb_multi_poly_ids[aabb_idx];
+                      auto part_id = p_aabb_part_ids[aabb_idx];
+                      auto ring_id = p_aabb_ring_ids[aabb_idx];
+                      auto vertex_offset_pair = p_aabb_vertex_offsets[aabb_idx];
+                      auto seq_id = p_aabb_seq_ids[aabb_idx];
+                      auto multi_polygon = multi_polys[multi_poly_id];
+                      const auto& polygon = multi_polygon.get_polygon(part_id);
+                      const auto& ring = polygon.get_ring(ring_id);
+
+                      for (auto vidx = vertex_offset_pair.first;
+                           vidx <= vertex_offset_pair.second; vidx++) {
+                        const auto& v = ring.get_point(vidx);
+                        float x = v.x();
+                        float y = v.y();
+
+                        aabb.minX = fminf(aabb.minX, x);
+                        aabb.maxX = fmaxf(aabb.maxX, x);
+                        aabb.minY = fminf(aabb.minY, y);
+                        aabb.maxY = fmaxf(aabb.maxY, y);
+                      }
+
+                      if (std::is_same_v<scalar_t, double>) {
+                        aabb.minX = next_float_from_double(aabb.minX, -1, 2);
+                        aabb.maxX = next_float_from_double(aabb.maxX, 1, 2);
+                        aabb.minY = next_float_from_double(aabb.minY, -1, 2);
+                        aabb.maxY = next_float_from_double(aabb.maxY, 1, 2);
+                      }
+
+                      aabb.minZ = aabb.maxZ = p_part_begins[seq_id] + part_id;
+                      return aabb;
+                    });
   assert(rt_engine_ != nullptr);
+
   return rt_engine_->BuildAccelCustom(stream.value(), ArrayView<OptixAabb>(aabbs), buffer,
-                                      config_.bvh_fast_build, config_.bvh_fast_compact);
+                                      config_.bvh_fast_build, config_.bvh_compact);
 }
 // Explicitly instantiate the template for specific types
 template class RelateEngine<Point<double, 2>, uint32_t>;
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp b/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp
index 7596e0cb3..9857be56c 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp
@@ -14,12 +14,12 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#include "gpuspatial/index/detail/rt_engine.hpp"
-#include "gpuspatial/utils/cuda_utils.h"
-#include "gpuspatial/utils/exception.h"
+#include "gpuspatial/rt/rt_engine.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
+#include "gpuspatial/utils/exception.hpp"
 #include "gpuspatial/utils/logger.hpp"
 
-#include "rt/shaders/shader_config.h"
+#include "rt/shaders/shader_config.hpp"
 
 #include "rmm/device_scalar.hpp"
 
@@ -57,8 +57,6 @@ void context_log_cb(unsigned int level, const char* tag, const char* message, vo
 }  // namespace
 
 namespace gpuspatial {
-namespace details {
-
 // --- RTConfig Method Definitions ---
 
 void RTConfig::AddModule(const Module& mod) {
@@ -103,6 +101,12 @@ RTConfig get_default_rt_config(const std::string& ptx_root) {
 RTEngine::RTEngine() : initialized_(false) {}
 
 RTEngine::~RTEngine() {
+  cudaError_t probe = cudaPeekAtLastError();
+
+  if (probe == cudaErrorCudartUnloading) {
+    GPUSPATIAL_LOG_ERROR("CUDA runtime is unloaded");
+    return;
+  }
   if (initialized_) {
     releaseOptixResources();
   }
@@ -112,6 +116,7 @@ void RTEngine::Init(const RTConfig& config) {
   if (initialized_) {
     releaseOptixResources();
   }
+  GPUSPATIAL_LOG_INFO("Initialize RTEngine");
   initOptix(config);
   createContext();
   createModule(config);
@@ -163,32 +168,34 @@ OptixTraversableHandle RTEngine::BuildAccelCustom(cudaStream_t cuda_stream,
   OPTIX_CHECK(optixAccelComputeMemoryUsage(optix_context_, &accelOptions, &build_input, 1,
                                            &blas_buffer_sizes));
 
-  GPUSPATIAL_LOG_INFO(
+  GPUSPATIAL_LOG_DEBUG(
       "ComputeBVHMemoryUsage, AABB count: %u, temp size: %zu MB, output size: %zu MB",
       num_prims, blas_buffer_sizes.tempSizeInBytes / 1024 / 1024,
       blas_buffer_sizes.outputSizeInBytes / 1024 / 1024);
 
   rmm::device_buffer temp_buf(blas_buffer_sizes.tempSizeInBytes, cuda_stream);
-  out_buf.resize(blas_buffer_sizes.outputSizeInBytes, cuda_stream);
 
   if (compact) {
+    rmm::device_buffer uncompacted_buf(blas_buffer_sizes.outputSizeInBytes, cuda_stream);
     rmm::device_scalar<uint64_t> compacted_size(cuda_stream);
     OptixAccelEmitDesc emitDesc;
     emitDesc.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
     emitDesc.result = reinterpret_cast<CUdeviceptr>(compacted_size.data());
 
-    OPTIX_CHECK(optixAccelBuild(
-        optix_context_, cuda_stream, &accelOptions, &build_input, 1,
-        reinterpret_cast<CUdeviceptr>(temp_buf.data()), blas_buffer_sizes.tempSizeInBytes,
-        reinterpret_cast<CUdeviceptr>(out_buf.data()),
-        blas_buffer_sizes.outputSizeInBytes, &traversable, &emitDesc, 1));
+    OPTIX_CHECK(optixAccelBuild(optix_context_, cuda_stream, &accelOptions, &build_input,
+                                1, reinterpret_cast<CUdeviceptr>(temp_buf.data()),
+                                blas_buffer_sizes.tempSizeInBytes,
+                                reinterpret_cast<CUdeviceptr>(uncompacted_buf.data()),
+                                uncompacted_buf.size(), &traversable, &emitDesc, 1));
 
     auto size = compacted_size.value(cuda_stream);
     out_buf.resize(size, cuda_stream);
     OPTIX_CHECK(optixAccelCompact(optix_context_, cuda_stream, traversable,
-                                  reinterpret_cast<CUdeviceptr>(out_buf.data()), size,
-                                  &traversable));
+                                  reinterpret_cast<CUdeviceptr>(out_buf.data()),
+                                  out_buf.size(), &traversable));
   } else {
+    out_buf.resize(blas_buffer_sizes.outputSizeInBytes, cuda_stream);
+
     OPTIX_CHECK(optixAccelBuild(
         optix_context_, cuda_stream, &accelOptions, &build_input, 1,
         reinterpret_cast<CUdeviceptr>(temp_buf.data()), blas_buffer_sizes.tempSizeInBytes,
@@ -488,15 +495,14 @@ std::vector<char> RTEngine::readData(const std::string& filename) {
 }
 
 void RTEngine::releaseOptixResources() {
+  GPUSPATIAL_LOG_INFO("Release OptiX resources");
   for (auto& [id, res] : resources_) {
-    optixPipelineDestroy(res.pipeline);
-    optixProgramGroupDestroy(res.raygen_pg);
-    optixProgramGroupDestroy(res.miss_pg);
-    optixProgramGroupDestroy(res.hitgroup_pg);
-    optixModuleDestroy(res.module);
+    OPTIX_CHECK(optixPipelineDestroy(res.pipeline));
+    OPTIX_CHECK(optixProgramGroupDestroy(res.raygen_pg));
+    OPTIX_CHECK(optixProgramGroupDestroy(res.miss_pg));
+    OPTIX_CHECK(optixProgramGroupDestroy(res.hitgroup_pg));
+    OPTIX_CHECK(optixModuleDestroy(res.module));
   }
-  optixDeviceContextDestroy(optix_context_);
+  OPTIX_CHECK(optixDeviceContextDestroy(optix_context_));
 }
-
-}  // namespace details
 }  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu
index 3ffdca9ea..f9a632dd3 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu
@@ -14,10 +14,9 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#include "gpuspatial/index/detail/launch_parameters.h"
-#include "gpuspatial/relate/relate.cuh"
-#include "ray_params.h"
-#include "shader_config.h"
+#include "gpuspatial/rt/launch_parameters.cuh"
+#include "ray_params.cuh"
+#include "shader_config.hpp"
 
 #include <cuda_runtime.h>
 #include <optix_device.h>
@@ -32,17 +31,22 @@ extern "C" __global__ void __intersection__gpuspatial() {
   using point_t = gpuspatial::ShaderPointType;
   constexpr int n_dim = point_t::n_dim;
   using ray_params_t = gpuspatial::detail::RayParams<n_dim>;
-  auto geom1_id = optixGetPayload_0();
-  auto geom2_id = optixGetPrimitiveIndex();
-  const auto& mbr1 = params.mbrs1[geom1_id];
-  const auto& mbr2 = params.mbrs2[geom2_id];
-  const auto& aabb1 = mbr1.ToOptixAabb();
-  const auto aabb2 = mbr2.ToOptixAabb();
+  auto rect1_id = optixGetPayload_0();
+  auto rect2_id = optixGetPrimitiveIndex();
+  const auto& rect1 = params.rects1[rect1_id];
+  const auto& rect2 = params.rects2[rect2_id];
+  const auto& aabb1 = rect1.ToOptixAabb();
+  const auto aabb2 = rect2.ToOptixAabb();
   ray_params_t ray_params(aabb1, false);
 
   if (ray_params.IsHit(aabb2)) {
-    if (mbr1.intersects(mbr2)) {
-      params.ids.Append(thrust::make_pair(geom1_id, geom2_id));
+    if (rect1.intersects(rect2)) {
+      if (params.count == nullptr) {
+        auto tail = params.rect1_ids.Append(rect1_id);
+        params.rect2_ids[tail] = rect2_id;
+      } else {
+        atomicAdd(params.count, 1);
+      }
     }
   }
 }
@@ -53,20 +57,18 @@ extern "C" __global__ void __raygen__gpuspatial() {
   using point_t = gpuspatial::ShaderPointType;
   constexpr int n_dim = point_t::n_dim;
 
-  for (uint32_t i = optixGetLaunchIndex().x; i < params.mbrs1.size();
+  for (uint32_t i = optixGetLaunchIndex().x; i < params.rects1.size();
        i += optixGetLaunchDimensions().x) {
-    const auto& mbr1 = params.mbrs1[i];
-    auto aabb1 = mbr1.ToOptixAabb();
+    const auto& rect1 = params.rects1[i];
+    if (!rect1.valid()) continue;
+    auto aabb1 = rect1.ToOptixAabb();
     gpuspatial::detail::RayParams<n_dim> ray_params(aabb1, false);
-    float3 origin, dir;
+    float3 origin{0, 0, 0}, dir{0, 0, 0};
 
-    origin.x = ray_params.o.x;
-    origin.y = ray_params.o.y;
-    origin.z = 0;
-
-    dir.x = ray_params.d.x;
-    dir.y = ray_params.d.y;
-    dir.z = 0;
+    for (int dim = 0; dim < n_dim; dim++) {
+      (&origin.x)[dim] = (&ray_params.o.x)[dim];
+      (&dir.x)[dim] = (&ray_params.d.x)[dim];
+    }
 
     float tmin = 0;
     float tmax = 1;
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu
index d85d63741..607d95649 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu
@@ -14,9 +14,9 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#include "gpuspatial/index/detail/launch_parameters.h"
-#include "ray_params.h"
-#include "shader_config.h"
+#include "gpuspatial/rt/launch_parameters.cuh"
+#include "ray_params.cuh"
+#include "shader_config.hpp"
 
 #include <cuda_runtime.h>
 #include <optix_device.h>
@@ -31,20 +31,25 @@ extern "C" __global__ void __intersection__gpuspatial() {
   using point_t = gpuspatial::ShaderPointType;
   constexpr int n_dim = point_t::n_dim;
   using ray_params_t = gpuspatial::detail::RayParams<n_dim>;
-  auto geom1_id = optixGetPrimitiveIndex();
-  uint64_t geom2_id = optixGetPayload_0();
-  const auto& mbr1 = params.mbrs1[geom1_id];
-  const auto& mbr2 = params.mbrs2[geom2_id];
-  const auto& aabb1 = mbr1.ToOptixAabb();
-  const auto aabb2 = mbr2.ToOptixAabb();
+  auto rect1_id = optixGetPrimitiveIndex();
+  uint64_t rect2_id = optixGetPayload_0();
+  const auto& rect1 = params.rects1[rect1_id];
+  const auto& rect2 = params.rects2[rect2_id];
+  const auto& aabb1 = rect1.ToOptixAabb();
+  const auto aabb2 = rect2.ToOptixAabb();
 
   ray_params_t ray_params(aabb2, true);
 
   if (ray_params.IsHit(aabb1)) {  // ray cast from AABB2 hits AABB1
     ray_params = ray_params_t(aabb1, false);
     if (!ray_params.IsHit(aabb2)) {  // ray cast from AABB1 does not hit AABB2
-      if (mbr1.intersects(mbr2)) {
-        params.ids.Append(thrust::make_pair(geom1_id, geom2_id));
+      if (rect1.intersects(rect2)) {
+        if (params.count == nullptr) {
+          auto tail = params.rect1_ids.Append(rect1_id);
+          params.rect2_ids[tail] = rect2_id;
+        } else {
+          atomicAdd(params.count, 1);
+        }
       }
     }
   }
@@ -56,20 +61,20 @@ extern "C" __global__ void __raygen__gpuspatial() {
   using point_t = gpuspatial::ShaderPointType;
   constexpr int n_dim = point_t::n_dim;
 
-  for (uint32_t i = optixGetLaunchIndex().x; i < params.mbrs2.size();
+  for (uint32_t i = optixGetLaunchIndex().x; i < params.rects2.size();
        i += optixGetLaunchDimensions().x) {
-    const auto& mbr2 = params.mbrs2[i];
-    auto aabb2 = mbr2.ToOptixAabb();
-    gpuspatial::detail::RayParams<n_dim> ray_params(aabb2, true);
-    float3 origin, dir;
+    const auto& rect2 = params.rects2[i];
+
+    if (!rect2.valid()) continue;
 
-    origin.x = ray_params.o.x;
-    origin.y = ray_params.o.y;
-    origin.z = 0;
+    auto aabb2 = rect2.ToOptixAabb();
+    gpuspatial::detail::RayParams<n_dim> ray_params(aabb2, true);
+    float3 origin{0, 0, 0}, dir{0, 0, 0};
 
-    dir.x = ray_params.d.x;
-    dir.y = ray_params.d.y;
-    dir.z = 0;
+    for (int dim = 0; dim < n_dim; dim++) {
+      (&origin.x)[dim] = (&ray_params.o.x)[dim];
+      (&dir.x)[dim] = (&ray_params.d.x)[dim];
+    }
 
     float tmin = 0;
     float tmax = 1;
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake
index 56daf449a..13aac4e03 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake
@@ -20,7 +20,7 @@ function(CONFIG_SHADERS SHADER_PTX_FILES)
   set(SHADER_POINT_TYPES "SHADER_POINT_FLOAT_2D;SHADER_POINT_DOUBLE_2D")
 
   set(SHADERS_DEPS "${PROJECT_SOURCE_DIR}/include/gpuspatial/geom"
-                   "${PROJECT_SOURCE_DIR}/include/gpuspatial/index/detail")
+                   "${PROJECT_SOURCE_DIR}/include/gpuspatial/rt")
 
   set(OUTPUT_DIR "${PROJECT_BINARY_DIR}/shaders_ptx")
   set(OPTIX_MODULE_EXTENSION ".ptx")
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu
index f96226c69..3a5c216ba 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu
@@ -14,12 +14,12 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#include "gpuspatial/geom/line_segment.cuh"
-#include "gpuspatial/geom/ray_crossing_counter.cuh"
-#include "gpuspatial/index/detail/launch_parameters.h"
-#include "gpuspatial/relate/relate.cuh"
-#include "gpuspatial/utils/floating_point.h"
-#include "shader_config.h"
+#include "gpuspatial/geom/ray_crossing_counter.hpp"
+#include "gpuspatial/relate/relate.hpp"
+#include "gpuspatial/rt/launch_parameters.cuh"
+#include "gpuspatial/utils/floating_point.hpp"
+#include "gpuspatial/utils/helpers.cuh"
+#include "shader_config.hpp"
 
 #include <cuda_runtime.h>
 #include <optix_device.h>
@@ -44,35 +44,36 @@ extern "C" __global__ void __intersection__gpuspatial() {
   auto point_part_id = optixGetPayload_7();
 
   const auto& multi_polygons = params.multi_polygons;
-  auto point_idx = params.ids[query_idx].first;
-  auto multi_polygon_idx = params.ids[query_idx].second;
+  auto point_idx = params.query_point_ids[query_idx];
+  auto multi_polygon_idx = params.query_multi_polygon_ids[query_idx];
   auto hit_multipolygon_idx = params.aabb_multi_poly_ids[aabb_id];
   auto hit_part_idx = params.aabb_part_ids[aabb_id];
   auto hit_ring_idx = params.aabb_ring_ids[aabb_id];
-
+  const auto& vertex_offsets = params.aabb_vertex_offsets[aabb_id];
   // the seg being hit is not from the query polygon
   if (hit_multipolygon_idx != multi_polygon_idx || hit_part_idx != part_idx ||
       hit_ring_idx != ring_idx) {
     return;
   }
 
-  uint32_t local_v1_idx = aabb_id - params.seg_begins[reordered_multi_polygon_idx];
-  uint32_t global_v1_idx = v_offset + local_v1_idx;
-  uint32_t global_v2_idx = global_v1_idx + 1;
-
-  auto vertices = multi_polygons.get_vertices();
-  // segment being hit
-  const auto& v1 = vertices[global_v1_idx];
-  const auto& v2 = vertices[global_v2_idx];
-
+  const auto& multi_polygon = multi_polygons[multi_polygon_idx];
+  const auto& polygon = multi_polygon.get_polygon(part_idx);
+  const auto& ring = polygon.get_ring(ring_idx);
   RayCrossingCounter locator(crossing_count, point_on_seg);
 
-  if (!params.points.empty()) {
-    const auto& p = params.points[point_idx];
-    locator.countSegment(p, v1, v2);
-  } else if (!params.multi_points.empty()) {
-    const auto& p = params.multi_points[point_idx].get_point(point_part_id);
-    locator.countSegment(p, v1, v2);
+  // For each segment in the AABB, count crossings
+  for (auto vertex_offset = vertex_offsets.first; vertex_offset < vertex_offsets.second;
+       ++vertex_offset) {
+    const auto& v1 = ring.get_point(vertex_offset);
+    const auto& v2 = ring.get_point(vertex_offset + 1);
+
+    if (!params.points.empty()) {
+      const auto& p = params.points[point_idx];
+      locator.countSegment(p, v1, v2);
+    } else if (!params.multi_points.empty()) {
+      const auto& p = params.multi_points[point_idx].get_point(point_part_id);
+      locator.countSegment(p, v1, v2);
+    }
   }
 
   optixSetPayload_5(locator.get_crossing_count());
@@ -82,22 +83,23 @@ extern "C" __global__ void __intersection__gpuspatial() {
 extern "C" __global__ void __raygen__gpuspatial() {
   using namespace gpuspatial;
   using point_t = gpuspatial::ShaderPointType;
-  const auto& ids = params.ids;
   const auto& multi_polygons = params.multi_polygons;
 
-  for (uint32_t i = optixGetLaunchIndex().x; i < ids.size();
+  for (uint32_t i = optixGetLaunchIndex().x; i < params.query_size;
        i += optixGetLaunchDimensions().x) {
-    auto point_idx = ids[i].first;
-    auto multi_polygon_idx = ids[i].second;
+    auto point_idx = params.query_point_ids[i];
+    auto multi_polygon_idx = params.query_multi_polygon_ids[i];
 
-    auto it = thrust::lower_bound(thrust::seq, params.multi_polygon_ids.begin(),
-                                  params.multi_polygon_ids.end(), multi_polygon_idx);
-    assert(it != params.multi_polygon_ids.end());
+    auto it = thrust::lower_bound(thrust::seq, params.uniq_multi_polygon_ids.begin(),
+                                  params.uniq_multi_polygon_ids.end(), multi_polygon_idx);
+    assert(it != params.uniq_multi_polygon_ids.end());
     uint32_t reordered_multi_polygon_idx =
-        thrust::distance(params.multi_polygon_ids.begin(), it);
-    assert(params.multi_polygon_ids[reordered_multi_polygon_idx] == multi_polygon_idx);
+        thrust::distance(params.uniq_multi_polygon_ids.begin(), it);
+    assert(params.uniq_multi_polygon_ids[reordered_multi_polygon_idx] ==
+           multi_polygon_idx);
 
     auto handle_point = [&](const point_t& p, uint32_t point_part_id, int& IM) {
+      assert(!p.empty());
       float3 origin;
       // each polygon takes a z-plane
       origin.x = p.x();
@@ -108,7 +110,8 @@ extern "C" __global__ void __raygen__gpuspatial() {
       const auto& mbr = multi_polygon.get_mbr();
       auto width = mbr.get_max().x() - mbr.get_min().x();
       float tmin = 0;
-      float tmax = width;
+      // ensure the float tmax is greater than the double-precision width
+      float tmax = next_float_from_double(width, 1, 2);
 
       // first polygon offset
       uint32_t part_offset = multi_polygons.get_prefix_sum_geoms()[multi_polygon_idx];
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu
index 93f5ceb05..c728b4aa3 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu
@@ -14,8 +14,8 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#include "gpuspatial/index/detail/launch_parameters.h"
-#include "shader_config.h"
+#include "gpuspatial/rt/launch_parameters.cuh"
+#include "shader_config.hpp"
 
 #include <cuda_runtime.h>
 #include <optix_device.h>
@@ -29,51 +29,38 @@ extern "C" __constant__
 
 extern "C" __global__ void __intersection__gpuspatial() {
   auto aabb_id = optixGetPrimitiveIndex();
-  auto geom2_id = optixGetPayload_0();
-  const auto& point = params.points2[geom2_id];
-  const auto& mbrs1 = params.mbrs1;
+  auto point_id = optixGetPayload_0();
+  const auto& point = params.points[point_id];
+  const auto& rect = params.rects[aabb_id];
 
-  if (params.grouped) {
-    assert(!params.prefix_sum.empty());
-    auto begin = params.prefix_sum[aabb_id];
-    auto end = params.prefix_sum[aabb_id + 1];
-
-    for (auto offset = begin; offset < end; offset++) {
-      auto geom1_id = params.reordered_indices[offset];
-      if (mbrs1.empty()) {
-        params.ids.Append(thrust::make_pair(geom1_id, geom2_id));
-      } else {
-        const auto& mbr1 = mbrs1[geom1_id];
-
-        if (mbr1.covers(point.as_float())) {
-          params.ids.Append(thrust::make_pair(geom1_id, geom2_id));
-        }
-      }
-    }
-  } else {
-    assert(!mbrs1.empty());
-    auto geom1_id = aabb_id;
-    const auto& mbr1 = mbrs1[geom1_id];
-
-    if (mbr1.covers(point.as_float())) {
-      params.ids.Append(thrust::make_pair(geom1_id, geom2_id));
+  if (rect.covers(point)) {
+    if (params.count == nullptr) {
+      auto tail = params.rect_ids.Append(aabb_id);
+      params.point_ids[tail] = point_id;
+    } else {
+      atomicAdd(params.count, 1);
     }
   }
 }
 
 extern "C" __global__ void __raygen__gpuspatial() {
+  using point_t = gpuspatial::ShaderPointType;
+  constexpr int n_dim = point_t::n_dim;
   float tmin = 0;
   float tmax = FLT_MIN;
 
-  for (uint32_t i = optixGetLaunchIndex().x; i < params.points2.size();
+  for (uint32_t i = optixGetLaunchIndex().x; i < params.points.size();
        i += optixGetLaunchDimensions().x) {
-    const auto& p = params.points2[i];
+    const auto& p = params.points[i];
+    if (p.empty()) {
+      continue;
+    }
 
-    float3 origin;
+    float3 origin{0, 0, 0};
 
-    origin.x = p.get_coordinate(0);
-    origin.y = p.get_coordinate(1);
-    origin.z = 0;
+    for (int dim = 0; dim < n_dim; dim++) {
+      (&origin.x)[dim] = p.get_coordinate(dim);
+    }
     float3 dir = {0, 0, 1};
 
     optixTrace(params.handle, origin, dir, tmin, tmax, 0, OptixVisibilityMask(255),
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu
index 97cb948d1..05066d793 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu
@@ -14,11 +14,11 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-#include "gpuspatial/geom/line_segment.cuh"
-#include "gpuspatial/geom/ray_crossing_counter.cuh"
-#include "gpuspatial/index/detail/launch_parameters.h"
-#include "gpuspatial/relate/relate.cuh"
-#include "shader_config.h"
+#include "gpuspatial/geom/ray_crossing_counter.hpp"
+#include "gpuspatial/relate/relate.hpp"
+#include "gpuspatial/rt/launch_parameters.cuh"
+#include "gpuspatial/utils/helpers.cuh"
+#include "shader_config.hpp"
 
 #include <cuda_runtime.h>
 #include <optix_device.h>
@@ -41,32 +41,34 @@ extern "C" __global__ void __intersection__gpuspatial() {
   auto point_on_seg = optixGetPayload_5();
   auto point_part_id = optixGetPayload_6();
   const auto& polygons = params.polygons;
-  auto point_idx = params.ids[query_idx].first;
-  auto polygon_idx = params.ids[query_idx].second;
+  auto point_idx = params.query_point_ids[query_idx];
+  auto polygon_idx = params.query_polygon_ids[query_idx];
   auto hit_polygon_idx = params.aabb_poly_ids[aabb_id];
   auto hit_ring_idx = params.aabb_ring_ids[aabb_id];
+  const auto& vertex_offsets = params.aabb_vertex_offsets[aabb_id];
   // the seg being hit is not from the query polygon
   if (hit_polygon_idx != polygon_idx || hit_ring_idx != ring_idx) {
     return;
   }
 
-  uint32_t local_v1_idx = aabb_id - params.seg_begins[reordered_polygon_idx];
-  uint32_t global_v1_idx = v_offset + local_v1_idx;
-  uint32_t global_v2_idx = global_v1_idx + 1;
+  auto ring = polygons[polygon_idx].get_ring(ring_idx);
+  RayCrossingCounter locator(crossing_count, point_on_seg);
 
-  auto vertices = polygons.get_vertices();
-  // segment being hit
-  const auto& v1 = vertices[global_v1_idx];
-  const auto& v2 = vertices[global_v2_idx];
+  // For each segment in the AABB, count crossings
+  for (auto vertex_offset = vertex_offsets.first; vertex_offset < vertex_offsets.second;
+       ++vertex_offset) {
+    const auto& v1 = ring.get_point(vertex_offset);
+    const auto& v2 = ring.get_point(vertex_offset + 1);
 
-  RayCrossingCounter locator(crossing_count, point_on_seg);
-  if (!params.points.empty()) {
-    const auto& p = params.points[point_idx];
-    locator.countSegment(p, v1, v2);
-  } else if (!params.multi_points.empty()) {
-    const auto& p = params.multi_points[point_idx].get_point(point_part_id);
-    locator.countSegment(p, v1, v2);
+    if (!params.points.empty()) {
+      const auto& p = params.points[point_idx];
+      locator.countSegment(p, v1, v2);
+    } else if (!params.multi_points.empty()) {
+      const auto& p = params.multi_points[point_idx].get_point(point_part_id);
+      locator.countSegment(p, v1, v2);
+    }
   }
+
   optixSetPayload_4(locator.get_crossing_count());
   optixSetPayload_5(locator.get_point_on_segment());
 }
@@ -74,32 +76,30 @@ extern "C" __global__ void __intersection__gpuspatial() {
 extern "C" __global__ void __raygen__gpuspatial() {
   using namespace gpuspatial;
   using point_t = gpuspatial::ShaderPointType;
-  const auto& ids = params.ids;
   const auto& polygons = params.polygons;
 
-  for (uint32_t i = optixGetLaunchIndex().x; i < ids.size();
+  for (uint32_t i = optixGetLaunchIndex().x; i < params.query_size;
        i += optixGetLaunchDimensions().x) {
-    auto point_idx = ids[i].first;
-    auto polygon_idx = ids[i].second;
+    auto point_idx = params.query_point_ids[i];
+    auto polygon_idx = params.query_polygon_ids[i];
 
-    auto it = thrust::lower_bound(thrust::seq, params.polygon_ids.begin(),
-                                  params.polygon_ids.end(), polygon_idx);
-    assert(it != params.polygon_ids.end());
-    uint32_t reordered_polygon_idx = thrust::distance(params.polygon_ids.begin(), it);
-    assert(params.polygon_ids[reordered_polygon_idx] == polygon_idx);
+    auto it = thrust::lower_bound(thrust::seq, params.uniq_polygon_ids.begin(),
+                                  params.uniq_polygon_ids.end(), polygon_idx);
+    assert(it != params.uniq_polygon_ids.end());
+    uint32_t reordered_polygon_idx =
+        thrust::distance(params.uniq_polygon_ids.begin(), it);
+    assert(params.uniq_polygon_ids[reordered_polygon_idx] == polygon_idx);
 
     auto handle_point = [&](const point_t& p, uint32_t point_part_id, int& IM) {
-      float3 origin;
-      // each polygon takes a z-plane
-      origin.x = p.x();
-      origin.y = p.y();
+      assert(!p.empty());
       // cast ray toward positive x-axis
       float3 dir = {1, 0, 0};
       const auto& polygon = polygons[polygon_idx];
       const auto& mbr = polygon.get_mbr();
       auto width = mbr.get_max().x() - mbr.get_min().x();
       float tmin = 0;
-      float tmax = width;
+      // ensure the float tmax is greater than the double-precision width
+      float tmax = next_float_from_double(width, 1, 2);
 
       // first polygon offset
       uint32_t ring_offset = polygons.get_prefix_sum_polygons()[polygon_idx];
@@ -119,7 +119,11 @@ extern "C" __global__ void __raygen__gpuspatial() {
       IM |= IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D;
       uint32_t ring = 0;
       locator.Init();
-      origin.z = reordered_polygon_idx;
+      float3 origin;
+      // each polygon takes a z-plane
+      origin.x = p.x();
+      origin.y = p.y();
+      origin.z = polygon_idx;
       // test exterior
       optixTrace(params.handle, origin, dir, tmin, tmax, 0, OptixVisibilityMask(255),
                  OPTIX_RAY_FLAG_NONE,             // OPTIX_RAY_FLAG_NONE,
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.h b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.cuh
similarity index 95%
rename from c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.h
rename to c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.cuh
index 447590a26..1e920400b 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.h
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.cuh
@@ -17,9 +17,9 @@
 
 #pragma once
 
-#include "gpuspatial/geom/box.cuh"
-#include "gpuspatial/geom/point.cuh"
-#include "gpuspatial/utils/cuda_utils.h"
+#include "gpuspatial/geom/box.hpp"
+#include "gpuspatial/geom/point.hpp"
+#include "gpuspatial/utils/cuda_utils.hpp"
 
 #include <optix.h>
 #include <thrust/swap.h>
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/shader_config.h b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/shader_config.hpp
similarity index 100%
rename from c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/shader_config.h
rename to c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/shader_config.hpp
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu
new file mode 100644
index 000000000..9f76af495
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu
@@ -0,0 +1,682 @@
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "gpuspatial/index/rt_spatial_index.cuh"
+#include "gpuspatial/rt/launch_parameters.cuh"
+#include "gpuspatial/utils/launcher.hpp"
+#include "gpuspatial/utils/logger.hpp"
+#include "gpuspatial/utils/morton_code.hpp"
+#include "gpuspatial/utils/stopwatch.hpp"
+
+#include "rt/shaders/shader_id.hpp"
+
+#include "rmm/exec_policy.hpp"
+
+#include <thrust/logical.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+
+#define OPTIX_MAX_RAYS (1lu << 30)
+
+namespace gpuspatial {
+namespace detail {
+
+// Converts each MBR to an OptixAabb; an MBR whose min or max corner is empty becomes
+// an AABB with min (0) > max (-1) on every axis.
+template <typename POINT_T>
+static rmm::device_uvector<OptixAabb> ComputeAABBs(rmm::cuda_stream_view stream,
+                                                   const ArrayView<Box<POINT_T>>& mbrs) {
+  rmm::device_uvector<OptixAabb> aabbs(mbrs.size(), stream);
+  thrust::transform(rmm::exec_policy_nosync(stream), mbrs.begin(), mbrs.end(),
+                    aabbs.begin(), [] __device__(const Box<POINT_T>& mbr) {
+                      // handle empty boxes
+                      if (mbr.get_min().empty() || mbr.get_max().empty()) {
+                        OptixAabb empty_aabb;
+                        empty_aabb.minX = empty_aabb.minY = empty_aabb.minZ = 0.0f;
+                        empty_aabb.maxX = empty_aabb.maxY = empty_aabb.maxZ = -1.0f;
+                        return empty_aabb;
+                      }
+                      return mbr.ToOptixAabb();
+                    });
+  // Return the local by name: std::move here would only disable copy elision
+  return aabbs;
+}
+
+// Orders `points` by Morton code, packs every `group_size` consecutive points into one
+// AABB and returns those AABBs. Also fills `reordered_indices` (point ids in Morton
+// order), `prefix_sum` (n_aabbs + 1 offsets into `reordered_indices`) and `mbrs` (the
+// bounding box of each group).
+template <typename POINT_T, typename INDEX_T>
+rmm::device_uvector<OptixAabb> ComputeAABBs(
+    rmm::cuda_stream_view stream, rmm::device_uvector<POINT_T>& points,
+    rmm::device_uvector<INDEX_T>& prefix_sum,
+    rmm::device_uvector<INDEX_T>& reordered_indices, int group_size,
+    rmm::device_uvector<Box<POINT_T>>& mbrs) {
+  using scalar_t = typename POINT_T::scalar_t;
+  using box_t = Box<POINT_T>;
+  constexpr int n_dim = POINT_T::n_dim;
+  static_assert(n_dim == 2 || n_dim == 3, "Only 2D and 3D points are supported");
+  POINT_T min_world_corner, max_world_corner;
+  min_world_corner.set_max();
+  max_world_corner.set_min();
+
+  for (int dim = 0; dim < n_dim; dim++) {
+    auto min_val = thrust::transform_reduce(
+        rmm::exec_policy_nosync(stream), points.begin(), points.end(),
+        [=] __device__(const POINT_T& p) -> scalar_t { return p.get_coordinate(dim); },
+        std::numeric_limits<scalar_t>::max(), thrust::minimum<scalar_t>());
+    auto max_val = thrust::transform_reduce(
+        rmm::exec_policy_nosync(stream), points.begin(), points.end(),
+        [=] __device__(const POINT_T& p) -> scalar_t { return p.get_coordinate(dim); },
+        std::numeric_limits<scalar_t>::lowest(), thrust::maximum<scalar_t>());
+    min_world_corner.set_coordinate(dim, min_val);
+    max_world_corner.set_coordinate(dim, max_val);
+  }
+
+  auto np = points.size();
+  rmm::device_uvector<uint32_t> morton_codes(np, stream);
+  // compute morton codes and reorder indices
+  thrust::transform(rmm::exec_policy_nosync(stream), points.begin(), points.end(),
+                    morton_codes.begin(), [=] __device__(const POINT_T& p) {
+                      POINT_T norm_p;
+
+                      for (int dim = 0; dim < n_dim; dim++) {
+                        auto min_val = min_world_corner.get_coordinate(dim);
+                        auto max_val = max_world_corner.get_coordinate(dim);
+                        auto extent = min_val == max_val ? 1 : max_val - min_val;
+                        auto norm_val = (p.get_coordinate(dim) - min_val) / extent;
+                        norm_p.set_coordinate(dim, norm_val);
+                      }
+                      return detail::morton_code(norm_p.get_vec());
+                    });
+  reordered_indices.resize(np, stream);
+  thrust::sequence(rmm::exec_policy_nosync(stream), reordered_indices.begin(),
+                   reordered_indices.end());
+  thrust::sort_by_key(rmm::exec_policy_nosync(stream), morton_codes.begin(),
+                      morton_codes.end(), reordered_indices.begin());
+  auto n_aabbs = (np + group_size - 1) / group_size;
+  mbrs.resize(n_aabbs, stream);
+  rmm::device_uvector<OptixAabb> aabbs(n_aabbs, stream);
+  rmm::device_uvector<INDEX_T> np_per_aabb(n_aabbs, stream);
+  auto* p_reordered_indices = reordered_indices.data();
+  auto* p_aabbs = aabbs.data();
+  auto* p_np_per_aabb = np_per_aabb.data();
+  ArrayView<POINT_T> v_points(points);
+  ArrayView<box_t> v_mbrs(mbrs);
+  // each warp takes an AABB and processes up to group_size points
+  LaunchKernel(stream, [=] __device__() mutable {
+    using WarpReduce = cub::WarpReduce<scalar_t>;
+    // One temp storage slot per active warp
+    __shared__ typename WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32];
+    const int warp_id = threadIdx.x / 32;
+    const int lane_id = threadIdx.x % 32;
+    // Calculate global ID of the warp to stride through AABBs
+    const int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+    const int total_warps = (gridDim.x * blockDim.x) / 32;
+
+    // Grid-Stride Loop: Each warp processes one AABB (one group of points)
+    for (uint32_t aabb_id = global_warp_id; aabb_id < n_aabbs; aabb_id += total_warps) {
+      INDEX_T idx_begin = aabb_id * group_size;
+      INDEX_T idx_end = thrust::min((INDEX_T)np, (INDEX_T)(idx_begin + group_size));
+      int count = idx_end - idx_begin;
+      // 1. Initialize Thread-Local Accumulators (Registers)
+      // Initialize to limits so empty/out-of-bounds threads don't affect reduction
+      scalar_t thread_min[n_dim];
+      scalar_t thread_max[n_dim];
+
+#pragma unroll
+      for (int d = 0; d < n_dim; d++) {
+        thread_min[d] = std::numeric_limits<scalar_t>::max();
+        thread_max[d] = std::numeric_limits<scalar_t>::lowest();
+      }
+
+      // 2. Loop over the points in the group (Stride by 32)
+      // Every thread processes roughly group_size/32 points
+      for (int i = lane_id; i < count; i += 32) {
+        // Load index (Coalesced access to indices)
+        INDEX_T point_idx = p_reordered_indices[idx_begin + i];
+
+        // Load Point (Indirect access - unavoidable due to reordering)
+        const POINT_T& p = v_points[point_idx];
+
+// Accumulate min/max locally in registers
+#pragma unroll
+        for (int d = 0; d < n_dim; d++) {
+          scalar_t val = p.get_coordinate(d);
+          thread_min[d] = thrust::min(thread_min[d], val);
+          thread_max[d] = thrust::max(thread_max[d], val);
+        }
+      }
+
+      // 3. Warp Reduction (Perform once per dimension per AABB)
+      POINT_T final_min, final_max;
+#pragma unroll
+      for (int d = 0; d < n_dim; d++) {
+        scalar_t agg_min =
+            WarpReduce(temp_storage[warp_id]).Reduce(thread_min[d], thrust::minimum<>());
+        __syncwarp();  // CUB: barrier required before temp_storage is reused
+        scalar_t agg_max =
+            WarpReduce(temp_storage[warp_id]).Reduce(thread_max[d], thrust::maximum<>());
+        __syncwarp();  // same, before the next dimension / next AABB reuses it
+        // Only lane 0 holds the valid reduction result
+        if (lane_id == 0) {
+          final_min.set_coordinate(d, agg_min);
+          final_max.set_coordinate(d, agg_max);
+        }
+      }
+
+      // 4. Store Results to Global Memory
+      if (lane_id == 0) {
+        p_np_per_aabb[aabb_id] = count;
+        if (count > 0) {
+          box_t ext_mbr(final_min, final_max);
+          v_mbrs[aabb_id] = ext_mbr;
+          p_aabbs[aabb_id] = ext_mbr.ToOptixAabb();
+        } else {
+          // Handle empty AABB case
+          OptixAabb empty_aabb;
+          empty_aabb.minX = empty_aabb.minY = empty_aabb.minZ = 0.0f;
+          empty_aabb.maxX = empty_aabb.maxY = empty_aabb.maxZ = -1.0f;
+          v_mbrs[aabb_id] = box_t();
+          p_aabbs[aabb_id] = empty_aabb;
+        }
+      }
+    }
+  });
+  prefix_sum.resize(n_aabbs + 1, stream);
+  prefix_sum.set_element_to_zero_async(0, stream);
+  thrust::inclusive_scan(rmm::exec_policy_nosync(stream), np_per_aabb.begin(),
+                         np_per_aabb.end(), prefix_sum.begin() + 1);
+#ifndef NDEBUG
+  auto* p_prefix_sum = prefix_sum.data();
+  // Debug only: every point must lie inside the AABB and the MBR of its group
+  thrust::for_each(rmm::exec_policy_nosync(stream), thrust::counting_iterator<size_t>(0),
+                   thrust::counting_iterator<size_t>(aabbs.size()),
+                   [=] __device__(size_t aabb_idx) {
+                     auto begin = p_prefix_sum[aabb_idx];
+                     auto end = p_prefix_sum[aabb_idx + 1];
+                     const auto& aabb = p_aabbs[aabb_idx];
+
+                     for (auto i = begin; i < end; i++) {
+                       auto point_idx = p_reordered_indices[i];
+                       const auto& p = v_points[point_idx];
+                       assert(v_mbrs[aabb_idx].covers(p));
+                       for (int dim = 0; dim < n_dim; dim++) {
+                         auto coord = p.get_coordinate(dim);
+                         assert(coord >= (&aabb.minX)[dim] && coord <= (&aabb.maxX)[dim]);
+                       }
+                     }
+                   });
+#endif
+  return aabbs;
+}
+
+// Exact-match refinement for point-vs-point probing.
+//
+// Entry i of rect_ids / point_ids names a candidate AABB and a probe point. One warp
+// handles one such pair: its lanes stride (by WARP_SIZE) over
+// reordered_indices[prefix_sum[rect_id] .. prefix_sum[rect_id + 1]) and, whenever the
+// referenced build point compares equal to the probe point, append the build point id
+// to `build_indices` and store the probe point id in `probe_indices` at the slot
+// returned by Append.
+template <typename POINT_T, typename INDEX_T>
+void RefineExactPoints(rmm::cuda_stream_view stream, ArrayView<POINT_T> build_points,
+                       ArrayView<POINT_T> probe_points, ArrayView<INDEX_T> prefix_sum,
+                       ArrayView<INDEX_T> reordered_indices, ArrayView<INDEX_T> rect_ids,
+                       ArrayView<INDEX_T> point_ids, Queue<INDEX_T>& build_indices,
+                       ArrayView<INDEX_T> probe_indices) {
+  auto out_queue = build_indices.DeviceObject();
+
+  LaunchKernel(stream, [=] __device__() mutable {
+    auto lane = threadIdx.x % 32;
+    auto warp_idx = TID_1D / 32;
+    auto warp_count = TOTAL_THREADS_1D / 32;
+
+    for (uint32_t pair = warp_idx; pair < rect_ids.size(); pair += warp_count) {
+      auto aabb_id = rect_ids[pair];
+      auto probe_id = point_ids[pair];
+      auto range_begin = prefix_sum[aabb_id];
+      auto range_end = prefix_sum[aabb_id + 1];
+
+      // Lanes of the warp take interleaved slots of the AABB's point range.
+      uint32_t slot = lane + range_begin;
+      while (slot < range_end) {
+        auto candidate_id = reordered_indices[slot];
+        if (build_points[candidate_id] == probe_points[probe_id]) {
+          auto tail = out_queue.Append(candidate_id);
+          probe_indices[tail] = probe_id;
+        }
+        slot += WARP_SIZE;
+      }
+    }
+  });
+}
+}  // namespace detail
+
+// Constructs an empty index: initializes config_ from `config`, creates an rmm CUDA
+// stream pool from config_.concurrency, and starts with indexing_points_ = false and a
+// zero handle_ (both are assigned later by FinishBuilding()).
+template <typename SCALAR_T, int N_DIM>
+RTSpatialIndex<SCALAR_T, N_DIM>::RTSpatialIndex(const RTSpatialIndexConfig& config)
+    : config_(config),
+      stream_pool_(std::make_unique<rmm::cuda_stream_pool>(config_.concurrency)),
+      indexing_points_(false),
+      handle_(0) {}
+
+// Drops the device-side index data: the BVH buffer, the build rectangles and the build
+// points are each resized to zero and shrunk on the default stream, which is then
+// synchronized before returning.
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::Clear() {
+  GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), Clear", this,
+                      rmm::available_device_memory().first / 1024 / 1024);
+  auto default_stream = rmm::cuda_stream_default;
+  // Empty a buffer, then shrink its allocation to the (now zero) size.
+  auto release = [&default_stream](auto& buffer) {
+    buffer.resize(0, default_stream);
+    buffer.shrink_to_fit(default_stream);
+  };
+
+  release(bvh_buffer_);
+  release(rects_);
+  release(points_);
+  default_stream.synchronize();
+}
+
+// Appends `n_rects` boxes from the host array `rects` to the device-side build set.
+//
+// rects_ is grown by n_rects elements and the new boxes are copied host-to-device
+// behind the existing ones, all on the default stream. The copy is issued
+// asynchronously; this function does not synchronize the stream itself.
+// A call with n_rects == 0 only logs and returns.
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::PushBuild(const box_t* rects, uint32_t n_rects) {
+  // n_rects is a uint32_t, so it has to be formatted with %u; %zu expects a size_t.
+  GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), PushBuild, rectangles %u", this,
+                      rmm::available_device_memory().first / 1024 / 1024, n_rects);
+  if (n_rects == 0) return;
+  auto stream = rmm::cuda_stream_default;
+  auto prev_size = rects_.size();
+
+  rects_.resize(prev_size + n_rects, stream);
+  CUDA_CHECK(cudaMemcpyAsync(rects_.data() + prev_size, rects, sizeof(box_t) * n_rects,
+                             cudaMemcpyHostToDevice, stream));
+}
+
+// Finalizes the index over everything pushed so far; all work is issued on the default
+// stream.
+//
+// indexing_points_ becomes true when every build box is degenerate (min == max in each
+// dimension). In that case the boxes' min corners are copied into points_ and the
+// point overload of detail::ComputeAABBs is called with config_.n_points_per_aabb;
+// otherwise the box overload of detail::ComputeAABBs is used. The resulting AABBs are
+// passed to the ray-tracing engine's BuildAccelCustom and the returned handle is kept
+// in handle_.
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::FinishBuilding() {
+  auto default_stream = rmm::cuda_stream_default;
+
+  // A box counts as a point when min and max coincide in every dimension.
+  auto is_degenerate = [] __device__(const box_t& box) {
+    for (int dim = 0; dim < n_dim; dim++) {
+      if (!(box.get_min(dim) == box.get_max(dim))) return false;
+    }
+    return true;
+  };
+  indexing_points_ = thrust::all_of(rmm::exec_policy_nosync(default_stream),
+                                    rects_.begin(), rects_.end(), is_degenerate);
+
+  rmm::device_uvector<OptixAabb> aabbs{0, default_stream};
+  if (!indexing_points_) {
+    aabbs = std::move(detail::ComputeAABBs(default_stream, ArrayView<box_t>(rects_)));
+  } else {
+    points_.resize(rects_.size(), default_stream);
+    thrust::transform(rmm::exec_policy_nosync(default_stream), rects_.begin(),
+                      rects_.end(), points_.begin(),
+                      [] __device__(const box_t& box) { return box.get_min(); });
+    aabbs = std::move(detail::ComputeAABBs(default_stream, points_, point_ranges_,
+                                           reordered_point_indices_,
+                                           config_.n_points_per_aabb, rects_));
+  }
+
+  handle_ = config_.rt_engine->BuildAccelCustom(
+      default_stream, ArrayView<OptixAabb>(aabbs), bvh_buffer_,
+      config_.prefer_fast_build, config_.compact);
+
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialIndex %p (Free %zu MB), FinishBuilding Index on %s, Total geoms: %zu",
+      this, rmm::available_device_memory().first / 1024 / 1024,
+      indexing_points_ ? "Points" : "Rectangles", numGeometries());
+}
+
+// Probes the built index with `n_rects` host-side boxes and returns the resulting
+// (build index, probe index) pairs in `build_indices` / `probe_indices`.
+//
+// The query runs twice on one stream taken from the pool: a counting pass whose
+// counter value sizes the result buffers, then a second pass that writes the pairs.
+// When every probe box is degenerate (min == max in each dimension) the probes are
+// handled as points; otherwise a temporary BVH is also built over the probe boxes.
+//
+// On return the output vectors hold exactly the result pairs. They are emptied when
+// there is nothing to probe or when the counting pass reports zero results, so stale
+// contents of caller-provided vectors are never mistaken for results.
+//
+// Parameters:
+//   rects         - host array of n_rects probe boxes
+//   n_rects       - number of probe boxes
+//   build_indices - output, build-side ids of the result pairs (must not be null)
+//   probe_indices - output, probe-side ids of the result pairs (must not be null)
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t* rects, uint32_t n_rects,
+                                            std::vector<uint32_t>* build_indices,
+                                            std::vector<uint32_t>* probe_indices) {
+  // Formulating point and box queries into ray tracing queries:
+  // Reference: "Geng L, Lee R, Zhang X. LibRTS: A Spatial Indexing Library by Ray
+  // Tracing. In Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and
+  // Practice of Parallel Programming 2025"
+  if (n_rects == 0) {
+    build_indices->clear();
+    probe_indices->clear();
+    return;
+  }
+  SpatialIndexContext ctx;
+  auto stream = stream_pool_->get_stream();
+  // The helper methods (handleBuild*, filter, allocateResultBuffer) issue all of their
+  // copies and launches on ctx.stream, so it must be the same stream this function
+  // allocates on and reads results from.
+  ctx.stream = stream;
+  rmm::device_uvector<box_t> d_rects(n_rects, stream);
+  rmm::device_uvector<point_t> d_points{0, stream};
+
+  CUDA_CHECK(cudaMemcpyAsync(d_rects.data(), rects, sizeof(box_t) * n_rects,
+                             cudaMemcpyHostToDevice, stream));
+
+  bool probe_points = thrust::all_of(rmm::exec_policy_nosync(stream), d_rects.begin(),
+                                     d_rects.end(), [] __device__(const box_t& box) {
+                                       bool is_point = true;
+                                       for (int dim = 0; dim < n_dim; dim++) {
+                                         is_point &= box.get_min(dim) == box.get_max(dim);
+                                       }
+                                       return is_point;
+                                     });
+
+  if (probe_points) {
+    // Keep only the min corner of each degenerate box and release the boxes.
+    d_points.resize(d_rects.size(), stream);
+    thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(),
+                      d_points.begin(),
+                      [] __device__(const box_t& box) { return box.get_min(); });
+    d_rects.resize(0, stream);
+    d_rects.shrink_to_fit(stream);
+
+  } else {
+    // Build a BVH over the MBRs of the stream geometries
+#ifdef GPUSPATIAL_PROFILING
+    ctx.timer.start(stream);
+#endif
+    rmm::device_uvector<OptixAabb> aabbs(n_rects, stream);
+    thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(),
+                      aabbs.begin(),
+                      [] __device__(const box_t& mbr) { return mbr.ToOptixAabb(); });
+    ctx.handle = config_.rt_engine->BuildAccelCustom(
+        stream, ArrayView<OptixAabb>(aabbs), ctx.bvh_buffer, config_.prefer_fast_build,
+        config_.compact);
+#ifdef GPUSPATIAL_PROFILING
+    ctx.bvh_build_ms = ctx.timer.stop(stream);
+#endif
+  }
+
+  ctx.counter = std::make_unique<rmm::device_scalar<uint32_t>>(0, stream);
+
+  bool swap_ids = false;
+
+  // Dispatches to the handler matching the (build kind, probe kind) combination.
+  auto query = [&](bool counting) {
+#ifdef GPUSPATIAL_PROFILING
+    ctx.timer.start(stream);
+#endif
+    if (indexing_points_) {
+      if (probe_points) {
+        handleBuildPoint(ctx, ArrayView<point_t>(d_points), counting);
+      } else {
+        handleBuildPoint(ctx, ArrayView<box_t>(d_rects), counting);
+        swap_ids = true;
+      }
+    } else {
+      if (probe_points) {
+        handleBuildBox(ctx, ArrayView<point_t>(d_points), counting);
+      } else {
+        handleBuildBox(ctx, ArrayView<box_t>(d_rects), counting);
+      }
+    }
+#ifdef GPUSPATIAL_PROFILING
+    ctx.rt_ms += ctx.timer.stop(stream);
+#endif
+  };
+
+  // first pass: counting
+  query(true /* counting */);
+
+  auto cap = ctx.counter->value(stream);
+  if (cap == 0) {
+    // No results: make the outputs reflect that instead of keeping old contents.
+    build_indices->clear();
+    probe_indices->clear();
+    return;
+  }
+  allocateResultBuffer(ctx, cap);
+  // second pass: retrieve results
+  query(false /* counting */);
+
+  auto result_size = ctx.build_indices.size(stream);
+  ArrayView<index_t> v_build_indices(ctx.build_indices.data(), result_size);
+  ArrayView<index_t> v_probe_indices(ctx.probe_indices.data(), result_size);
+
+  if (swap_ids) {
+    // IMPORTANT: In this case, the BVH is built on probe side and points are
+    // cast on the build side, so the result pairs are (probe_id, build_id) instead of
+    // (build_id, probe_id). We need to swap the output buffers to correct this.
+    std::swap(v_build_indices, v_probe_indices);
+  }
+
+#ifdef GPUSPATIAL_PROFILING
+  Stopwatch sw;
+  sw.start();
+#endif
+  build_indices->resize(result_size);
+  CUDA_CHECK(cudaMemcpyAsync(build_indices->data(), v_build_indices.data(),
+                             sizeof(index_t) * result_size, cudaMemcpyDeviceToHost,
+                             stream));
+
+  probe_indices->resize(result_size);
+  CUDA_CHECK(cudaMemcpyAsync(probe_indices->data(), v_probe_indices.data(),
+                             sizeof(index_t) * result_size, cudaMemcpyDeviceToHost,
+                             stream));
+  // Both device-to-host copies must have landed before the vectors are handed back.
+  stream.synchronize();
+#ifdef GPUSPATIAL_PROFILING
+  sw.stop();
+  ctx.copy_res_ms = sw.ms();
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialIndex %p (Free %zu MB), Probe %s, Size: %zu, Results: %zu, Alloc: %.2f ms, BVH Build: %.2f ms, RT: %.2f ms, Copy res: %.2f ms",
+      this, rmm::available_device_memory().first / 1024 / 1024,
+      probe_points ? "Points" : "Rectangles",
+      probe_points ? d_points.size() : d_rects.size(), build_indices->size(),
+      ctx.alloc_ms, ctx.bvh_build_ms, ctx.rt_ms, ctx.copy_res_ms);
+#endif
+}
+
+// Point probes against an index built over points.
+//
+// Fills a LaunchParamsPointQuery with the build rects_, the probe `points` and the
+// build-side handle_, uploads it on ctx.stream and launches the point-query shader with
+// min(OPTIX_MAX_RAYS, points.size()) as the launch width.
+//
+// counting == true : launch_params.count points at ctx.counter.
+// counting == false: count is null; hits go into temporary (rect id, point id) buffers
+//                    sized to ctx.build_indices.capacity(), and
+//                    detail::RefineExactPoints then writes the exactly matching pairs
+//                    into ctx.build_indices / ctx.probe_indices.
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::handleBuildPoint(SpatialIndexContext& ctx,
+                                                       ArrayView<point_t> points,
+                                                       bool counting) const {
+  using params_t = detail::LaunchParamsPointQuery<point_t>;
+
+  ctx.shader_id = GetPointQueryShaderId<point_t>();
+  ctx.launch_params_buffer.resize(sizeof(params_t), ctx.stream);
+  ctx.h_launch_params_buffer.resize(sizeof(params_t));
+  // The host-side staging bytes are viewed as the launch-parameter struct.
+  auto* params = reinterpret_cast<params_t*>(ctx.h_launch_params_buffer.data());
+
+  params->rects = ArrayView<box_t>(rects_);
+  params->points = points;
+  params->handle = handle_;
+
+  const uint32_t n_rays = std::min(OPTIX_MAX_RAYS, points.size());
+
+  if (counting) {
+    params->count = ctx.counter->data();
+
+    CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), params,
+                               sizeof(params_t), cudaMemcpyHostToDevice, ctx.stream));
+
+    filter(ctx, n_rays);
+    return;
+  }
+
+  // Result pass: collect candidates first, refine them afterwards.
+  auto capacity = ctx.build_indices.capacity();
+  Queue<index_t> candidate_rect_ids;
+  rmm::device_uvector<index_t> candidate_point_ids(capacity, ctx.stream);
+
+  candidate_rect_ids.Init(ctx.stream, capacity);
+
+  params->count = nullptr;
+  params->rect_ids = candidate_rect_ids.DeviceObject();
+  params->point_ids = ArrayView<index_t>(candidate_point_ids);
+
+  CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), params, sizeof(params_t),
+                             cudaMemcpyHostToDevice, ctx.stream));
+
+  filter(ctx, n_rays);
+
+  detail::RefineExactPoints<point_t, index_t>(
+      ctx.stream, ArrayView<point_t>(points_), points,
+      ArrayView<index_t>(point_ranges_), ArrayView<index_t>(reordered_point_indices_),
+      ArrayView<index_t>(candidate_rect_ids.data(),
+                         candidate_rect_ids.size(ctx.stream)),
+      ArrayView<index_t>(candidate_point_ids), ctx.build_indices,
+      ArrayView<index_t>(ctx.probe_indices));
+}
+
+// Box probes against an index built over points.
+//
+// Here the roles are reversed relative to the other handlers: the launch parameters
+// carry the probe `rects`, the build points_ and the probe-side handle ctx.handle, and
+// the launch width is min(OPTIX_MAX_RAYS, points_.size()). In the result pass the
+// shader's rect_ids target is ctx.build_indices and its point_ids target is
+// ctx.probe_indices (the caller is responsible for swapping them back).
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::handleBuildPoint(SpatialIndexContext& ctx,
+                                                       ArrayView<box_t> rects,
+                                                       bool counting) const {
+  using params_t = detail::LaunchParamsPointQuery<point_t>;
+
+  ctx.shader_id = GetPointQueryShaderId<point_t>();
+  ctx.launch_params_buffer.resize(sizeof(params_t), ctx.stream);
+  ctx.h_launch_params_buffer.resize(sizeof(params_t));
+  // The host-side staging bytes are viewed as the launch-parameter struct.
+  auto* params = reinterpret_cast<params_t*>(ctx.h_launch_params_buffer.data());
+
+  params->rects = rects;
+  params->points = ArrayView<point_t>(points_);
+  params->handle = ctx.handle;
+  if (!counting) {
+    params->count = nullptr;
+    params->rect_ids = ctx.build_indices.DeviceObject();
+    params->point_ids = ArrayView<index_t>(ctx.probe_indices);
+  } else {
+    params->count = ctx.counter->data();
+  }
+
+  CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), params, sizeof(params_t),
+                             cudaMemcpyHostToDevice, ctx.stream));
+
+  const uint32_t n_rays = std::min(OPTIX_MAX_RAYS, points_.size());
+
+  filter(ctx, n_rays);
+}
+
+// Point probes against an index built over boxes.
+//
+// Fills a LaunchParamsPointQuery with the build rects_, the probe `points` and the
+// build-side handle_, uploads it on ctx.stream and launches the point-query shader with
+// min(OPTIX_MAX_RAYS, points.size()) as the launch width. In the counting pass the
+// count target is ctx.counter; in the result pass it is null and the id targets are
+// ctx.build_indices / ctx.probe_indices.
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::handleBuildBox(SpatialIndexContext& ctx,
+                                                     ArrayView<point_t> points,
+                                                     bool counting) const {
+  using params_t = detail::LaunchParamsPointQuery<point_t>;
+
+  ctx.shader_id = GetPointQueryShaderId<point_t>();
+  ctx.launch_params_buffer.resize(sizeof(params_t), ctx.stream);
+  ctx.h_launch_params_buffer.resize(sizeof(params_t));
+  // The host-side staging bytes are viewed as the launch-parameter struct.
+  auto* params = reinterpret_cast<params_t*>(ctx.h_launch_params_buffer.data());
+
+  params->rects = ArrayView<box_t>(rects_);
+  params->points = points;
+  params->handle = handle_;
+  if (!counting) {
+    params->count = nullptr;
+    params->rect_ids = ctx.build_indices.DeviceObject();
+    params->point_ids =
+        ArrayView<index_t>(ctx.probe_indices.data(), ctx.probe_indices.size());
+  } else {
+    params->count = ctx.counter->data();
+  }
+
+  CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), params, sizeof(params_t),
+                             cudaMemcpyHostToDevice, ctx.stream));
+
+  const uint32_t n_rays = std::min(OPTIX_MAX_RAYS, points.size());
+
+  filter(ctx, n_rays);
+}
+
+// Box probes against an index built over boxes, done as two launches:
+//   1. forward  - launch width from the probe boxes, launch params prepared with
+//                 forward = true (build-side BVH);
+//   2. backward - launch width from the build boxes, launch params prepared with
+//                 forward = false (probe-side BVH).
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::handleBuildBox(SpatialIndexContext& ctx,
+                                                     ArrayView<box_t> rects,
+                                                     bool counting) const {
+  // forward cast: cast rays from stream geometries with the BVH of build geometries
+  auto forward_width = std::min(OPTIX_MAX_RAYS, rects.size());
+  prepareLaunchParamsBoxQuery(ctx, rects, true /* forward */, counting);
+  filter(ctx, forward_width);
+
+  // backward cast: cast rays from the build geometries with the BVH of stream geometries
+  auto backward_width = std::min(OPTIX_MAX_RAYS, rects_.size());
+  prepareLaunchParamsBoxQuery(ctx, rects, false /* forward */, counting);
+  filter(ctx, backward_width);
+}
+
+// Prepares the result storage in `ctx` for up to `capacity` pairs: initializes the
+// ctx.build_indices queue with that capacity and resizes ctx.probe_indices to it, both
+// on ctx.stream. With GPUSPATIAL_PROFILING defined, the elapsed time is added to
+// ctx.alloc_ms.
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::allocateResultBuffer(SpatialIndexContext& ctx,
+                                                           uint32_t capacity) const {
+#ifdef GPUSPATIAL_PROFILING
+  ctx.timer.start(ctx.stream);
+#endif
+
+  // The logged memory figure covers two index_t entries (build id + probe id) per pair;
+  // the product is widened to 64 bits before multiplying.
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialIndex %p (Free %zu MB), Allocate result buffer, memory consumption %zu MB, capacity %u",
+      this, rmm::available_device_memory().first / 1024 / 1024,
+      (uint64_t)capacity * 2 * sizeof(index_t) / 1024 / 1024, capacity);
+
+  ctx.build_indices.Init(ctx.stream, capacity);
+  ctx.probe_indices.resize(capacity, ctx.stream);
+#ifdef GPUSPATIAL_PROFILING
+  ctx.alloc_ms += ctx.timer.stop(ctx.stream);
+#endif
+}
+
+// Fills and uploads the launch parameters for one box-vs-box launch.
+//
+// rects1 is always the build set (rects_) and rects2 the probe set. `forward` selects
+// which traversable handle and shader are used: the build-side handle_ with the
+// forward shader, or the probe-side ctx.handle with the backward shader. In the
+// counting pass the count target is ctx.counter; otherwise it is null and the id
+// targets are ctx.build_indices (rect1 ids) and ctx.probe_indices (rect2 ids).
+// The struct is staged in ctx.h_launch_params_buffer and copied to
+// ctx.launch_params_buffer on ctx.stream.
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::prepareLaunchParamsBoxQuery(
+    SpatialIndexContext& ctx, ArrayView<box_t> probe_rects, bool forward,
+    bool counting) const {
+  using params_t = detail::LaunchParamsBoxQuery<point_t>;
+  ctx.launch_params_buffer.resize(sizeof(params_t), ctx.stream);
+  ctx.h_launch_params_buffer.resize(sizeof(params_t));
+  // The host-side staging bytes are viewed as the launch-parameter struct.
+  auto* params = reinterpret_cast<params_t*>(ctx.h_launch_params_buffer.data());
+
+  params->rects1 = ArrayView<box_t>(rects_);
+  params->rects2 = probe_rects;
+
+  if (!forward) {
+    params->handle = ctx.handle;
+    ctx.shader_id = GetBoxQueryBackwardShaderId<point_t>();
+  } else {
+    params->handle = handle_;
+    ctx.shader_id = GetBoxQueryForwardShaderId<point_t>();
+  }
+
+  if (!counting) {
+    params->count = nullptr;
+    params->rect1_ids = ctx.build_indices.DeviceObject();
+    params->rect2_ids = ArrayView<index_t>(ctx.probe_indices);
+  } else {
+    params->count = ctx.counter->data();
+  }
+
+  CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), params, sizeof(params_t),
+                             cudaMemcpyHostToDevice, ctx.stream));
+}
+
+// Launches the shader selected in ctx.shader_id through the ray-tracing engine with a
+// (dim_x, 1, 1) launch size and the device-side launch parameters of `ctx`.
+// Nothing is launched when dim_x is 0. With GPUSPATIAL_PROFILING defined, the elapsed
+// time is added to ctx.rt_ms.
+template <typename SCALAR_T, int N_DIM>
+void RTSpatialIndex<SCALAR_T, N_DIM>::filter(SpatialIndexContext& ctx,
+                                             uint32_t dim_x) const {
+#ifdef GPUSPATIAL_PROFILING
+  ctx.timer.start(ctx.stream);
+#endif
+  if (dim_x > 0) {
+    ArrayView<char> params_view((char*)ctx.launch_params_buffer.data(),
+                                ctx.launch_params_buffer.size());
+    config_.rt_engine->Render(ctx.stream, ctx.shader_id, dim3{dim_x, 1, 1},
+                              params_view);
+  }
+#ifdef GPUSPATIAL_PROFILING
+  ctx.rt_ms += ctx.timer.stop(ctx.stream);
+#endif
+}
+
+// Factory: builds an RTSpatialIndex<SCALAR_T, N_DIM> from `config`, logs its address
+// and the main build options, and returns it through the SpatialIndex interface.
+template <typename SCALAR_T, int N_DIM>
+std::unique_ptr<SpatialIndex<SCALAR_T, N_DIM>> CreateRTSpatialIndex(
+    const RTSpatialIndexConfig& config) {
+  using rt_index_t = RTSpatialIndex<SCALAR_T, N_DIM>;
+  auto rt_index = std::make_unique<rt_index_t>(config);
+  GPUSPATIAL_LOG_INFO(
+      "Create RTSpatialIndex %p, fast_build = %d, compact = %d, concurrency = %d",
+      rt_index.get(), config.prefer_fast_build, config.compact, config.concurrency);
+  // Explicit move: the return converts unique_ptr<derived> to unique_ptr<base>.
+  return std::move(rt_index);
+}
+
+// Explicit instantiations of the factory for the supported scalar types (float,
+// double) and dimensions (2, 3).
+template std::unique_ptr<SpatialIndex<float, 2>> CreateRTSpatialIndex(
+    const RTSpatialIndexConfig& config);
+template std::unique_ptr<SpatialIndex<float, 3>> CreateRTSpatialIndex(
+    const RTSpatialIndexConfig& config);
+template std::unique_ptr<SpatialIndex<double, 2>> CreateRTSpatialIndex(
+    const RTSpatialIndexConfig& config);
+template std::unique_ptr<SpatialIndex<double, 3>> CreateRTSpatialIndex(
+    const RTSpatialIndexConfig& config);
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu
new file mode 100644
index 000000000..22e6748ec
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu
@@ -0,0 +1,442 @@
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "gpuspatial/loader/parallel_wkb_loader.hpp"
+#include "gpuspatial/refine/rt_spatial_refiner.cuh"
+#include "gpuspatial/relate/relate_engine.cuh"
+#include "gpuspatial/utils/logger.hpp"
+
+#include "rt/shaders/shader_id.hpp"
+
+#include "rmm/cuda_stream_pool.hpp"
+#include "rmm/exec_policy.hpp"
+
+#include <thrust/gather.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+
+#include <future>
+#include <locale>
+#include <numeric>
+#include <vector>
+
+#define OPTIX_MAX_RAYS (1lu << 30)
+
+namespace gpuspatial {
+
+namespace detail {
+// For every value in [index_begin, index_end), writes to `reordered_indices` the
+// offset of the lower_bound of that value within `sorted_uniq_indices`.
+// NOTE(review): lower_bound presumes `sorted_uniq_indices` is sorted ascending, and
+// `reordered_indices` must already be large enough for the input range - confirm at
+// the call sites.
+template <typename INDEX_IT>
+void ReorderIndices(rmm::cuda_stream_view stream, INDEX_IT index_begin,
+                    INDEX_IT index_end,
+                    rmm::device_uvector<uint32_t>& sorted_uniq_indices,
+                    rmm::device_uvector<uint32_t>& reordered_indices) {
+  auto uniq_first = sorted_uniq_indices.begin();
+  auto uniq_last = sorted_uniq_indices.end();
+  // Sequential binary search executed by each device thread for its own value.
+  auto rank_of = [=] __device__(uint32_t value) {
+    auto pos = thrust::lower_bound(thrust::seq, uniq_first, uniq_last, value);
+    return thrust::distance(uniq_first, pos);
+  };
+  thrust::transform(rmm::exec_policy_nosync(stream), index_begin, index_end,
+                    reordered_indices.begin(), rank_of);
+}
+
+// Per-slot state of the pipelined refinement: a CUDA stream, a loader of its own, the
+// future of the batch being prepared in the background, the index map of that batch
+// and the device-side index buffers of the batch.
+template <typename LoaderT, typename DeviceGeomT>
+struct PipelineSlot {
+  // Stream all work of this slot is issued on.
+  rmm::cuda_stream_view stream;
+  // Loader owned by this slot, initialized once in the constructor.
+  std::unique_ptr<LoaderT> loader;
+  // Result of the asynchronous batch preparation.
+  std::future<DeviceGeomT> prep_future;
+
+  RTSpatialRefiner::IndicesMap indices_map;
+
+  // These will be moved out after every batch
+  rmm::device_uvector<uint32_t> d_batch_build_indices;
+  rmm::device_uvector<uint32_t> d_batch_probe_indices;
+
+  // Binds the slot to stream `s`, creates the loader on thread pool `tp` and
+  // initializes it with `config`; both batch buffers start empty on `s`.
+  PipelineSlot(rmm::cuda_stream_view s, const std::shared_ptr<ThreadPool>& tp,
+               typename LoaderT::Config config)
+      : stream(s),
+        loader(std::make_unique<LoaderT>(tp)),
+        d_batch_build_indices(0, s),
+        d_batch_probe_indices(0, s) {
+    loader->Init(config);
+  }
+};
+}  // namespace detail
+
+// Sets up the refiner from `config`: a parsing thread pool (config_.parsing_threads),
+// a CUDA stream pool (config_.concurrency), the device stack-size limit
+// (config_.stack_size_bytes) and the build-side WKB loader, which is given the whole
+// config_.wkb_parser_memory_quota.
+RTSpatialRefiner::RTSpatialRefiner(const RTSpatialRefinerConfig& config)
+    : config_(config) {
+  thread_pool_ = std::make_shared<ThreadPool>(config_.parsing_threads);
+  stream_pool_ = std::make_unique<rmm::cuda_stream_pool>(config_.concurrency);
+  CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, config_.stack_size_bytes));
+
+  ParallelWkbLoader<point_t, index_t>::Config build_loader_config;
+  build_loader_config.memory_quota = config_.wkb_parser_memory_quota;
+
+  wkb_loader_ = std::make_unique<loader_t>(thread_pool_);
+  wkb_loader_->Init(build_loader_config);
+}
+
+// Clears the build-side WKB loader and the loaded build geometries, both on the
+// default stream.
+void RTSpatialRefiner::Clear() {
+  auto stream = rmm::cuda_stream_default;
+  wkb_loader_->Clear(stream);
+  build_geometries_.Clear(stream);
+}
+
+// Feeds every row of `build_array` (offsets 0 .. build_array->length) to the
+// build-side WKB loader on the default stream.
+void RTSpatialRefiner::PushBuild(const ArrowArrayView* build_array) {
+  auto stream = rmm::cuda_stream_default;
+
+  wkb_loader_->Parse(stream, build_array, 0, build_array->length);
+}
+
+// Finishes the build-side WKB loader on the default stream and keeps its result as
+// build_geometries_.
+void RTSpatialRefiner::FinishBuilding() {
+  auto stream = rmm::cuda_stream_default;
+  build_geometries_ = std::move(wkb_loader_->Finish(stream));
+}
+
+// Refines `len` candidate pairs against `predicate`.
+//
+// `build_indices` and `probe_indices` are host arrays of `len` entries that are read as
+// input and overwritten with the surviving pairs; the return value is the number of
+// surviving pairs. When config_.pipeline_batches > 1 the work is delegated to
+// RefinePipelined(); otherwise everything runs on a single stream from the pool, which
+// is synchronized before returning.
+uint32_t RTSpatialRefiner::Refine(const ArrowArrayView* probe_array, Predicate predicate,
+                                  uint32_t* build_indices, uint32_t* probe_indices,
+                                  uint32_t len) {
+  if (len == 0) {
+    return 0;
+  }
+
+  if (config_.pipeline_batches > 1) {
+    return RefinePipelined(probe_array, predicate, build_indices, probe_indices, len);
+  }
+
+  SpatialRefinerContext ctx;
+  ctx.cuda_stream = stream_pool_->get_stream();
+
+  IndicesMap probe_indices_map;
+  rmm::device_uvector<uint32_t> d_probe_indices(len, ctx.cuda_stream);
+
+  CUDA_CHECK(cudaMemcpyAsync(d_probe_indices.data(), probe_indices,
+                             sizeof(uint32_t) * len, cudaMemcpyHostToDevice,
+                             ctx.cuda_stream));
+
+  // Build the index map of the probe ids; its h_uniq_indices drive the parsing below
+  // and its d_reordered_indices are what the relate engine evaluates.
+  buildIndicesMap(ctx.cuda_stream, d_probe_indices.begin(), d_probe_indices.end(),
+                  probe_indices_map);
+
+  // A loader local to this call; the parser quota is divided by the configured
+  // concurrency.
+  loader_t loader(thread_pool_);
+  loader_t::Config loader_config;
+  loader_config.memory_quota = config_.wkb_parser_memory_quota / config_.concurrency;
+
+  loader.Init(loader_config);
+  // Only the probe rows listed in h_uniq_indices are parsed.
+  loader.Parse(ctx.cuda_stream, probe_array, probe_indices_map.h_uniq_indices.begin(),
+               probe_indices_map.h_uniq_indices.end());
+  auto probe_geoms = std::move(loader.Finish(ctx.cuda_stream));
+
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialRefiner %p (Free %zu MB), Loaded Geometries, ProbeArray %ld, Loaded %u, Type %s",
+      this, rmm::available_device_memory().first / 1024 / 1024, probe_array->length,
+      probe_geoms.num_features(),
+      GeometryTypeToString(probe_geoms.get_geometry_type()).c_str());
+
+  RelateEngine<point_t, index_t> relate_engine(&build_geometries_,
+                                               config_.rt_engine.get());
+  RelateEngine<point_t, index_t>::Config re_config;
+
+  re_config.memory_quota = config_.relate_engine_memory_quota / config_.concurrency;
+  re_config.bvh_fast_build = config_.prefer_fast_build;
+  re_config.bvh_compact = config_.compact;
+
+  relate_engine.set_config(re_config);
+
+  rmm::device_uvector<uint32_t> d_build_indices(len, ctx.cuda_stream);
+  CUDA_CHECK(cudaMemcpyAsync(d_build_indices.data(), build_indices,
+                             sizeof(uint32_t) * len, cudaMemcpyHostToDevice,
+                             ctx.cuda_stream));
+
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialRefiner %p (Free %zu MB), Evaluating %u Geometry Pairs with Predicate %s",
+      this, rmm::available_device_memory().first / 1024 / 1024, len,
+      PredicateToString(predicate));
+
+#ifdef GPUSPATIAL_PROFILING
+  ctx.timer.start(ctx.cuda_stream);
+#endif
+
+  // NOTE(review): the size of d_build_indices is read back right after Evaluate, so
+  // Evaluate presumably shrinks the index buffers to the pairs that satisfy the
+  // predicate - confirm against RelateEngine::Evaluate.
+  relate_engine.Evaluate(ctx.cuda_stream, probe_geoms, predicate, d_build_indices,
+                         probe_indices_map.d_reordered_indices);
+  auto new_size = d_build_indices.size();
+#ifdef GPUSPATIAL_PROFILING
+  float refine_ms = ctx.timer.stop(ctx.cuda_stream);
+  GPUSPATIAL_LOG_INFO("RTSpatialRefiner %p (Free %zu MB), Refine time %f, new size %zu",
+                      this, rmm::available_device_memory().first / 1024 / 1024, refine_ms,
+                      new_size);
+#endif
+
+  d_probe_indices.resize(new_size, ctx.cuda_stream);
+
+  // Translate the reordered indices back through d_uniq_indices into d_probe_indices.
+  thrust::gather(rmm::exec_policy_nosync(ctx.cuda_stream),
+                 probe_indices_map.d_reordered_indices.begin(),
+                 probe_indices_map.d_reordered_indices.end(),
+                 probe_indices_map.d_uniq_indices.begin(), d_probe_indices.begin());
+
+  // Optionally order the result pairs by probe index (build indices follow along).
+  if (config_.sort_probe_indices) {
+    thrust::sort_by_key(rmm::exec_policy_nosync(ctx.cuda_stream), d_probe_indices.begin(),
+                        d_probe_indices.end(), d_build_indices.begin());
+  }
+
+  CUDA_CHECK(cudaMemcpyAsync(build_indices, d_build_indices.data(),
+                             sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost,
+                             ctx.cuda_stream));
+
+  CUDA_CHECK(cudaMemcpyAsync(probe_indices, d_probe_indices.data(),
+                             sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost,
+                             ctx.cuda_stream));
+  // The host arrays are only valid once both copies have completed.
+  ctx.cuda_stream.synchronize();
+  return new_size;
+}
+
+uint32_t RTSpatialRefiner::RefinePipelined(const ArrowArrayView* probe_array,
+                                           Predicate predicate, uint32_t* build_indices,
+                                           uint32_t* probe_indices, uint32_t len) {
+  if (len == 0) return 0;
+  auto main_stream = stream_pool_->get_stream();
+
+  rmm::device_uvector<uint32_t> d_build_indices(len, main_stream);
+  rmm::device_uvector<uint32_t> d_probe_indices(len, main_stream);
+
+  CUDA_CHECK(cudaMemcpyAsync(d_build_indices.data(), build_indices,
+                             sizeof(uint32_t) * len, cudaMemcpyHostToDevice,
+                             main_stream));
+  CUDA_CHECK(cudaMemcpyAsync(d_probe_indices.data(), probe_indices,
+                             sizeof(uint32_t) * len, cudaMemcpyHostToDevice,
+                             main_stream));
+
+  thrust::sort_by_key(rmm::exec_policy_nosync(main_stream), d_probe_indices.begin(),
+                      d_probe_indices.end(), d_build_indices.begin());
+
+  rmm::device_uvector<uint32_t> d_final_build_indices(len, main_stream);
+  rmm::device_uvector<uint32_t> d_final_probe_indices(len, main_stream);
+
+  uint32_t tail_offset = 0;
+
+  // Capture device ID for thread safety
+  int device_id;
+  CUDA_CHECK(cudaGetDevice(&device_id));
+
+  // Pipeline Config
+  const int NUM_SLOTS = 2;
+  int n_batches = config_.pipeline_batches;
+  size_t batch_size = (len + n_batches - 1) / n_batches;
+
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialRefiner %p, pipeline refinement, total len %u, batches %d, batch size %zu",
+      this, len, n_batches, batch_size);
+
+  // Resource allocation for slots
+  using loader_t = ParallelWkbLoader<point_t, index_t>;
+  loader_t::Config loader_config;
+  loader_config.memory_quota =
+      config_.wkb_parser_memory_quota / config_.concurrency / NUM_SLOTS;
+
+  rmm::cuda_stream_pool local_pool(NUM_SLOTS);
+  std::vector<std::unique_ptr<detail::PipelineSlot<loader_t, dev_geometries_t>>> slots;
+
+  for (int i = 0; i < NUM_SLOTS; ++i) {
+    slots.push_back(std::make_unique<detail::PipelineSlot<loader_t, dev_geometries_t>>(
+        local_pool.get_stream(), thread_pool_, loader_config));
+  }
+
+  // Engine Setup (Shared across slots)
+  RelateEngine<point_t, index_t> relate_engine(&build_geometries_,
+                                               config_.rt_engine.get());
+  RelateEngine<point_t, index_t>::Config re_config;
+  re_config.memory_quota =
+      config_.relate_engine_memory_quota / config_.concurrency / NUM_SLOTS;
+  re_config.bvh_fast_build = config_.prefer_fast_build;
+  re_config.bvh_compact = config_.compact;
+  relate_engine.set_config(re_config);
+
+  // --- BACKGROUND TASK (CPU Phase) ---
+  // This lambda handles: buildIndicesMap + WKB Parsing
+  auto prepare_batch_task = [&](detail::PipelineSlot<loader_t, dev_geometries_t>* slot,
+                                size_t offset, size_t count) {
+    // 1. Critical: Set context for this thread
+    CUDA_CHECK(cudaSetDevice(device_id));
+
+    // 2. Wait for GPU to finish previous work on this slot
+    slot->stream.synchronize();
+
+    // 3. Prepare Indices (CPU + H2D)
+    const uint32_t* batch_probe_ptr = d_probe_indices.data() + offset;
+    buildIndicesMap(slot->stream, batch_probe_ptr, batch_probe_ptr + count,
+                    slot->indices_map);
+
+    // 4. Parse WKB (CPU Heavy)
+    slot->loader->Clear(slot->stream);
+    slot->loader->Parse(slot->stream, probe_array,
+                        slot->indices_map.h_uniq_indices.begin(),
+                        slot->indices_map.h_uniq_indices.end());
+
+    // Return future geometries (H2D copy happens on Finish)
+    return slot->loader->Finish(slot->stream);
+  };
+
+  // --- PIPELINE PRIMING ---
+  // Start processing Batch 0 immediately in background
+  size_t first_batch_len = std::min(batch_size, (size_t)len);
+  slots[0]->prep_future = std::async(std::launch::async, prepare_batch_task,
+                                     slots[0].get(), 0, first_batch_len);
+
+  main_stream.synchronize();  // Ensure allocation is done before main loop
+
+  // --- MAIN PIPELINE LOOP ---
+  for (size_t offset = 0; offset < len; offset += batch_size) {
+    int curr_idx = (offset / batch_size) % NUM_SLOTS;
+    int next_idx = (curr_idx + 1) % NUM_SLOTS;
+    auto& curr_slot = slots[curr_idx];
+    auto& next_slot = slots[next_idx];
+    size_t current_batch_len = std::min(batch_size, len - offset);
+
+    // 1. WAIT & RETRIEVE: Get Geometries from Background Task
+    // This will block only if CPU work for this batch is slower than GPU work for
+    // previous batch
+    dev_geometries_t probe_geoms;
+    if (curr_slot->prep_future.valid()) {
+      probe_geoms = std::move(curr_slot->prep_future.get());
+    }
+
+    // 2. KICKOFF NEXT: Start CPU work for Batch (N+1)
+    size_t next_offset = offset + batch_size;
+    if (next_offset < len) {
+      size_t next_len = std::min(batch_size, len - next_offset);
+      next_slot->prep_future = std::async(std::launch::async, prepare_batch_task,
+                                          next_slot.get(), next_offset, next_len);
+    }
+
+    // 3. GPU EXECUTION PHASE
+    const uint32_t* batch_build_ptr = d_build_indices.data() + offset;
+
+    // Copy build indices for this batch
+    curr_slot->d_batch_build_indices.resize(current_batch_len, curr_slot->stream);
+    CUDA_CHECK(cudaMemcpyAsync(curr_slot->d_batch_build_indices.data(), batch_build_ptr,
+                               sizeof(uint32_t) * current_batch_len,
+                               cudaMemcpyHostToDevice, curr_slot->stream));
+
+    // Relate/Refine
+    // Note: Evaluate filters d_batch_build_indices in-place
+    relate_engine.Evaluate(curr_slot->stream, probe_geoms, predicate,
+                           curr_slot->d_batch_build_indices,
+                           curr_slot->indices_map.d_reordered_indices);
+
+    // 4. GATHER & APPEND RESULTS
+    // We need the size to know how much to gather
+    size_t new_size = curr_slot->d_batch_build_indices.size();
+
+    if (new_size > 0) {
+      // Gather original probe indices
+      curr_slot->d_batch_probe_indices.resize(new_size, curr_slot->stream);
+      thrust::gather(rmm::exec_policy_nosync(curr_slot->stream),
+                     curr_slot->indices_map.d_reordered_indices.begin(),
+                     curr_slot->indices_map.d_reordered_indices.end(),
+                     curr_slot->indices_map.d_uniq_indices.begin(),
+                     curr_slot->d_batch_probe_indices.begin());
+
+      // Append to Final Buffers (Device-to-Device Copy)
+      CUDA_CHECK(cudaMemcpyAsync(d_final_build_indices.data() + tail_offset,
+                                 curr_slot->d_batch_build_indices.data(),
+                                 sizeof(uint32_t) * new_size, cudaMemcpyDeviceToDevice,
+                                 curr_slot->stream));
+
+      CUDA_CHECK(cudaMemcpyAsync(d_final_probe_indices.data() + tail_offset,
+                                 curr_slot->d_batch_probe_indices.data(),
+                                 sizeof(uint32_t) * new_size, cudaMemcpyDeviceToDevice,
+                                 curr_slot->stream));
+
+      tail_offset += new_size;
+    }
+  }
+
+  // --- FINALIZATION ---
+
+  // Wait for all streams to finish writing to final buffers
+  for (auto& slot : slots) {
+    slot->stream.synchronize();
+  }
+
+  // Shrink probe vector to actual size for sorting
+  d_final_probe_indices.resize(tail_offset, main_stream);
+  d_final_build_indices.resize(tail_offset, main_stream);
+
+  if (config_.sort_probe_indices) {
+    thrust::sort_by_key(rmm::exec_policy_nosync(main_stream),
+                        d_final_probe_indices.begin(),
+                        d_final_probe_indices.end(),  // Sort only valid range
+                        d_final_build_indices.begin());
+  }
+
+  // Final Copy to Host
+  CUDA_CHECK(cudaMemcpyAsync(build_indices, d_final_build_indices.data(),
+                             sizeof(uint32_t) * tail_offset, cudaMemcpyDeviceToHost,
+                             main_stream));
+
+  CUDA_CHECK(cudaMemcpyAsync(probe_indices, d_final_probe_indices.data(),
+                             sizeof(uint32_t) * tail_offset, cudaMemcpyDeviceToHost,
+                             main_stream));
+
+  main_stream.synchronize();
+  return tail_offset;
+}
+
+template <typename INDEX_IT>
+void RTSpatialRefiner::buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT index_begin,
+                                       INDEX_IT index_end,
+                                       IndicesMap& indices_map) const {
+  auto len = thrust::distance(index_begin, index_end);
+  auto& d_uniq_indices = indices_map.d_uniq_indices;
+  auto& h_uniq_indices = indices_map.h_uniq_indices;
+  // Populate indices_map from [index_begin, index_end); GPU work is queued on `stream`.
+  d_uniq_indices.resize(len, stream);
+  CUDA_CHECK(cudaMemcpyAsync(d_uniq_indices.data(), index_begin, sizeof(uint32_t) * len,
+                             cudaMemcpyDeviceToDevice, stream));
+  // Sort, then drop duplicates; uniq_end marks the end of the distinct values.
+  thrust::sort(rmm::exec_policy_nosync(stream), d_uniq_indices.begin(),
+               d_uniq_indices.end());
+  auto uniq_end = thrust::unique(rmm::exec_policy_nosync(stream), d_uniq_indices.begin(),
+                                 d_uniq_indices.end());
+  auto uniq_size = thrust::distance(d_uniq_indices.begin(), uniq_end);
+  // Trim the device vector to the distinct count and size the host vector to match.
+  d_uniq_indices.resize(uniq_size, stream);
+  h_uniq_indices.resize(uniq_size);
+  // Async D2H copy; no sync here. NOTE(review): confirm host readers wait on `stream`.
+  CUDA_CHECK(cudaMemcpyAsync(h_uniq_indices.data(), d_uniq_indices.data(),
+                             sizeof(uint32_t) * uniq_size, cudaMemcpyDeviceToHost,
+                             stream));
+  // Presumably ReorderIndices maps each input to its slot in d_uniq_indices - confirm.
+  auto& d_reordered_indices = indices_map.d_reordered_indices;
+  // One entry per input element (len, not uniq_size).
+  d_reordered_indices.resize(len, stream);
+  detail::ReorderIndices(stream, index_begin, index_end, d_uniq_indices,
+                         d_reordered_indices);
+}
+
+std::unique_ptr<SpatialRefiner> CreateRTSpatialRefiner(
+    const RTSpatialRefinerConfig& config) {
+  auto refiner = std::make_unique<RTSpatialRefiner>(config);
+  GPUSPATIAL_LOG_INFO(
+      "Create RTSpatialRefiner %p, fast_build = %d, compact = %d, "
+      "parsing_threads = %u, concurrency = %u, pipeline_batches = %u, "
+      "wkb_parser_memory_quota = %.2f, relate_engine_memory_quota = %.2f",
+      refiner.get(), config.prefer_fast_build, config.compact, config.parsing_threads,
+      config.concurrency, config.pipeline_batches, config.wkb_parser_memory_quota,
+      config.relate_engine_memory_quota);
+  return refiner;  // local is moved implicitly into unique_ptr<SpatialRefiner>
+}
+
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/spatial_joiner.cu b/c/sedona-libgpuspatial/libgpuspatial/src/spatial_joiner.cu
deleted file mode 100644
index 03aafaa27..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/src/spatial_joiner.cu
+++ /dev/null
@@ -1,483 +0,0 @@
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#include "gpuspatial/index/detail/launch_parameters.h"
-#include "gpuspatial/index/relate_engine.cuh"
-#include "gpuspatial/index/spatial_joiner.cuh"
-#include "gpuspatial/loader/parallel_wkb_loader.h"
-#include "gpuspatial/utils/logger.hpp"
-#include "gpuspatial/utils/stopwatch.h"
-
-#include "rt/shaders/shader_id.hpp"
-
-#include "rmm/exec_policy.hpp"
-
-#define OPTIX_MAX_RAYS (1lu << 30)
-namespace gpuspatial {
-
-namespace detail {
-
-template <int N_DIM>
-static rmm::device_uvector<OptixAabb> ComputeAABBs(
-    rmm::cuda_stream_view stream, const ArrayView<Box<Point<float, N_DIM>>>& mbrs) {
-  rmm::device_uvector<OptixAabb> aabbs(mbrs.size(), stream);
-
-  thrust::transform(rmm::exec_policy_nosync(stream), mbrs.begin(), mbrs.end(),
-                    aabbs.begin(), [] __device__(const Box<Point<float, N_DIM>>& mbr) {
-                      OptixAabb aabb{0, 0, 0, 0, 0, 0};
-                      auto min_corner = mbr.get_min();
-                      auto max_corner = mbr.get_max();
-                      for (int dim = 0; dim < N_DIM; dim++) {
-                        (&aabb.minX)[dim] = min_corner[dim];
-                        (&aabb.maxX)[dim] = max_corner[dim];
-                      }
-                      return aabb;
-                    });
-  return std::move(aabbs);
-}
-
-}  // namespace detail
-
-void SpatialJoiner::Init(const Config* config) {
-  config_ = *dynamic_cast<const SpatialJoinerConfig*>(config);
-  GPUSPATIAL_LOG_INFO("SpatialJoiner %p (Free %zu MB), Initialize, Concurrency %u", this,
-                      rmm::available_device_memory().first / 1024 / 1024,
-                      config_.concurrency);
-  details::RTConfig rt_config = details::get_default_rt_config(config_.ptx_root);
-  rt_engine_.Init(rt_config);
-
-  loader_t::Config loader_config;
-
-  thread_pool_ = std::make_shared<ThreadPool>(config_.parsing_threads);
-  build_loader_ = std::make_unique<loader_t>(thread_pool_);
-  build_loader_->Init(loader_config);
-  stream_pool_ = std::make_unique<rmm::cuda_stream_pool>(config_.concurrency);
-  ctx_pool_ = ObjectPool<SpatialJoinerContext>::create(config_.concurrency);
-  CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, config_.stack_size_bytes));
-  Clear();
-}
-
-void SpatialJoiner::Clear() {
-  GPUSPATIAL_LOG_INFO("SpatialJoiner %p (Free %zu MB), Clear", this,
-                      rmm::available_device_memory().first / 1024 / 1024);
-  bvh_buffer_ = nullptr;
-  geometry_grouper_.Clear();
-  auto stream = rmm::cuda_stream_default;
-  build_loader_->Clear(stream);
-  build_geometries_.Clear(stream);
-  stream.synchronize();
-}
-
-void SpatialJoiner::PushBuild(const ArrowSchema* schema, const ArrowArray* array,
-                              int64_t offset, int64_t length) {
-  GPUSPATIAL_LOG_INFO("SpatialJoiner %p (Free %zu MB), PushBuild, offset %ld, length %ld",
-                      this, rmm::available_device_memory().first / 1024 / 1024, offset,
-                      length);
-  build_loader_->Parse(rmm::cuda_stream_default, array, offset, length);
-}
-
-void SpatialJoiner::FinishBuilding() {
-  auto stream = rmm::cuda_stream_default;
-
-  build_geometries_ = std::move(build_loader_->Finish(stream));
-
-  GPUSPATIAL_LOG_INFO(
-      "SpatialJoiner %p (Free %zu MB), FinishBuilding, n_features: %ld, type %s", this,
-      rmm::available_device_memory().first / 1024 / 1024,
-      build_geometries_.num_features(),
-      GeometryTypeToString(build_geometries_.get_geometry_type()));
-
-  if (build_geometries_.get_geometry_type() == GeometryType::kPoint) {
-    geometry_grouper_.Group(stream, build_geometries_, config_.n_points_per_aabb);
-    handle_ = buildBVH(stream, geometry_grouper_.get_aabbs(), bvh_buffer_);
-  } else {
-    auto aabbs = detail::ComputeAABBs(stream, build_geometries_.get_mbrs());
-    handle_ = buildBVH(stream, ArrayView<OptixAabb>(aabbs), bvh_buffer_);
-  }
-
-  relate_engine_ = RelateEngine(&build_geometries_, &rt_engine_);
-  RelateEngine<point_t, index_t>::Config re_config;
-
-  re_config.memory_quota = config_.relate_engine_memory_quota;
-  re_config.bvh_fast_build = config_.prefer_fast_build;
-  re_config.bvh_fast_compact = config_.compact;
-
-  relate_engine_.set_config(re_config);
-}
-
-void SpatialJoiner::PushStream(Context* base_ctx, const ArrowSchema* schema,
-                               const ArrowArray* array, int64_t offset, int64_t length,
-                               Predicate predicate, std::vector<uint32_t>* build_indices,
-                               std::vector<uint32_t>* stream_indices,
-                               int32_t array_index_offset) {
-  auto* ctx = (SpatialJoinerContext*)base_ctx;
-  ctx->cuda_stream = stream_pool_->get_stream();
-
-#ifdef GPUSPATIAL_PROFILING
-  Stopwatch sw;
-  sw.start();
-#endif
-  ctx->array_index_offset = array_index_offset;
-
-  if (ctx->stream_loader == nullptr) {
-    ctx->stream_loader = std::make_unique<loader_t>(thread_pool_);
-    loader_t::Config loader_config;
-
-    ctx->stream_loader->Init(loader_config);
-  }
-  ctx->stream_loader->Parse(ctx->cuda_stream, array, offset, length);
-  ctx->stream_geometries = std::move(ctx->stream_loader->Finish(ctx->cuda_stream));
-
-  auto build_type = build_geometries_.get_geometry_type();
-  auto stream_type = ctx->stream_geometries.get_geometry_type();
-
-  GPUSPATIAL_LOG_INFO(
-      "SpatialJoiner %p, PushStream, build features %zu, type %s, stream features %zu, type %s",
-      this, build_geometries_.num_features(),
-      GeometryTypeToString(build_geometries_.get_geometry_type()),
-      ctx->stream_geometries.num_features(),
-      GeometryTypeToString(ctx->stream_geometries.get_geometry_type()));
-
-#ifdef GPUSPATIAL_PROFILING
-  sw.stop();
-  ctx->parse_ms += sw.ms();
-#endif
-
-  if (build_type == GeometryType::kPoint) {
-    if (stream_type == GeometryType::kPoint) {
-      handleBuildPointStreamPoint(ctx, predicate, build_indices, stream_indices);
-    } else {
-      handleBuildPointStreamBox(ctx, predicate, build_indices, stream_indices);
-    }
-  } else {
-    if (stream_type == GeometryType::kPoint) {
-      handleBuildBoxStreamPoint(ctx, predicate, build_indices, stream_indices);
-    } else {
-      handleBuildBoxStreamBox(ctx, predicate, build_indices, stream_indices);
-    }
-  }
-#ifdef GPUSPATIAL_PROFILING
-  printf("parse %lf, alloc %lf, filter %lf, refine %lf, copy_res %lf ms\n", ctx->parse_ms,
-         ctx->alloc_ms, ctx->filter_ms, ctx->refine_ms, ctx->copy_res_ms);
-#endif
-}
-
-void SpatialJoiner::handleBuildPointStreamPoint(SpatialJoinerContext* ctx,
-                                                Predicate predicate,
-                                                std::vector<uint32_t>* build_indices,
-                                                std::vector<uint32_t>* stream_indices) {
-  allocateResultBuffer(ctx);
-
-  ctx->shader_id = GetPointQueryShaderId<point_t>();
-  assert(ctx->stream_geometries.get_geometry_type() == GeometryType::kPoint);
-
-  using launch_params_t = detail::LaunchParamsPointQuery<point_t>;
-  ctx->launch_params_buffer =
-      std::make_unique<rmm::device_buffer>(sizeof(launch_params_t), ctx->cuda_stream);
-  ctx->h_launch_params_buffer.resize(sizeof(launch_params_t));
-  auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data();
-
-  launch_params.grouped = true;
-  launch_params.prefix_sum = geometry_grouper_.get_prefix_sum();
-  launch_params.reordered_indices = geometry_grouper_.get_reordered_indices();
-  launch_params.mbrs1 = ArrayView<box_t>();  // no MBRs for point
-  launch_params.points2 = ctx->stream_geometries.get_points();
-  launch_params.handle = handle_;
-  launch_params.ids = ctx->results.DeviceObject();
-  CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params,
-                             sizeof(launch_params_t), cudaMemcpyHostToDevice,
-                             ctx->cuda_stream));
-
-  uint32_t dim_x = std::min(OPTIX_MAX_RAYS, ctx->stream_geometries.num_features());
-
-  filter(ctx, dim_x);
-  refine(ctx, predicate, build_indices, stream_indices);
-}
-
-void SpatialJoiner::handleBuildBoxStreamPoint(SpatialJoinerContext* ctx,
-                                              Predicate predicate,
-                                              std::vector<uint32_t>* build_indices,
-                                              std::vector<uint32_t>* stream_indices) {
-  allocateResultBuffer(ctx);
-
-  ctx->shader_id = GetPointQueryShaderId<point_t>();
-  assert(ctx->stream_geometries.get_geometry_type() == GeometryType::kPoint);
-
-  using launch_params_t = detail::LaunchParamsPointQuery<point_t>;
-  ctx->launch_params_buffer =
-      std::make_unique<rmm::device_buffer>(sizeof(launch_params_t), ctx->cuda_stream);
-  ctx->h_launch_params_buffer.resize(sizeof(launch_params_t));
-  auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data();
-
-  launch_params.grouped = false;
-  launch_params.mbrs1 = build_geometries_.get_mbrs();
-  launch_params.points2 = ctx->stream_geometries.get_points();
-  launch_params.handle = handle_;
-  launch_params.ids = ctx->results.DeviceObject();
-  CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params,
-                             sizeof(launch_params_t), cudaMemcpyHostToDevice,
-                             ctx->cuda_stream));
-
-  uint32_t dim_x = std::min(OPTIX_MAX_RAYS, ctx->stream_geometries.num_features());
-
-  filter(ctx, dim_x);
-  refine(ctx, predicate, build_indices, stream_indices);
-}
-
-void SpatialJoiner::handleBuildPointStreamBox(SpatialJoinerContext* ctx,
-                                              Predicate predicate,
-                                              std::vector<uint32_t>* build_indices,
-                                              std::vector<uint32_t>* stream_indices) {
-  allocateResultBuffer(ctx);
-
-  ctx->shader_id = GetPointQueryShaderId<point_t>();
-  assert(build_geometries_.get_geometry_type() == GeometryType::kPoint);
-
-  using launch_params_t = detail::LaunchParamsPointQuery<point_t>;
-  ctx->launch_params_buffer =
-      std::make_unique<rmm::device_buffer>(sizeof(launch_params_t), ctx->cuda_stream);
-  ctx->h_launch_params_buffer.resize(sizeof(launch_params_t));
-  auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data();
-
-  auto aabbs = detail::ComputeAABBs(ctx->cuda_stream, ctx->stream_geometries.get_mbrs());
-  auto handle = buildBVH(ctx->cuda_stream, ArrayView<OptixAabb>(aabbs), ctx->bvh_buffer);
-
-  // mbrs1 are from stream; points2 are from build
-  launch_params.grouped = false;
-  launch_params.mbrs1 = ctx->stream_geometries.get_mbrs();
-  launch_params.points2 = build_geometries_.get_points();
-  launch_params.handle = handle;
-  launch_params.ids = ctx->results.DeviceObject();
-  CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params,
-                             sizeof(launch_params_t), cudaMemcpyHostToDevice,
-                             ctx->cuda_stream));
-
-  uint32_t dim_x = std::min(OPTIX_MAX_RAYS, build_geometries_.num_features());
-  // IMPORTANT: In this case, the BVH is built from stream geometries and points2 are
-  // build geometries, so the result pairs are (stream_id, build_id) instead of (build_id,
-  // stream_id). We need to swap the output buffers to correct this.
-  filter(ctx, dim_x, true);
-  refine(ctx, predicate, build_indices, stream_indices);
-}
-
-void SpatialJoiner::handleBuildBoxStreamBox(SpatialJoinerContext* ctx,
-                                            Predicate predicate,
-                                            std::vector<uint32_t>* build_indices,
-                                            std::vector<uint32_t>* stream_indices) {
-  allocateResultBuffer(ctx);
-
-  // forward cast: cast rays from stream geometries with the BVH of build geometries
-  {
-    auto dim_x = std::min(OPTIX_MAX_RAYS, ctx->stream_geometries.num_features());
-
-    prepareLaunchParamsBoxQuery(ctx, true);
-    filter(ctx, dim_x);
-    refine(ctx, predicate, build_indices, stream_indices);
-    ctx->results.Clear(ctx->cuda_stream);  // results have been copied, reuse space
-  }
-  // need allocate again as the previous results buffer has been shrinked to fit
-  allocateResultBuffer(ctx);
-  // backward cast: cast rays from the build geometries with the BVH of stream geometries
-  {
-    auto dim_x = std::min(OPTIX_MAX_RAYS, build_geometries_.num_features());
-    auto v_mbrs = ctx->stream_geometries.get_mbrs();
-    rmm::device_uvector<OptixAabb> aabbs(v_mbrs.size(), ctx->cuda_stream);
-
-    thrust::transform(rmm::exec_policy_nosync(ctx->cuda_stream), v_mbrs.begin(),
-                      v_mbrs.end(), aabbs.begin(),
-                      [] __device__(const box_t& mbr) { return mbr.ToOptixAabb(); });
-
-    // Build a BVH over the MBRs of the stream geometries
-    ctx->handle =
-        buildBVH(ctx->cuda_stream, ArrayView<OptixAabb>(aabbs.data(), aabbs.size()),
-                 ctx->bvh_buffer);
-    prepareLaunchParamsBoxQuery(ctx, false);
-    filter(ctx, dim_x);
-    refine(ctx, predicate, build_indices, stream_indices);
-  }
-}
-
-OptixTraversableHandle SpatialJoiner::buildBVH(
-    const rmm::cuda_stream_view& stream, const ArrayView<OptixAabb>& aabbs,
-    std::unique_ptr<rmm::device_buffer>& buffer) {
-  auto buffer_size_bytes = rt_engine_.EstimateMemoryUsageForAABB(
-      aabbs.size(), config_.prefer_fast_build, config_.compact);
-
-  if (buffer == nullptr || buffer->size() < buffer_size_bytes) {
-    buffer = std::make_unique<rmm::device_buffer>(buffer_size_bytes, stream);
-  }
-
-  return rt_engine_.BuildAccelCustom(stream, aabbs, *buffer, config_.prefer_fast_build,
-                                     config_.compact);
-}
-
-void SpatialJoiner::allocateResultBuffer(SpatialJoinerContext* ctx) {
-#ifdef GPUSPATIAL_PROFILING
-  ctx->timer.start(ctx->cuda_stream);
-#endif
-  int64_t avail_bytes = rmm::available_device_memory().first;
-  auto stream_type = ctx->stream_geometries.get_geometry_type();
-  if (stream_type != GeometryType::kPoint) {
-    // need to reserve space for the BVH of stream
-    auto n_aabbs = ctx->stream_geometries.get_mbrs().size();
-
-    avail_bytes -= rt_engine_.EstimateMemoryUsageForAABB(
-        n_aabbs, config_.prefer_fast_build, config_.compact);
-  }
-
-  if (avail_bytes <= 0) {
-    throw std::runtime_error(
-        "Not enough memory to allocate result space for spatial index");
-  }
-
-  uint64_t reserve_bytes = ceil(avail_bytes * config_.result_buffer_memory_reserve_ratio);
-  reserve_bytes = reserve_bytes / config_.concurrency + 1;
-  // two index_t for each result pair (build index, stream index) and another index_t for
-  // the temp storage
-  uint32_t n_items = reserve_bytes / (2 * sizeof(index_t) + sizeof(index_t));
-
-  GPUSPATIAL_LOG_INFO(
-      "SpatialJoiner %p, Allocate result buffer quota %zu MB, queue size %u", this,
-      reserve_bytes / 1024 / 1024, n_items);
-
-  ctx->results.Init(ctx->cuda_stream, n_items);
-  ctx->results.Clear(ctx->cuda_stream);
-#ifdef GPUSPATIAL_PROFILING
-  ctx->alloc_ms += ctx->timer.stop(ctx->cuda_stream);
-#endif
-}
-
-void SpatialJoiner::prepareLaunchParamsBoxQuery(SpatialJoinerContext* ctx, bool foward) {
-  using launch_params_t = detail::LaunchParamsBoxQuery<point_t>;
-  ctx->launch_params_buffer =
-      std::make_unique<rmm::device_buffer>(sizeof(launch_params_t), ctx->cuda_stream);
-  ctx->h_launch_params_buffer.resize(sizeof(launch_params_t));
-  auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data();
-
-  assert(ctx->stream_geometries.get_geometry_type() != GeometryType::kPoint);
-
-  launch_params.mbrs1 = build_geometries_.get_mbrs();
-  launch_params.mbrs2 = ctx->stream_geometries.get_mbrs();
-  if (foward) {
-    launch_params.handle = handle_;
-    ctx->shader_id = GetBoxQueryForwardShaderId<point_t>();
-  } else {
-    launch_params.handle = ctx->handle;
-    ctx->shader_id = GetBoxQueryBackwardShaderId<point_t>();
-  }
-
-  launch_params.ids = ctx->results.DeviceObject();
-  CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params,
-                             sizeof(launch_params_t), cudaMemcpyHostToDevice,
-                             ctx->cuda_stream));
-}
-
-void SpatialJoiner::filter(SpatialJoinerContext* ctx, uint32_t dim_x, bool swap_id) {
-#ifdef GPUSPATIAL_PROFILING
-  ctx->timer.start(ctx->cuda_stream);
-#endif
-  Stopwatch sw;
-  sw.start();
-  if (dim_x > 0) {
-    rt_engine_.Render(ctx->cuda_stream, ctx->shader_id, dim3{dim_x, 1, 1},
-                      ArrayView<char>((char*)ctx->launch_params_buffer->data(),
-                                      ctx->launch_params_buffer->size()));
-  }
-  auto result_size = ctx->results.size(ctx->cuda_stream);
-  sw.stop();
-  GPUSPATIAL_LOG_INFO(
-      "SpatialJoiner %p, Filter stage, Launched %u rays, Found %u candidates, time %lf ms",
-      this, dim_x, result_size, sw.ms());
-  if (swap_id && result_size > 0) {
-    // swap the pair (build_id, stream_id) to (stream_id, build_id)
-    thrust::for_each(rmm::exec_policy_nosync(ctx->cuda_stream), ctx->results.data(),
-                     ctx->results.data() + result_size,
-                     [] __device__(thrust::pair<uint32_t, uint32_t> & pair) {
-                       thrust::swap(pair.first, pair.second);
-                     });
-  }
-  ctx->results.shrink_to_fit(ctx->cuda_stream);
-
-#ifdef GPUSPATIAL_PROFILING
-  ctx->filter_ms += ctx->timer.stop(ctx->cuda_stream);
-#endif
-}
-
-void SpatialJoiner::refine(SpatialJoinerContext* ctx, Predicate predicate,
-                           std::vector<uint32_t>* build_indices,
-                           std::vector<uint32_t>* stream_indices) {
-#ifdef GPUSPATIAL_PROFILING
-  ctx->timer.start(ctx->cuda_stream);
-#endif
-  relate_engine_.Evaluate(ctx->cuda_stream, ctx->stream_geometries, predicate,
-                          ctx->results);
-#ifdef GPUSPATIAL_PROFILING
-  ctx->refine_ms += ctx->timer.stop(ctx->cuda_stream);
-#endif
-  auto n_results = ctx->results.size(ctx->cuda_stream);
-
-#ifdef GPUSPATIAL_PROFILING
-  ctx->timer.start(ctx->cuda_stream);
-#endif
-  rmm::device_uvector<uint32_t> tmp_result_buffer(n_results, ctx->cuda_stream);
-
-  thrust::transform(
-      rmm::exec_policy_nosync(ctx->cuda_stream), ctx->results.data(),
-      ctx->results.data() + n_results, tmp_result_buffer.begin(),
-      [] __device__(const thrust::pair<index_t, index_t>& pair) -> uint32_t {
-        return pair.first;
-      });
-  auto prev_size = build_indices->size();
-  build_indices->resize(build_indices->size() + n_results);
-
-  CUDA_CHECK(cudaMemcpyAsync(build_indices->data() + prev_size, tmp_result_buffer.data(),
-                             sizeof(uint32_t) * n_results, cudaMemcpyDeviceToHost,
-                             ctx->cuda_stream));
-
-  auto array_index_offset = ctx->array_index_offset;
-
-  thrust::transform(
-      rmm::exec_policy_nosync(ctx->cuda_stream), ctx->results.data(),
-      ctx->results.data() + n_results, tmp_result_buffer.begin(),
-      [=] __device__(const thrust::pair<index_t, index_t>& pair) -> uint32_t {
-        return pair.second + array_index_offset;
-      });
-
-  stream_indices->resize(stream_indices->size() + n_results);
-
-  CUDA_CHECK(cudaMemcpyAsync(stream_indices->data() + prev_size, tmp_result_buffer.data(),
-                             sizeof(uint32_t) * n_results, cudaMemcpyDeviceToHost,
-                             ctx->cuda_stream));
-#ifdef GPUSPATIAL_PROFILING
-  ctx->copy_res_ms += ctx->timer.stop(ctx->cuda_stream);
-#endif
-  ctx->cuda_stream.synchronize();
-}
-
-std::unique_ptr<StreamingJoiner> CreateSpatialJoiner() {
-  return std::make_unique<SpatialJoiner>();
-}
-
-void InitSpatialJoiner(StreamingJoiner* index, const char* ptx_root,
-                       uint32_t concurrency) {
-  SpatialJoiner::SpatialJoinerConfig config;
-  config.ptx_root = ptx_root;
-  config.concurrency = concurrency;
-  index->Init(&config);
-}
-
-}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt b/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt
index 719d0909f..2f129cfde 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt
@@ -14,10 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-if(GPUSPATIAL_BUILD_TESTS)
-  add_library(geoarrow_geos geoarrow_geos/geoarrow_geos.c)
-  target_link_libraries(geoarrow_geos PUBLIC GEOS::geos_c geoarrow)
-endif()
 
 if(GPUSPATIAL_BUILD_TESTS)
   enable_testing()
@@ -33,9 +29,8 @@ if(GPUSPATIAL_BUILD_TESTS)
                                 GTest::gmock_main
                                 gpuspatial
                                 GEOS::geos
-                                GEOS::geos_c
-                                geoarrow_geos
-                                nanoarrow::nanoarrow_ipc)
+                                Arrow::arrow_static
+                                Parquet::parquet_static)
 
   add_executable(loader_test array_stream.cc main.cc loader_test.cu)
   target_link_libraries(loader_test
@@ -44,32 +39,37 @@ if(GPUSPATIAL_BUILD_TESTS)
                                 GTest::gmock_main
                                 gpuspatial
                                 GEOS::geos
-                                GEOS::geos_c
                                 Arrow::arrow_static
-                                Parquet::parquet_static
-                                nanoarrow::nanoarrow_ipc)
+                                Parquet::parquet_static)
   target_include_directories(loader_test PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
   target_compile_options(loader_test
                          PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda
                                  --expt-relaxed-constexpr>)
 
-  add_executable(joiner_test main.cc array_stream.cc joiner_test.cu)
-  target_link_libraries(joiner_test
+  add_executable(index_test main.cc index_test.cu)
+  target_link_libraries(index_test
+                        PRIVATE cuda
+                                GTest::gtest_main
+                                GTest::gmock_main
+                                gpuspatial
+                                GEOS::geos)
+  target_compile_options(index_test
+                         PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda
+                                 --expt-relaxed-constexpr>)
+  add_executable(refiner_test main.cc array_stream.cc refiner_test.cu)
+  target_link_libraries(refiner_test
                         PRIVATE cuda
                                 GTest::gtest_main
                                 GTest::gmock_main
                                 gpuspatial
                                 GEOS::geos
-                                GEOS::geos_c
-                                geoarrow_geos
                                 Arrow::arrow_static
-                                Parquet::parquet_static
-                                nanoarrow::nanoarrow_ipc)
-  target_compile_options(joiner_test
+                                Parquet::parquet_static)
+  target_compile_options(refiner_test
                          PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda
                                  --expt-relaxed-constexpr>)
 
-  add_executable(relate_test main.cc array_stream.cc related_test.cu)
+  add_executable(relate_test main.cc array_stream.cc relate_test.cu)
   target_link_libraries(relate_test
                         PRIVATE cuda
                                 GTest::gtest_main
@@ -77,20 +77,29 @@ if(GPUSPATIAL_BUILD_TESTS)
                                 gpuspatial
                                 GEOS::geos
                                 nanoarrow::nanoarrow
-                                nanoarrow::nanoarrow_ipc)
+                                Arrow::arrow_static
+                                Parquet::parquet_static)
   target_compile_options(relate_test
                          PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda
                                  --expt-relaxed-constexpr>)
 
   add_executable(c_wrapper_test main.cc c_wrapper_test.cc array_stream.cc)
-  target_link_libraries(c_wrapper_test PRIVATE GTest::gtest_main GTest::gmock_main
-                                               gpuspatial_c nanoarrow::nanoarrow_ipc)
+  target_link_libraries(c_wrapper_test
+                        PRIVATE GTest::gtest_main
+                                GTest::gmock_main
+                                gpuspatial_c
+                                GEOS::geos
+                                Arrow::arrow_static
+                                Parquet::parquet_static)
 
   include(GoogleTest)
 
   gtest_discover_tests(gpuspatial_testing_test)
   gtest_discover_tests(array_stream_test)
   gtest_discover_tests(loader_test)
-  gtest_discover_tests(joiner_test)
+  # index_test and refiner_test take over from joiner_test, so register both with CTest
+  gtest_discover_tests(index_test)
+  gtest_discover_tests(refiner_test)
   gtest_discover_tests(relate_test)
+  gtest_discover_tests(c_wrapper_test)
 endif()
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/array_stream.cc b/c/sedona-libgpuspatial/libgpuspatial/test/array_stream.cc
index 3f47b00ab..659679dd4 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/array_stream.cc
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/array_stream.cc
@@ -18,9 +18,12 @@
 #include <vector>
 
 #include "array_stream.hpp"
+#include "test_common.hpp"
 
 #include "nanoarrow/nanoarrow.hpp"
-#include "nanoarrow/nanoarrow_ipc.hpp"
+
+#include "arrow/api.h"
+#include "parquet/arrow/reader.h"
 
 namespace gpuspatial {
 
@@ -106,20 +109,64 @@ class ColumnArrayStream {
   }
 };
 
-void ArrayStreamFromIpc(const std::string& filename, std::string geometry_column,
-                        struct ArrowArrayStream* out) {
-  FILE* file = fopen(filename.c_str(), "rb");
-  if (file == nullptr) {
-    throw std::runtime_error("Failed to open " + filename);
+// Reads one Parquet file and appends `column_name` of each batch to `out_arrays`.
+arrow::Status ReadParquetFromFile(
+    arrow::fs::FileSystem* fs,     // 1. Filesystem pointer (e.g., LocalFileSystem)
+    const std::string& file_path,  // 2. Single file path instead of a folder
+    int64_t batch_size, const char* column_name,
+    std::vector<std::shared_ptr<arrow::Array>>& out_arrays) {
+  // 1. Get FileInfo for the single path
+  ARROW_ASSIGN_OR_RAISE(auto file_info, fs->GetFileInfo(file_path));
+
+  // Check if the path points to a file
+  if (file_info.type() != arrow::fs::FileType::File) {
+    return arrow::Status::Invalid("Path is not a file: ", file_path);
   }
 
-  nanoarrow::ipc::UniqueInputStream input_stream;
-  NANOARROW_THROW_NOT_OK(ArrowIpcInputStreamInitFile(input_stream.get(), file, true));
+  // 2. Open the input file
+  ARROW_ASSIGN_OR_RAISE(auto input_file, fs->OpenInputFile(file_info));
+
+  // 3. Open the Parquet file and create an Arrow reader
+  ARROW_ASSIGN_OR_RAISE(auto arrow_reader, parquet::arrow::OpenFile(
+                                               input_file, arrow::default_memory_pool()));
+
+  // 4. Set the batch size
+  arrow_reader->set_batch_size(batch_size);
+
+  // 5. Get the RecordBatchReader; a failure is returned as a Status, not an abort
+  ARROW_ASSIGN_OR_RAISE(auto rb_reader, arrow_reader->GetRecordBatchReader());
+  // 6. Read all record batches and extract the column
+  while (true) {
+    std::shared_ptr<arrow::RecordBatch> batch;
+
+    // Read the next batch, propagating a read error to the caller as a Status
+    ARROW_RETURN_NOT_OK(rb_reader->ReadNext(&batch));
+
+    // Check for end of stream
+    if (!batch) {
+      break;
+    }
 
-  nanoarrow::UniqueArrayStream inner;
-  NANOARROW_THROW_NOT_OK(
-      ArrowIpcArrayStreamReaderInit(inner.get(), input_stream.get(), nullptr));
-  ColumnArrayStream(std::move(inner), std::move(geometry_column)).ToArrayStream(out);
+    // Extract the specified column and add to the output vector
+    std::shared_ptr<arrow::Array> column_array = batch->GetColumnByName(column_name);
+    if (!column_array) {
+      return arrow::Status::Invalid("Column not found: ", column_name);
+    }
+    out_arrays.push_back(column_array);
+  }
+
+  return arrow::Status::OK();
 }
 
+std::vector<std::shared_ptr<arrow::Array>> ReadParquet(const std::string& path,
+                                                       int batch_size) {
+  using namespace TestUtils;
+  // Only the column named "geometry" is read, through a local filesystem object.
+  auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
+  // A non-OK Status from ReadParquetFromFile is handed to ARROW_THROW_NOT_OK.
+  std::vector<std::shared_ptr<arrow::Array>> build_arrays;
+  ARROW_THROW_NOT_OK(
+      ReadParquetFromFile(fs.get(), path, batch_size, "geometry", build_arrays));
+  return build_arrays;
+}
 }  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/array_stream.hpp b/c/sedona-libgpuspatial/libgpuspatial/test/array_stream.hpp
index 677d758c6..36d881847 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/array_stream.hpp
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/array_stream.hpp
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 
+#include "arrow/filesystem/filesystem.h"
 #include "geoarrow/geoarrow.hpp"
 #include "gpuspatial_testing.hpp"
 #include "nanoarrow/nanoarrow.hpp"
@@ -25,8 +26,6 @@ namespace gpuspatial {
 
 void ArrayStreamFromWKT(const std::vector<std::vector<std::string>>& batches,
                         enum GeoArrowType type, struct ArrowArrayStream* out);
-
-void ArrayStreamFromIpc(const std::string& filename, std::string geometry_column,
-                        struct ArrowArrayStream* out);
-
+std::vector<std::shared_ptr<arrow::Array>> ReadParquet(const std::string& path,
+                                                       int batch_size = 100);
 }  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/array_stream_test.cc b/c/sedona-libgpuspatial/libgpuspatial/test/array_stream_test.cc
index b5229cd39..b4f0c5511 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/array_stream_test.cc
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/array_stream_test.cc
@@ -60,41 +60,4 @@ TEST(ArrayStream, StreamFromWkt) {
   EXPECT_EQ(bounder.Bounds().ymax(), 9);
 }
 
-TEST(ArrayStream, StreamFromIpc) {
-  nanoarrow::UniqueArrayStream stream;
-  ArrayStreamFromIpc(TestUtils::GetTestDataPath("arrowipc/test_points.arrows"),
-                     "geometry", stream.get());
-
-  struct ArrowError error{};
-  nanoarrow::UniqueSchema schema;
-  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
-      << error.message;
-  EXPECT_STREQ(schema->name, "geometry");
-
-  nanoarrow::UniqueArray array;
-  int64_t n_batches = 0;
-  int64_t n_rows = 0;
-  testing::WKBBounder bounder;
-  while (true) {
-    ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
-        << error.message;
-    if (array->release == nullptr) {
-      break;
-    }
-
-    n_batches += 1;
-    n_rows += array->length;
-    bounder.Read(array.get());
-    array.reset();
-  }
-
-  ASSERT_EQ(n_batches, 100);
-  ASSERT_EQ(n_rows, 100000);
-
-  EXPECT_NEAR(bounder.Bounds().xmin(), -100, 0.01);
-  EXPECT_NEAR(bounder.Bounds().ymin(), -100, 0.01);
-  EXPECT_NEAR(bounder.Bounds().xmax(), 100, 0.01);
-  EXPECT_NEAR(bounder.Bounds().ymax(), 100, 0.01);
-}
-
 }  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc b/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc
index 60c247399..3de7fadcc 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc
@@ -15,92 +15,305 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "array_stream.hpp"
 #include "test_common.hpp"
 
 #include "gpuspatial/gpuspatial_c.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+
+#include "nanoarrow/nanoarrow.hpp"
+
 #include <random>
 #include <vector>
-#include "array_stream.hpp"
-#include "nanoarrow/nanoarrow.hpp"
 
-namespace TestUtils {
-std::string GetTestDataPath(const std::string& relative_path_to_file);
+TEST(RuntimeTest, InitializeRuntime) {
+  GpuSpatialRuntime runtime;
+  GpuSpatialRuntimeCreate(&runtime);
+  // Value-initialize: cuda_memory_pool_init_precent is not assigned in this test.
+  GpuSpatialRuntimeConfig config{};
+  std::string ptx_root = TestUtils::GetTestShaderPath();
+  config.ptx_root = ptx_root.c_str();
+  config.device_id = 0;
+  config.use_cuda_memory_pool = false;
+  ASSERT_EQ(runtime.init(&runtime, &config), 0);
+
+  runtime.release(&runtime);
+}
+
+TEST(RuntimeTest, ErrorTest) {
+  GpuSpatialRuntime runtime;
+  GpuSpatialRuntimeCreate(&runtime);
+  // Value-initialize: cuda_memory_pool_init_precent is not assigned in this test.
+  GpuSpatialRuntimeConfig runtime_config{};
+  runtime_config.ptx_root = "/invalid/path/to/ptx";
+  runtime_config.device_id = 0;
+  runtime_config.use_cuda_memory_pool = false;
+
+  EXPECT_NE(runtime.init(&runtime, &runtime_config), 0);
+  const char* raw_error = runtime.get_last_error(&runtime);
+  // A null message would be undefined behavior in printf("%s") and std::string.
+  if (raw_error == nullptr) raw_error = "";
+  printf("Error received: %s\n", raw_error);
+  std::string error_msg(raw_error);
+
+  EXPECT_NE(error_msg.find("No such file or directory"), std::string::npos)
+      << "Error message was corrupted or incorrect. Got: " << error_msg;
+
+  runtime.release(&runtime);
+}
+
+TEST(SpatialIndexTest, InitializeIndex) {
+  // Creates a float 2D index on a runtime that uses the CUDA memory pool.
+  const std::string shader_dir = TestUtils::GetTestShaderPath();
+
+  GpuSpatialRuntime rt;
+  GpuSpatialRuntimeCreate(&rt);
+  GpuSpatialRuntimeConfig rt_cfg;
+  rt_cfg.ptx_root = shader_dir.c_str();
+  rt_cfg.device_id = 0;
+  rt_cfg.use_cuda_memory_pool = true;
+  rt_cfg.cuda_memory_pool_init_precent = 10;
+  ASSERT_EQ(rt.init(&rt, &rt_cfg), 0);
+
+  // Point the index at the runtime above with concurrency 1.
+  SedonaFloatIndex2D float_index;
+  GpuSpatialIndexConfig idx_cfg;
+  idx_cfg.runtime = &rt;
+  idx_cfg.concurrency = 1;
+  ASSERT_EQ(GpuSpatialIndexFloat2DCreate(&float_index, &idx_cfg), 0);
+
+  float_index.release(&float_index);
+  rt.release(&rt);
+}
+
+TEST(RefinerTest, InitializeRefiner) {
+  GpuSpatialRuntime runtime;
+  GpuSpatialRuntimeCreate(&runtime);
+  GpuSpatialRuntimeConfig runtime_config;
+  std::string ptx_root = TestUtils::GetTestShaderPath();
+  runtime_config.ptx_root = ptx_root.c_str();
+  runtime_config.device_id = 0;
+  runtime_config.use_cuda_memory_pool = true;
+  runtime_config.cuda_memory_pool_init_precent = 10;
+  ASSERT_EQ(runtime.init(&runtime, &runtime_config), 0);
+
+  SedonaSpatialRefiner refiner;
+  GpuSpatialRefinerConfig refiner_config;
+  refiner_config.runtime = &runtime;
+  refiner_config.concurrency = 1;
+  // Same values the CWrapperTest fixture uses; unset they would be indeterminate.
+  refiner_config.compress_bvh = false;
+  refiner_config.pipeline_batches = 1;
+  ASSERT_EQ(GpuSpatialRefinerCreate(&refiner, &refiner_config), 0);
+
+  refiner.release(&refiner);
+  runtime.release(&runtime);
 }
 
 class CWrapperTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    // Initialize the GpuSpatialJoiner
-    GpuSpatialJoinerCreate(&joiner_);
-    struct GpuSpatialJoinerConfig config_;
-    std::string ptx_root = TestUtils::GetTestDataPath("shaders_ptx");
+    std::string ptx_root = TestUtils::GetTestShaderPath();
+    // A fatal ASSERT below still runs TearDown, which skips objects never created.
+    GpuSpatialRuntimeCreate(&runtime_);
+    GpuSpatialRuntimeConfig runtime_config;
+
+    runtime_config.ptx_root = ptx_root.c_str();
+    runtime_config.device_id = 0;
+    runtime_config.use_cuda_memory_pool = true;
+    runtime_config.cuda_memory_pool_init_precent = 10;
+    ASSERT_EQ(runtime_.init(&runtime_, &runtime_config), 0);
 
-    // Set up the configuration
-    config_.concurrency = 2;  // Example concurrency level
-    config_.ptx_root = ptx_root.c_str();
+    GpuSpatialIndexConfig index_config;
 
-    ASSERT_EQ(joiner_.init(&joiner_, &config_), 0);
-    // Initialize the context
+    index_config.runtime = &runtime_;
+    index_config.concurrency = 1;
+
+    ASSERT_EQ(GpuSpatialIndexFloat2DCreate(&index_, &index_config), 0);
+
+    GpuSpatialRefinerConfig refiner_config;
+
+    refiner_config.runtime = &runtime_;
+    refiner_config.concurrency = 1;
+    refiner_config.compress_bvh = false;
+    refiner_config.pipeline_batches = 1;
+
+    ASSERT_EQ(GpuSpatialRefinerCreate(&refiner_, &refiner_config), 0);
   }
 
   void TearDown() override {
-    // Clean up
-    joiner_.release(&joiner_);
+    if (refiner_.release != nullptr) refiner_.release(&refiner_);
+    if (index_.release != nullptr) index_.release(&index_);
+    if (runtime_.release != nullptr) runtime_.release(&runtime_);
   }
-
-  struct GpuSpatialJoiner joiner_;
+  GpuSpatialRuntime runtime_{};    // zeroed so a null release means "not created"
+  SedonaFloatIndex2D index_{};     // released before runtime_
+  SedonaSpatialRefiner refiner_{};  // released first
 };
 
 TEST_F(CWrapperTest, InitializeJoiner) {
-  // Test if the joiner initializes correctly
-  struct GpuSpatialJoinerContext context_;
-  joiner_.create_context(&joiner_, &context_);
+  // Define types matching the CWrapper expectation (float for GPU)
+  using coord_t = float;
+  using fpoint_t = gpuspatial::Point<coord_t, 2>;
+  using box_t = gpuspatial::Box<fpoint_t>;
+
+  // 1. Load Data using ReadParquet
+  auto poly_path = TestUtils::GetTestDataPath("synthetic_pip/polygons.parquet");
+  auto point_path = TestUtils::GetTestDataPath("synthetic_pip/points.parquet");
+
+  // ReadParquet loads the file into batches (std::vector<std::shared_ptr<arrow::Array>>)
+  // using its default batch_size, which array_stream.hpp declares as 100
+  auto poly_arrays = gpuspatial::ReadParquet(poly_path);
+  auto point_arrays = gpuspatial::ReadParquet(point_path);
+
+  // 2. Setup GEOS C++ Objects
+  auto factory = geos::geom::GeometryFactory::create();
+  geos::io::WKBReader wkb_reader(*factory);
+
+  // Iterate over batches (replacing the stream loop)
+  size_t num_batches = std::min(poly_arrays.size(), point_arrays.size());
+
+  for (size_t i = 0; i < num_batches; i++) {
+    auto build_arrow_arr = poly_arrays[i];
+    auto probe_arrow_arr = point_arrays[i];
+
+    // Export to C-style ArrowArrays for the CWrapper (SUT)
+    nanoarrow::UniqueArray build_array, probe_array;
+    nanoarrow::UniqueSchema build_schema, probe_schema;
+
+    ARROW_THROW_NOT_OK(
+        arrow::ExportArray(*build_arrow_arr, build_array.get(), build_schema.get()));
+    ARROW_THROW_NOT_OK(
+        arrow::ExportArray(*probe_arrow_arr, probe_array.get(), probe_schema.get()));
+
+    // --- Build Phase ---
+
+    // Boxes for the GPU Index (SUT)
+    std::vector<box_t> rects;
+
+    // View for parsing WKB
+    nanoarrow::UniqueArrayView build_view;
+    ArrowError error;
+
+    ASSERT_EQ(ArrowArrayViewInitFromSchema(build_view.get(), build_schema.get(), &error),
+              NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayViewSetArray(build_view.get(), build_array.get(), &error),
+              NANOARROW_OK)
+        << error.message;
 
-  auto poly_path = TestUtils::GetTestDataPath("arrowipc/test_polygons.arrows");
-  auto point_path = TestUtils::GetTestDataPath("arrowipc/test_points.arrows");
-  nanoarrow::UniqueArrayStream poly_stream, point_stream;
+    for (int64_t j = 0; j < build_array->length; j++) {
+      // Parse WKB through the byte view, the same accessor the probe loop uses
+      // (no reinterpret_cast from the char-typed string view is needed)
+      ArrowBufferView wkb = ArrowArrayViewGetBytesUnsafe(build_view.get(), j);
+      auto geom = wkb_reader.read(wkb.data.as_uint8, wkb.size_bytes);
 
-  gpuspatial::ArrayStreamFromIpc(poly_path, "geometry", poly_stream.get());
-  gpuspatial::ArrayStreamFromIpc(point_path, "geometry", point_stream.get());
+      // Extract Envelope for GPU
+      const auto* env = geom->getEnvelopeInternal();
+      double xmin = 0, ymin = 0, xmax = -1, ymax = -1;
+      if (!env->isNull()) {
+        xmin = env->getMinX();
+        ymin = env->getMinY();
+        xmax = env->getMaxX();
+        ymax = env->getMaxY();
+      }
 
-  nanoarrow::UniqueSchema build_schema, stream_schema;
-  nanoarrow::UniqueArray build_array, stream_array;
-  ArrowError error;
-  ArrowErrorSet(&error, "");
+      // Add to GPU Build Data
+      rects.emplace_back(fpoint_t((float)xmin, (float)ymin),
+                         fpoint_t((float)xmax, (float)ymax));
+    }
 
-  int n_row_groups = 100;
+    // Initialize SUT (CWrapper Index)
+    index_.clear(&index_);
+    ASSERT_EQ(index_.push_build(&index_, (float*)rects.data(), rects.size()), 0);
+    ASSERT_EQ(index_.finish_building(&index_), 0);
 
-  for (int i = 0; i < n_row_groups; i++) {
-    ASSERT_EQ(ArrowArrayStreamGetNext(poly_stream.get(), build_array.get(), &error),
-              NANOARROW_OK);
-    ASSERT_EQ(ArrowArrayStreamGetSchema(poly_stream.get(), build_schema.get(), &error),
-              NANOARROW_OK);
+    // --- Probe Phase ---
+    std::vector<box_t> queries;
 
-    ASSERT_EQ(ArrowArrayStreamGetNext(point_stream.get(), stream_array.get(), &error),
-              NANOARROW_OK);
-    ASSERT_EQ(ArrowArrayStreamGetSchema(point_stream.get(), stream_schema.get(), &error),
-              NANOARROW_OK);
+    nanoarrow::UniqueArrayView probe_view;
 
-    joiner_.push_build(&joiner_, build_schema.get(), build_array.get(), 0,
-                       build_array->length);
-    joiner_.finish_building(&joiner_);
+    ASSERT_EQ(ArrowArrayViewInitFromSchema(probe_view.get(), probe_schema.get(), &error),
+              NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayViewSetArray(probe_view.get(), probe_array.get(), &error),
+              NANOARROW_OK)
+        << error.message;
 
-    joiner_.push_stream(&joiner_, &context_, stream_schema.get(), stream_array.get(), 0,
-                        stream_array->length, GpuSpatialPredicateContains, 0);
+    refiner_.init_schema(&refiner_, build_schema.get(), probe_schema.get());
 
-    void* build_indices_ptr;
-    void* stream_indices_ptr;
+    for (int64_t j = 0; j < probe_array->length; j++) {
+      ArrowBufferView wkb = ArrowArrayViewGetBytesUnsafe(probe_view.get(), j);
+      auto geom = wkb_reader.read(wkb.data.as_uint8, wkb.size_bytes);
+
+      const auto* env = geom->getEnvelopeInternal();
+      double xmin = 0, ymin = 0, xmax = -1, ymax = -1;
+      if (!env->isNull()) {
+        xmin = env->getMinX();
+        ymin = env->getMinY();
+        xmax = env->getMaxX();
+        ymax = env->getMaxY();
+      }
+
+      queries.emplace_back(fpoint_t((float)xmin, (float)ymin),
+                           fpoint_t((float)xmax, (float)ymax));
+    }
+
+    // Run SUT Probe
+    SedonaSpatialIndexContext idx_ctx;
+    index_.create_context(&idx_ctx);
+    index_.probe(&index_, &idx_ctx, (float*)queries.data(), queries.size());
+
+    // Retrieve SUT Results
+    uint32_t* build_indices_ptr;
+    uint32_t* probe_indices_ptr;
     uint32_t build_indices_length;
-    uint32_t stream_indices_length;
+    uint32_t probe_indices_length;
 
-    joiner_.get_build_indices_buffer(&context_, (void**)&build_indices_ptr,
-                                     &build_indices_length);
-    joiner_.get_stream_indices_buffer(&context_, (void**)&stream_indices_ptr,
-                                      &stream_indices_length);
-  }
+    index_.get_build_indices_buffer(&idx_ctx, &build_indices_ptr, &build_indices_length);
+    index_.get_probe_indices_buffer(&idx_ctx, &probe_indices_ptr, &probe_indices_length);
+
+    refiner_.clear(&refiner_);
+    ASSERT_EQ(refiner_.push_build(&refiner_, build_array.get()), 0);
+    ASSERT_EQ(refiner_.finish_building(&refiner_), 0);
+
+    uint32_t new_len;
+    ASSERT_EQ(refiner_.refine(
+                  &refiner_, probe_array.get(),
+                  SedonaSpatialRelationPredicate::SedonaSpatialPredicateContains,
+                  build_indices_ptr, probe_indices_ptr, build_indices_length, &new_len),
+              0);
+
+    std::vector<uint32_t> sut_build_indices(build_indices_ptr,
+                                            build_indices_ptr + new_len);
+    std::vector<uint32_t> sut_stream_indices(probe_indices_ptr,
+                                             probe_indices_ptr + new_len);
 
-  joiner_.destroy_context(&context_);
+    index_.destroy_context(&idx_ctx);
+
+    std::vector<uint32_t> ref_build_indices;
+    std::vector<uint32_t> ref_stream_indices;
+
+    // Wrap single arrays in vectors to match signature
+    std::vector<ArrowArray*> build_inputs = {build_array.get()};
+    std::vector<ArrowArray*> probe_inputs = {probe_array.get()};
+
+    TestUtils::ComputeGeosJoin(build_schema.get(), build_inputs, probe_schema.get(),
+                               probe_inputs, gpuspatial::Predicate::kContains,
+                               ref_build_indices, ref_stream_indices);
+
+    // Compare Results
+    ASSERT_EQ(ref_build_indices.size(), sut_build_indices.size());
+    ASSERT_EQ(ref_stream_indices.size(), sut_stream_indices.size());
+
+    TestUtils::sort_vectors_by_index(ref_build_indices, ref_stream_indices);
+    TestUtils::sort_vectors_by_index(sut_build_indices, sut_stream_indices);
+
+    for (size_t j = 0; j < sut_build_indices.size(); j++) {
+      ASSERT_EQ(ref_build_indices[j], sut_build_indices[j]);
+      ASSERT_EQ(ref_stream_indices[j], sut_stream_indices[j]);
+    }
+  }
 }
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_points.arrows b/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_points.arrows
deleted file mode 100644
index 58013b617..000000000
Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_points.arrows and /dev/null differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_polygons.arrows b/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_polygons.arrows
deleted file mode 100644
index 99dbd7230..000000000
Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_polygons.arrows and /dev/null differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_polygons1.arrows b/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_polygons1.arrows
deleted file mode 100644
index 1c020de4a..000000000
Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_polygons1.arrows and /dev/null differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_polygons2.arrows b/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_polygons2.arrows
deleted file mode 100644
index 96ab8e27e..000000000
Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/arrowipc/test_polygons2.arrows and /dev/null differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile
deleted file mode 100644
index 5b04c384b..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile
+++ /dev/null
@@ -1,55 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# Variables
-URL := https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/natural-earth/files/natural-earth_cities_geo.parquet
-INPUT_FILE := natural-earth_cities_geo.parquet
-PYTHON_SCRIPT := ../gen_points.py
-OUTPUT_POINTS := generated_points.parquet
-NUM_POINTS := 1000
-
-.PHONY: all clean generate
-
-# The default target runs both download and point generation
-all: $(OUTPUT_POINTS)
-
-# --- Download Target ---
-
-# Target to download the GeoParquet file
-$(INPUT_FILE):
-	@echo "--- Downloading $(INPUT_FILE) ---"
-	# Use curl to download the file. The -L flag handles redirects.
-	curl -L $(URL) -o $(INPUT_FILE)
-	@echo "--- Download complete ---"
-
-# --- Generation Target ---
-
-# Target to generate points, which depends on the input file being present
-$(OUTPUT_POINTS): $(INPUT_FILE)
-	@echo "--- Generating $(NUM_POINTS) random points from $(INPUT_FILE) ---"
-	python $(PYTHON_SCRIPT) $(INPUT_FILE) $(NUM_POINTS) $(OUTPUT_POINTS)
-	@echo "--- Point generation complete. Output: $(OUTPUT_POINTS) ---"
-
-# An explicit target to run generation if you don't want to rely on 'all'
-generate: $(OUTPUT_POINTS)
-
-# --- Cleanup Target ---
-
-# Target to remove all generated and downloaded files
-clean:
-	@echo "--- Cleaning up files ---"
-	rm -f $(INPUT_FILE) $(OUTPUT_POINTS)
-	@echo "--- Cleanup complete ---"
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet
deleted file mode 100644
index 4ad348b3a..000000000
Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet and /dev/null differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/natural-earth_cities_geo.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/natural-earth_cities_geo.parquet
deleted file mode 100644
index bc419b494..000000000
Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/natural-earth_cities_geo.parquet and /dev/null differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/convert.py b/c/sedona-libgpuspatial/libgpuspatial/test/data/convert.py
new file mode 100644
index 000000000..952a7b53e
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/convert.py
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import geopandas as gpd
+import argparse
+import sys
+import os
+
+
+def main():
+    """Convert a GeoJSON file to Parquet; exit with status 1 on any failure."""
+    parser = argparse.ArgumentParser(
+        description="Convert GeoJSON spatial data to GeoParquet format."
+    )
+    parser.add_argument(
+        "-i",
+        "--input",
+        default="polygons.geojson",
+        help="Path to the input GeoJSON file (default: polygons.geojson)",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        default="polygons.parquet",
+        help="Path for the output Parquet file (default: polygons.parquet)",
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="Increase output verbosity"
+    )
+
+    args = parser.parse_args()
+    # Diagnostics go to stderr so stdout carries only progress and the summary.
+    if not os.path.exists(args.input):
+        print(f"Error: Input file '{args.input}' not found.", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        if args.verbose:
+            print(f"Loading {args.input}...")
+        gdf = gpd.read_file(args.input)
+
+        if args.verbose:
+            print(f"Writing to {args.output}...")
+        gdf.to_parquet(args.output, index=False)
+
+        print(f"Successfully converted {len(gdf)} geometries to {args.output}")
+    except Exception as e:
+        print(f"An error occurred: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile
index 147a332bd..f154c4416 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile
@@ -19,7 +19,7 @@ URL := https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/natural-e
 INPUT_FILE := natural-earth_countries_geo.parquet
 PYTHON_SCRIPT := ../gen_points.py
 OUTPUT_POINTS := generated_points.parquet
-NUM_POINTS := 1000
+NUM_POINTS := 10000
 
 .PHONY: all clean generate
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet
index 32d8dcc27..70af40443 100644
Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet and b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py b/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py
index a02f4a094..b23a89ebc 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py
@@ -47,7 +47,7 @@ def calculate_bbox_and_generate_points(geoparquet_path, n_points, output_path):
 
     # Generate random coordinates
     random_x = np.random.uniform(minx, maxx, n_points)
-    random_y = np.random.uniform(miny, miny, n_points)
+    random_y = np.random.uniform(miny, maxy, n_points)
 
     # 4. Create a GeoDataFrame from the points
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_pip/Makefile b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_pip/Makefile
new file mode 100644
index 000000000..10033f894
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_pip/Makefile
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# Variables
+GEN_URL = https://raw.githubusercontent.com/aseldawy/spider/refs/heads/master/html/cgi/generator.py
+GEN_SCRIPT = /tmp/spider.py
+CONV_SCRIPT = ../convert.py
+
+# Common Generation Params
+DIST = uniform
+COUNT = 10000
+SEED = 1
+
+.PHONY: all clean setup
+
+# Delete a target whose recipe failed, so a truncated redirect is never reused
+.DELETE_ON_ERROR:
+
+# The GeoJSON files are scratch output; make removes them after the Parquet files are built
+.INTERMEDIATE: points.geojson polygons.geojson
+
+# Hardcoded main target
+all: points.parquet polygons.parquet
+
+setup:
+	@python3 -c "import geopandas, pyarrow" || (echo "Missing: pip install geopandas pyarrow"; exit 1)
+	@python3 -c "import cgi" || (pip install legacy-cgi)
+
+$(GEN_SCRIPT):
+	@echo "Downloading generator..."
+	curl -sLo $(GEN_SCRIPT) $(GEN_URL)
+
+# --- Point Generation ---
+# `setup` is order-only: as a normal phony prerequisite it would force regeneration on every run
+points.geojson: $(GEN_SCRIPT) | setup
+	@echo "Generating points..."
+	python3 $(GEN_SCRIPT) \
+		distribution=$(DIST) \
+		cardinality=$(COUNT) \
+		dimensions=2 \
+		seed=$(SEED) \
+		geometry=point \
+		format=geojson > points.geojson
+
+points.parquet: points.geojson
+	@echo "Converting points to Parquet..."
+	python3 $(CONV_SCRIPT) --input points.geojson --output points.parquet --verbose
+
+# --- Polygon Generation ---
+polygons.geojson: $(GEN_SCRIPT) | setup
+	@echo "Generating polygons..."
+	python3 $(GEN_SCRIPT) \
+		distribution=$(DIST) \
+		cardinality=$(COUNT) \
+		dimensions=2 \
+		seed=$(SEED) \
+		geometry=polygon \
+		polysize=0.01 \
+		maxseg=8 \
+		format=geojson > polygons.geojson
+
+polygons.parquet: polygons.geojson
+	@echo "Converting polygons to Parquet..."
+	python3 $(CONV_SCRIPT) --input polygons.geojson --output polygons.parquet --verbose
+
+clean:
+	rm -f *.geojson *.parquet
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_pip/points.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_pip/points.parquet
new file mode 100644
index 000000000..fc9dee208
Binary files /dev/null and b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_pip/points.parquet differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_pip/polygons.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_pip/polygons.parquet
new file mode 100644
index 000000000..190115e73
Binary files /dev/null and b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_pip/polygons.parquet differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_poly/Makefile b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_poly/Makefile
new file mode 100644
index 000000000..6385cc908
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_poly/Makefile
@@ -0,0 +1,72 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# Variables: generator download URL, local path the generator is saved to, and the GeoJSON-to-Parquet converter
+GEN_URL = https://raw.githubusercontent.com/aseldawy/spider/refs/heads/master/html/cgi/generator.py
+GEN_SCRIPT = /tmp/spider.py
+CONV_SCRIPT = ../convert.py
+
+# Common generation parameters; SEED1 and SEED2 are the seeds for polygons1 and polygons2 respectively
+DIST = uniform
+COUNT = 10000
+SEED1 = 1
+SEED2 = 2
+
+.PHONY: all clean setup
+
+# Default goal: build both polygon Parquet datasets (polygons1 and polygons2)
+all: polygons1.parquet polygons2.parquet
+
+setup: # Checks the Python packages the recipes import; installs the cgi shim into the same python3 that was probed
+	@python3 -c "import geopandas, pyarrow" || (echo "Missing: pip install geopandas pyarrow"; exit 1)
+	@python3 -c "import cgi" || (python3 -m pip install legacy-cgi)
+
+$(GEN_SCRIPT): # Downloaded once; -f makes HTTP errors fail instead of saving an error page as the script
+	@echo "Downloading generator..."
+	@curl -fsSLo $(GEN_SCRIPT) $(GEN_URL)
+
+# --- Polygon 1 (SEED1): generate GeoJSON, convert to Parquet, delete the GeoJSON; setup is order-only ---
+polygons1.parquet: $(GEN_SCRIPT) | setup
+	@echo "Generating and converting polygons 1..."
+	python3 $(GEN_SCRIPT) \
+		distribution=$(DIST) \
+		cardinality=$(COUNT) \
+		dimensions=2 \
+		seed=$(SEED1) \
+		geometry=polygon \
+		polysize=0.01 \
+		maxseg=8 \
+		format=geojson > polygons1.geojson
+	python3 $(CONV_SCRIPT) --input polygons1.geojson --output polygons1.parquet --verbose
+	rm polygons1.geojson
+
+# --- Polygon 2 (SEED2): generate GeoJSON, convert to Parquet, delete the GeoJSON; setup is order-only ---
+polygons2.parquet: $(GEN_SCRIPT) | setup
+	@echo "Generating and converting polygons 2..."
+	python3 $(GEN_SCRIPT) \
+		distribution=$(DIST) \
+		cardinality=$(COUNT) \
+		dimensions=2 \
+		seed=$(SEED2) \
+		geometry=polygon \
+		polysize=0.01 \
+		maxseg=8 \
+		format=geojson > polygons2.geojson
+	python3 $(CONV_SCRIPT) --input polygons2.geojson --output polygons2.parquet --verbose
+	rm polygons2.geojson
+
+clean: # Removes every GeoJSON and Parquet file in this directory, including the checked-in *.parquet fixtures
+	rm -f *.geojson *.parquet
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_poly/polygons1.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_poly/polygons1.parquet
new file mode 100644
index 000000000..190115e73
Binary files /dev/null and b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_poly/polygons1.parquet differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_poly/polygons2.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_poly/polygons2.parquet
new file mode 100644
index 000000000..dcc52ffd3
Binary files /dev/null and b/c/sedona-libgpuspatial/libgpuspatial/test/data/synthetic_poly/polygons2.parquet differ
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/geoarrow_geos.c b/c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/geoarrow_geos.c
deleted file mode 100644
index 04691ea1d..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/geoarrow_geos.c
+++ /dev/null
@@ -1,1099 +0,0 @@
-
-#include <errno.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define GEOS_USE_ONLY_R_API
-#include <geos_c.h>
-#include "geoarrow/geoarrow.h"
-
-#include <assert.h>
-#include <math.h>
-#include "geoarrow_geos.h"
-
-const char* GeoArrowGEOSVersionGEOS(void) { return GEOSversion(); }
-
-const char* GeoArrowGEOSVersionGeoArrow(void) { return GeoArrowVersion(); }
-
-struct GeoArrowGEOSArrayBuilder {
-  GEOSContextHandle_t handle;
-  struct GeoArrowError error;
-  struct GeoArrowBuilder builder;
-  struct GeoArrowWKTWriter wkt_writer;
-  struct GeoArrowWKBWriter wkb_writer;
-  struct GeoArrowVisitor v;
-  struct GeoArrowCoordView coords_view;
-  double* coords;
-};
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayBuilderCreate(
-    GEOSContextHandle_t handle, struct ArrowSchema* schema,
-    struct GeoArrowGEOSArrayBuilder** out) {
-  struct GeoArrowGEOSArrayBuilder* builder =
-      (struct GeoArrowGEOSArrayBuilder*)malloc(sizeof(struct GeoArrowGEOSArrayBuilder));
-  if (builder == NULL) {
-    *out = NULL;
-    return ENOMEM;
-  }
-
-  memset(builder, 0, sizeof(struct GeoArrowGEOSArrayBuilder));
-  *out = builder;
-
-  struct GeoArrowSchemaView schema_view;
-  GEOARROW_RETURN_NOT_OK(GeoArrowSchemaViewInit(&schema_view, schema, &builder->error));
-  switch (schema_view.type) {
-    case GEOARROW_TYPE_WKT:
-      GEOARROW_RETURN_NOT_OK(GeoArrowWKTWriterInit(&builder->wkt_writer));
-      GeoArrowWKTWriterInitVisitor(&builder->wkt_writer, &builder->v);
-      break;
-    case GEOARROW_TYPE_WKB:
-      GEOARROW_RETURN_NOT_OK(GeoArrowWKBWriterInit(&builder->wkb_writer));
-      GeoArrowWKBWriterInitVisitor(&builder->wkb_writer, &builder->v);
-      break;
-    default:
-      assert(0);
-  }
-
-  builder->handle = handle;
-  builder->v.error = &builder->error;
-  return GEOARROW_OK;
-}
-
-static GeoArrowErrorCode GeoArrowGEOSArrayBuilderEnsureCoords(
-    struct GeoArrowGEOSArrayBuilder* builder, uint32_t n_coords, int n_dims) {
-  int64_t n_required = n_coords * n_dims;
-  int64_t n_current = builder->coords_view.n_coords * builder->coords_view.n_values;
-  if (n_required > n_current) {
-    if ((n_current * 2) > n_required) {
-      n_required = n_current * 2;
-    }
-
-    builder->coords = (double*)realloc(builder->coords, n_required * sizeof(double));
-    if (builder->coords == NULL) {
-      builder->coords_view.n_coords = 0;
-      return ENOMEM;
-    }
-  }
-
-  builder->coords_view.n_coords = n_coords;
-  builder->coords_view.n_values = n_dims;
-  builder->coords_view.coords_stride = n_dims;
-  for (int i = 0; i < n_dims; i++) {
-    builder->coords_view.values[i] = builder->coords + i;
-  }
-
-  return GEOARROW_OK;
-}
-
-void GeoArrowGEOSArrayBuilderDestroy(struct GeoArrowGEOSArrayBuilder* builder) {
-  if (builder->coords != NULL) {
-    free(builder->coords);
-  }
-
-  if (builder->builder.private_data != NULL) {
-    GeoArrowBuilderReset(&builder->builder);
-  }
-
-  if (builder->wkt_writer.private_data != NULL) {
-    GeoArrowWKTWriterReset(&builder->wkt_writer);
-  }
-
-  if (builder->wkb_writer.private_data != NULL) {
-    GeoArrowWKBWriterReset(&builder->wkb_writer);
-  }
-
-  free(builder);
-}
-
-const char* GeoArrowGEOSArrayBuilderGetLastError(
-    struct GeoArrowGEOSArrayBuilder* builder) {
-  return builder->error.message;
-}
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayBuilderFinish(
-    struct GeoArrowGEOSArrayBuilder* builder, struct ArrowArray* out) {
-  if (builder->wkt_writer.private_data != NULL) {
-    return GeoArrowWKTWriterFinish(&builder->wkt_writer, out, &builder->error);
-  } else if (builder->wkb_writer.private_data != NULL) {
-    return GeoArrowWKBWriterFinish(&builder->wkb_writer, out, &builder->error);
-  } else if (builder->builder.private_data != NULL) {
-    return GeoArrowBuilderFinish(&builder->builder, out, &builder->error);
-  } else {
-    GeoArrowErrorSet(&builder->error, "Invalid state");
-    return EINVAL;
-  }
-}
-
-static GeoArrowErrorCode VisitCoords(struct GeoArrowGEOSArrayBuilder* builder,
-                                     const GEOSCoordSequence* seq,
-                                     struct GeoArrowVisitor* v) {
-  unsigned int size = 0;
-  int result = GEOSCoordSeq_getSize_r(builder->handle, seq, &size);
-  if (result == 0) {
-    GeoArrowErrorSet(v->error, "GEOSCoordSeq_getSize_r() failed");
-    return ENOMEM;
-  }
-
-  if (size == 0) {
-    return GEOARROW_OK;
-  }
-
-  unsigned int dims = 0;
-  result = GEOSCoordSeq_getDimensions_r(builder->handle, seq, &dims);
-  if (result == 0) {
-    GeoArrowErrorSet(v->error, "GEOSCoordSeq_getDimensions_r() failed");
-    return ENOMEM;
-  }
-
-  // Make sure we have enough space to copy the coordinates into
-  GEOARROW_RETURN_NOT_OK(GeoArrowGEOSArrayBuilderEnsureCoords(builder, size, dims));
-
-  // Not sure exactly how M coordinates work in GEOS yet
-  result =
-      GEOSCoordSeq_copyToBuffer_r(builder->handle, seq, builder->coords, dims == 3, 0);
-  if (result == 0) {
-    GeoArrowErrorSet(v->error, "GEOSCoordSeq_copyToBuffer_r() failed");
-    return ENOMEM;
-  }
-
-  // Call the visitor method
-  GEOARROW_RETURN_NOT_OK(v->coords(v, &builder->coords_view));
-
-  return GEOARROW_OK;
-}
-
-static GeoArrowErrorCode VisitGeometry(struct GeoArrowGEOSArrayBuilder* builder,
-                                       const GEOSGeometry* geom,
-                                       struct GeoArrowVisitor* v) {
-  if (geom == NULL) {
-    GEOARROW_RETURN_NOT_OK(v->null_feat(v));
-    return GEOARROW_OK;
-  }
-
-  int type_id = GEOSGeomTypeId_r(builder->handle, geom);
-  int coord_dimension = GEOSGeom_getCoordinateDimension_r(builder->handle, geom);
-
-  enum GeoArrowGeometryType geoarrow_type = GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-  enum GeoArrowDimensions geoarrow_dims = GEOARROW_DIMENSIONS_UNKNOWN;
-
-  // Not sure how M dimensions work yet
-  switch (coord_dimension) {
-    case 2:
-      geoarrow_dims = GEOARROW_DIMENSIONS_XY;
-      break;
-    case 3:
-      geoarrow_dims = GEOARROW_DIMENSIONS_XYZ;
-      break;
-    default:
-      GeoArrowErrorSet(v->error, "Unexpected GEOSGeom_getCoordinateDimension_r: %d",
-                       coord_dimension);
-      return EINVAL;
-  }
-
-  switch (type_id) {
-    case GEOS_POINT:
-      geoarrow_type = GEOARROW_GEOMETRY_TYPE_POINT;
-      break;
-    case GEOS_LINESTRING:
-    case GEOS_LINEARRING:
-      geoarrow_type = GEOARROW_GEOMETRY_TYPE_LINESTRING;
-      break;
-    case GEOS_POLYGON:
-      geoarrow_type = GEOARROW_GEOMETRY_TYPE_POLYGON;
-      break;
-    case GEOS_MULTIPOINT:
-      geoarrow_type = GEOARROW_GEOMETRY_TYPE_MULTIPOINT;
-      break;
-    case GEOS_MULTILINESTRING:
-      geoarrow_type = GEOARROW_GEOMETRY_TYPE_MULTILINESTRING;
-      break;
-    case GEOS_MULTIPOLYGON:
-      geoarrow_type = GEOARROW_GEOMETRY_TYPE_MULTIPOLYGON;
-      break;
-    case GEOS_GEOMETRYCOLLECTION:
-      geoarrow_type = GEOARROW_GEOMETRY_TYPE_GEOMETRYCOLLECTION;
-      break;
-    default:
-      GeoArrowErrorSet(v->error, "Unexpected GEOSGeomTypeId: %d", type_id);
-      return EINVAL;
-  }
-
-  GEOARROW_RETURN_NOT_OK(v->geom_start(v, geoarrow_type, geoarrow_dims));
-
-  switch (type_id) {
-    case GEOS_POINT:
-    case GEOS_LINESTRING:
-    case GEOS_LINEARRING: {
-      const GEOSCoordSequence* seq = GEOSGeom_getCoordSeq_r(builder->handle, geom);
-      if (seq == NULL) {
-        GeoArrowErrorSet(v->error, "GEOSGeom_getCoordSeq_r() failed");
-        return ENOMEM;
-      }
-
-      GEOARROW_RETURN_NOT_OK(VisitCoords(builder, seq, v));
-      break;
-    }
-
-    case GEOS_POLYGON: {
-      if (GEOSisEmpty_r(builder->handle, geom)) {
-        break;
-      }
-
-      const GEOSGeometry* ring = GEOSGetExteriorRing_r(builder->handle, geom);
-      if (ring == NULL) {
-        GeoArrowErrorSet(v->error, "GEOSGetExteriorRing_r() failed");
-        return ENOMEM;
-      }
-
-      GEOARROW_RETURN_NOT_OK(v->ring_start(v));
-      const GEOSCoordSequence* seq = GEOSGeom_getCoordSeq_r(builder->handle, ring);
-      if (seq == NULL) {
-        GeoArrowErrorSet(v->error, "GEOSGeom_getCoordSeq_r() failed");
-        return ENOMEM;
-      }
-
-      GEOARROW_RETURN_NOT_OK(VisitCoords(builder, seq, v));
-      GEOARROW_RETURN_NOT_OK(v->ring_end(v));
-
-      int size = GEOSGetNumInteriorRings_r(builder->handle, geom);
-      for (int i = 0; i < size; i++) {
-        ring = GEOSGetInteriorRingN_r(builder->handle, geom, i);
-        if (ring == NULL) {
-          GeoArrowErrorSet(v->error, "GEOSGetInteriorRingN_r() failed");
-          return ENOMEM;
-        }
-
-        GEOARROW_RETURN_NOT_OK(v->ring_start(v));
-        seq = GEOSGeom_getCoordSeq_r(builder->handle, ring);
-        if (seq == NULL) {
-          GeoArrowErrorSet(v->error, "GEOSGeom_getCoordSeq_r() failed");
-          return ENOMEM;
-        }
-
-        GEOARROW_RETURN_NOT_OK(VisitCoords(builder, seq, v));
-        GEOARROW_RETURN_NOT_OK(v->ring_end(v));
-      }
-
-      break;
-    }
-
-    case GEOS_MULTIPOINT:
-    case GEOS_MULTILINESTRING:
-    case GEOS_MULTIPOLYGON:
-    case GEOS_GEOMETRYCOLLECTION: {
-      int size = GEOSGetNumGeometries_r(builder->handle, geom);
-      for (int i = 0; i < size; i++) {
-        const GEOSGeometry* child = GEOSGetGeometryN_r(builder->handle, geom, i);
-        if (child == NULL) {
-          GeoArrowErrorSet(v->error, "GEOSGetGeometryN_r() failed");
-          return ENOMEM;
-        }
-
-        GEOARROW_RETURN_NOT_OK(VisitGeometry(builder, child, v));
-      }
-
-      break;
-    }
-    default:
-      GeoArrowErrorSet(v->error, "Unexpected GEOSGeomTypeId: %d", type_id);
-      return EINVAL;
-  }
-
-  GEOARROW_RETURN_NOT_OK(v->geom_end(v));
-  return GEOARROW_OK;
-}
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayBuilderAppend(
-    struct GeoArrowGEOSArrayBuilder* builder, const GEOSGeometry** geom, size_t geom_size,
-    size_t* n_appended) {
-  *n_appended = 0;
-
-  for (size_t i = 0; i < geom_size; i++) {
-    GEOARROW_RETURN_NOT_OK(builder->v.feat_start(&builder->v));
-    GEOARROW_RETURN_NOT_OK(VisitGeometry(builder, geom[i], &builder->v));
-    GEOARROW_RETURN_NOT_OK(builder->v.feat_end(&builder->v));
-    *n_appended = i + 1;
-  }
-
-  return GEOARROW_OK;
-}
-
-// This should really be in nanoarrow and/or geoarrow
-struct GeoArrowGEOSBitmapReader {
-  const uint8_t* bits;
-  int64_t byte_i;
-  int bit_i;
-  uint8_t byte;
-};
-
-static inline void GeoArrowGEOSBitmapReaderInit(
-    struct GeoArrowGEOSBitmapReader* bitmap_reader, const uint8_t* bits, int64_t offset) {
-  memset(bitmap_reader, 0, sizeof(struct GeoArrowGEOSBitmapReader));
-  bitmap_reader->bits = bits;
-
-  if (bits != NULL) {
-    bitmap_reader->byte_i = offset / 8;
-    bitmap_reader->bit_i = offset % 8;
-    if (bitmap_reader->bit_i == 0) {
-      bitmap_reader->bit_i = 7;
-      bitmap_reader->byte_i--;
-    } else {
-      bitmap_reader->bit_i--;
-    }
-  }
-}
-
-static inline int8_t GeoArrowGEOSBitmapReaderNextIsNull(
-    struct GeoArrowGEOSBitmapReader* bitmap_reader) {
-  if (bitmap_reader->bits == NULL) {
-    return 0;
-  }
-
-  if (++bitmap_reader->bit_i == 8) {
-    bitmap_reader->byte = bitmap_reader->bits[++bitmap_reader->byte_i];
-    bitmap_reader->bit_i = 0;
-  }
-
-  return (bitmap_reader->byte & (1 << bitmap_reader->bit_i)) == 0;
-}
-
-struct GeoArrowGEOSArrayReader {
-  GEOSContextHandle_t handle;
-  struct GeoArrowError error;
-  struct GeoArrowArrayView array_view;
-  // In order to use GeoArrow's read capability we need to write a visitor-based
-  // constructor for GEOS geometries, which is complicated and may or may not be
-  // faster than GEOS' own readers.
-  GEOSWKTReader* wkt_reader;
-  GEOSWKBReader* wkb_reader;
-  // In-progress items that we might need to clean up if an error was returned
-  int64_t n_geoms[2];
-  GEOSGeometry** geoms[2];
-  struct GeoArrowGEOSBitmapReader bitmap_reader;
-  // GEOS' WKT reader needs null-terminated strings, but Arrow stores them in
-  // buffers without the null terminator. Thus, we need a bounce buffer to copy
-  // each WKT item into before passing to GEOS' reader.
-  size_t wkt_temp_size;
-  char* wkt_temp;
-};
-
-static GeoArrowErrorCode GeoArrowGEOSArrayReaderEnsureScratch(
-    struct GeoArrowGEOSArrayReader* reader, int64_t n_geoms, int level) {
-  if (n_geoms <= reader->n_geoms[level]) {
-    return GEOARROW_OK;
-  }
-
-  if ((reader->n_geoms[level] * 2) > n_geoms) {
-    n_geoms = reader->n_geoms[level] * 2;
-  }
-
-  reader->geoms[level] =
-      (GEOSGeometry**)realloc(reader->geoms[level], n_geoms * sizeof(GEOSGeometry*));
-  if (reader->geoms[level] == NULL) {
-    reader->n_geoms[level] = 0;
-    return ENOMEM;
-  }
-
-  memset(reader->geoms[level], 0, n_geoms * sizeof(GEOSGeometry*));
-  return GEOARROW_OK;
-}
-
-static void GeoArrowGEOSArrayReaderResetScratch(struct GeoArrowGEOSArrayReader* reader) {
-  for (int level = 0; level < 2; level++) {
-    for (int64_t i = 0; i < reader->n_geoms[level]; i++) {
-      if (reader->geoms[level][i] != NULL) {
-        GEOSGeom_destroy_r(reader->handle, reader->geoms[level][i]);
-        reader->geoms[level][i] = NULL;
-      }
-    }
-  }
-}
-
-static GeoArrowErrorCode GeoArrowGEOSArrayReaderEnsureWKTTemp(
-    struct GeoArrowGEOSArrayReader* reader, int64_t item_size) {
-  if (item_size <= reader->wkt_temp_size) {
-    return GEOARROW_OK;
-  }
-
-  if ((reader->wkt_temp_size * 2) > item_size) {
-    item_size = reader->wkt_temp_size * 2;
-  }
-
-  reader->wkt_temp = (char*)realloc(reader->wkt_temp, item_size);
-  if (reader->wkt_temp == NULL) {
-    reader->wkt_temp_size = 0;
-    return ENOMEM;
-  }
-
-  return GEOARROW_OK;
-}
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayReaderCreate(
-    GEOSContextHandle_t handle, struct ArrowSchema* schema,
-    struct GeoArrowGEOSArrayReader** out) {
-  struct GeoArrowGEOSArrayReader* reader =
-      (struct GeoArrowGEOSArrayReader*)malloc(sizeof(struct GeoArrowGEOSArrayReader));
-  if (reader == NULL) {
-    *out = NULL;
-    return ENOMEM;
-  }
-
-  memset(reader, 0, sizeof(struct GeoArrowGEOSArrayReader));
-  *out = reader;
-
-  reader->handle = handle;
-  GEOARROW_RETURN_NOT_OK(
-      GeoArrowArrayViewInitFromSchema(&reader->array_view, schema, &reader->error));
-
-  return GEOARROW_OK;
-}
-
-const char* GeoArrowGEOSArrayReaderGetLastError(struct GeoArrowGEOSArrayReader* reader) {
-  return reader->error.message;
-}
-
-static GeoArrowErrorCode MakeGeomFromWKB(struct GeoArrowGEOSArrayReader* reader,
-                                         size_t offset, size_t length, GEOSGeometry** out,
-                                         size_t* n_out) {
-  offset += reader->array_view.offset[0];
-
-  GeoArrowGEOSBitmapReaderInit(&reader->bitmap_reader, reader->array_view.validity_bitmap,
-                               offset);
-
-  for (size_t i = 0; i < length; i++) {
-    if (GeoArrowGEOSBitmapReaderNextIsNull(&reader->bitmap_reader)) {
-      out[i] = NULL;
-      *n_out += 1;
-      continue;
-    }
-
-    int64_t data_offset = reader->array_view.offsets[0][i + offset];
-    int64_t data_size = reader->array_view.offsets[0][i + offset + 1] - data_offset;
-
-    out[i] = GEOSWKBReader_read_r(reader->handle, reader->wkb_reader,
-                                  reader->array_view.data + data_offset, data_size);
-    if (out[i] == NULL) {
-      GeoArrowErrorSet(&reader->error, "[%ld] GEOSWKBReader_read_r() failed", (long)i);
-      return ENOMEM;
-    }
-
-    *n_out += 1;
-  }
-
-  return GEOARROW_OK;
-}
-
-static GeoArrowErrorCode MakeGeomFromWKT(struct GeoArrowGEOSArrayReader* reader,
-                                         size_t offset, size_t length, GEOSGeometry** out,
-                                         size_t* n_out) {
-  offset += reader->array_view.offset[0];
-
-  GeoArrowGEOSBitmapReaderInit(&reader->bitmap_reader, reader->array_view.validity_bitmap,
-                               offset);
-
-  for (size_t i = 0; i < length; i++) {
-    if (GeoArrowGEOSBitmapReaderNextIsNull(&reader->bitmap_reader)) {
-      out[i] = NULL;
-      *n_out += 1;
-      continue;
-    }
-
-    int64_t data_offset = reader->array_view.offsets[0][i];
-    int64_t data_size = reader->array_view.offsets[0][i + 1] - data_offset;
-
-    // GEOSWKTReader_read_r() requires a null-terminated string. To ensure that, we
-    // copy into memory we own and add the null-terminator ourselves.
-    GEOARROW_RETURN_NOT_OK(GeoArrowGEOSArrayReaderEnsureWKTTemp(reader, data_size + 1));
-    memcpy(reader->wkt_temp, reader->array_view.data + data_offset, data_size);
-    reader->wkt_temp[data_size] = '\0';
-
-    out[i] = GEOSWKTReader_read_r(reader->handle, reader->wkt_reader, reader->wkt_temp);
-    if (out[i] == NULL) {
-      GeoArrowErrorSet(&reader->error, "[%ld] GEOSWKBReader_read_r() failed", (long)i);
-      return ENOMEM;
-    }
-
-    *n_out += 1;
-  }
-
-  return GEOARROW_OK;
-}
-
-static GeoArrowErrorCode MakeCoordSeq(struct GeoArrowGEOSArrayReader* reader,
-                                      size_t offset, size_t length,
-                                      GEOSCoordSequence** out) {
-  offset += reader->array_view.offset[reader->array_view.n_offsets];
-  struct GeoArrowCoordView* coords = &reader->array_view.coords;
-  const double* z = NULL;
-  const double* m = NULL;
-
-  switch (reader->array_view.schema_view.dimensions) {
-    case GEOARROW_DIMENSIONS_XYZ:
-      z = coords->values[2];
-      break;
-    case GEOARROW_DIMENSIONS_XYM:
-      m = coords->values[2];
-      break;
-    case GEOARROW_DIMENSIONS_XYZM:
-      z = coords->values[2];
-      m = coords->values[3];
-      break;
-    default:
-      break;
-  }
-
-  GEOSCoordSequence* seq;
-
-  switch (reader->array_view.schema_view.coord_type) {
-    case GEOARROW_COORD_TYPE_SEPARATE:
-      seq = GEOSCoordSeq_copyFromArrays_r(reader->handle, coords->values[0] + offset,
-                                          coords->values[1] + offset, z, m, length);
-      break;
-    case GEOARROW_COORD_TYPE_INTERLEAVED:
-      seq = GEOSCoordSeq_copyFromBuffer_r(reader->handle,
-                                          coords->values[0] + (offset * coords->n_values),
-                                          length, z != NULL, m != NULL);
-      break;
-    default:
-      GeoArrowErrorSet(&reader->error, "Unsupported coord type");
-      return ENOTSUP;
-  }
-
-  if (seq == NULL) {
-    GeoArrowErrorSet(&reader->error, "GEOSCoordSeq_copyFromArrays_r() failed");
-    return ENOMEM;
-  }
-
-  *out = seq;
-  return GEOARROW_OK;
-}
-
-static GeoArrowErrorCode MakePoints(struct GeoArrowGEOSArrayReader* reader, size_t offset,
-                                    size_t length, GEOSGeometry** out, size_t* n_out) {
-  int top_level =
-      reader->array_view.schema_view.geometry_type == GEOARROW_GEOMETRY_TYPE_POINT;
-  if (top_level) {
-    GeoArrowGEOSBitmapReaderInit(&reader->bitmap_reader,
-                                 reader->array_view.validity_bitmap,
-                                 reader->array_view.offset[0] + offset);
-  }
-
-  GEOSCoordSequence* seq = NULL;
-  for (size_t i = 0; i < length; i++) {
-    if (top_level && GeoArrowGEOSBitmapReaderNextIsNull(&reader->bitmap_reader)) {
-      out[i] = NULL;
-      *n_out += 1;
-      continue;
-    }
-
-    GEOARROW_RETURN_NOT_OK(MakeCoordSeq(reader, offset + i, 1, &seq));
-    out[i] = GEOSGeom_createPoint_r(reader->handle, seq);
-    if (out[i] == NULL) {
-      GEOSCoordSeq_destroy_r(reader->handle, seq);
-      GeoArrowErrorSet(&reader->error, "[%ld] GEOSGeom_createPoint_r() failed", (long)i);
-      return ENOMEM;
-    }
-
-    *n_out += 1;
-  }
-
-  return GEOARROW_OK;
-}
-
-static GeoArrowErrorCode MakeLinestrings(struct GeoArrowGEOSArrayReader* reader,
-                                         size_t offset, size_t length, GEOSGeometry** out,
-                                         size_t* n_out) {
-  offset += reader->array_view.offset[reader->array_view.n_offsets - 1];
-  const int32_t* coord_offsets =
-      reader->array_view.offsets[reader->array_view.n_offsets - 1];
-
-  int top_level =
-      reader->array_view.schema_view.geometry_type == GEOARROW_GEOMETRY_TYPE_LINESTRING;
-  if (top_level) {
-    GeoArrowGEOSBitmapReaderInit(&reader->bitmap_reader,
-                                 reader->array_view.validity_bitmap, offset);
-  }
-
-  GEOSCoordSequence* seq = NULL;
-  for (size_t i = 0; i < length; i++) {
-    if (top_level && GeoArrowGEOSBitmapReaderNextIsNull(&reader->bitmap_reader)) {
-      out[i] = NULL;
-      *n_out += 1;
-      continue;
-    }
-
-    GEOARROW_RETURN_NOT_OK(
-        MakeCoordSeq(reader, coord_offsets[offset + i],
-                     coord_offsets[offset + i + 1] - coord_offsets[offset + i], &seq));
-    out[i] = GEOSGeom_createLineString_r(reader->handle, seq);
-    if (out[i] == NULL) {
-      GEOSCoordSeq_destroy_r(reader->handle, seq);
-      GeoArrowErrorSet(&reader->error, "[%ld] GEOSGeom_createLineString_r() failed",
-                       (long)i);
-      return ENOMEM;
-    }
-
-    *n_out += 1;
-  }
-
-  return GEOARROW_OK;
-}
-
-static GeoArrowErrorCode MakeLinearrings(struct GeoArrowGEOSArrayReader* reader,
-                                         size_t offset, size_t length,
-                                         GEOSGeometry** out) {
-  offset += reader->array_view.offset[reader->array_view.n_offsets - 1];
-  const int32_t* coord_offsets =
-      reader->array_view.offsets[reader->array_view.n_offsets - 1];
-
-  GEOSCoordSequence* seq = NULL;
-  for (size_t i = 0; i < length; i++) {
-    GEOARROW_RETURN_NOT_OK(
-        MakeCoordSeq(reader, coord_offsets[offset + i],
-                     coord_offsets[offset + i + 1] - coord_offsets[offset + i], &seq));
-    out[i] = GEOSGeom_createLinearRing_r(reader->handle, seq);
-    if (out[i] == NULL) {
-      GEOSCoordSeq_destroy_r(reader->handle, seq);
-      GeoArrowErrorSet(&reader->error, "[%ld] GEOSGeom_createLinearRing_r() failed",
-                       (long)i);
-      return ENOMEM;
-    }
-  }
-
-  return GEOARROW_OK;
-}
-
-static GeoArrowErrorCode MakePolygons(struct GeoArrowGEOSArrayReader* reader,
-                                      size_t offset, size_t length, GEOSGeometry** out,
-                                      size_t* n_out) {
-  offset += reader->array_view.offset[reader->array_view.n_offsets - 2];
-  const int32_t* ring_offsets =
-      reader->array_view.offsets[reader->array_view.n_offsets - 2];
-
-  int top_level =
-      reader->array_view.schema_view.geometry_type == GEOARROW_GEOMETRY_TYPE_POLYGON;
-  if (top_level) {
-    GeoArrowGEOSBitmapReaderInit(&reader->bitmap_reader,
-                                 reader->array_view.validity_bitmap, offset);
-  }
-
-  for (size_t i = 0; i < length; i++) {
-    if (top_level && GeoArrowGEOSBitmapReaderNextIsNull(&reader->bitmap_reader)) {
-      out[i] = NULL;
-      *n_out += 1;
-      continue;
-    }
-
-    int64_t ring_offset = ring_offsets[offset + i];
-    int64_t n_rings = ring_offsets[offset + i + 1] - ring_offset;
-
-    if (n_rings == 0) {
-      out[i] = GEOSGeom_createEmptyPolygon_r(reader->handle);
-    } else {
-      GEOARROW_RETURN_NOT_OK(GeoArrowGEOSArrayReaderEnsureScratch(reader, n_rings, 0));
-      GEOARROW_RETURN_NOT_OK(
-          MakeLinearrings(reader, ring_offset, n_rings, reader->geoms[0]));
-      out[i] = GEOSGeom_createPolygon_r(reader->handle, reader->geoms[0][0],
-                                        reader->geoms[0] + 1, n_rings - 1);
-      memset(reader->geoms[0], 0, n_rings * sizeof(GEOSGeometry*));
-    }
-
-    if (out[i] == NULL) {
-      GeoArrowErrorSet(&reader->error, "[%ld] GEOSGeom_createPolygon_r() failed",
-                       (long)i);
-      return ENOMEM;
-    }
-
-    *n_out += 1;
-  }
-
-  return GEOARROW_OK;
-}
-
-typedef GeoArrowErrorCode (*GeoArrowGEOSPartMaker)(struct GeoArrowGEOSArrayReader* reader,
-                                                   size_t offset, size_t length,
-                                                   GEOSGeometry** out, size_t* n_out);
-
-static GeoArrowErrorCode MakeCollection(struct GeoArrowGEOSArrayReader* reader,
-                                        size_t offset, size_t length, GEOSGeometry** out,
-                                        int geom_level, int offset_level, int geos_type,
-                                        GeoArrowGEOSPartMaker part_maker, size_t* n_out) {
-  offset += reader->array_view.offset[reader->array_view.n_offsets - offset_level];
-  const int32_t* part_offsets =
-      reader->array_view.offsets[reader->array_view.n_offsets - offset_level];
-
-  // Currently collections are always outer geometries
-  GeoArrowGEOSBitmapReaderInit(&reader->bitmap_reader, reader->array_view.validity_bitmap,
-                               offset);
-
-  size_t part_n_out = 0;
-  for (size_t i = 0; i < length; i++) {
-    if (GeoArrowGEOSBitmapReaderNextIsNull(&reader->bitmap_reader)) {
-      out[i] = NULL;
-      *n_out += 1;
-      continue;
-    }
-
-    int64_t part_offset = part_offsets[offset + i];
-    int64_t n_parts = part_offsets[offset + i + 1] - part_offset;
-
-    if (n_parts == 0) {
-      out[i] = GEOSGeom_createEmptyCollection_r(reader->handle, geos_type);
-    } else {
-      GEOARROW_RETURN_NOT_OK(
-          GeoArrowGEOSArrayReaderEnsureScratch(reader, n_parts, geom_level));
-      GEOARROW_RETURN_NOT_OK(part_maker(reader, part_offset, n_parts,
-                                        reader->geoms[geom_level], &part_n_out));
-      out[i] = GEOSGeom_createCollection_r(reader->handle, geos_type,
-                                           reader->geoms[geom_level], n_parts);
-      memset(reader->geoms[geom_level], 0, n_parts * sizeof(GEOSGeometry*));
-    }
-
-    if (out[i] == NULL) {
-      GeoArrowErrorSet(&reader->error, "[%ld] GEOSGeom_createEmptyCollection_r() failed",
-                       (long)i);
-      return ENOMEM;
-    }
-
-    *n_out += 1;
-  }
-
-  return GEOARROW_OK;
-}
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayReaderRead(struct GeoArrowGEOSArrayReader* reader,
-                                                  struct ArrowArray* array, size_t offset,
-                                                  size_t length, GEOSGeometry** out,
-                                                  size_t* n_out) {
-  GeoArrowGEOSArrayReaderResetScratch(reader);
-
-  GEOARROW_RETURN_NOT_OK(
-      GeoArrowArrayViewSetArray(&reader->array_view, array, &reader->error));
-
-  GeoArrowGEOSBitmapReaderInit(&reader->bitmap_reader, NULL, 0);
-
-  memset(out, 0, sizeof(GEOSGeometry*) * length);
-  *n_out = 0;
-
-  GeoArrowErrorCode result;
-  switch (reader->array_view.schema_view.type) {
-    case GEOARROW_TYPE_WKB:
-      if (reader->wkb_reader == NULL) {
-        reader->wkb_reader = GEOSWKBReader_create_r(reader->handle);
-        if (reader->wkb_reader == NULL) {
-          GeoArrowErrorSet(&reader->error, "GEOSWKBReader_create_r() failed");
-          return ENOMEM;
-        }
-      }
-
-      result = MakeGeomFromWKB(reader, offset, length, out, n_out);
-      break;
-    case GEOARROW_TYPE_WKT:
-      if (reader->wkt_reader == NULL) {
-        reader->wkt_reader = GEOSWKTReader_create_r(reader->handle);
-        if (reader->wkt_reader == NULL) {
-          GeoArrowErrorSet(&reader->error, "GEOSWKTReader_create_r() failed");
-          return ENOMEM;
-        }
-      }
-
-      result = MakeGeomFromWKT(reader, offset, length, out, n_out);
-      break;
-    default:
-      switch (reader->array_view.schema_view.geometry_type) {
-        case GEOARROW_GEOMETRY_TYPE_POINT:
-          result = MakePoints(reader, offset, length, out, n_out);
-          break;
-        case GEOARROW_GEOMETRY_TYPE_LINESTRING:
-          result = MakeLinestrings(reader, offset, length, out, n_out);
-          break;
-        case GEOARROW_GEOMETRY_TYPE_POLYGON:
-          result = MakePolygons(reader, offset, length, out, n_out);
-          break;
-        case GEOARROW_GEOMETRY_TYPE_MULTIPOINT:
-          result = MakeCollection(reader, offset, length, out, 0, 1, GEOS_MULTIPOINT,
-                                  &MakePoints, n_out);
-          break;
-        case GEOARROW_GEOMETRY_TYPE_MULTILINESTRING:
-          result = MakeCollection(reader, offset, length, out, 0, 2, GEOS_MULTILINESTRING,
-                                  &MakeLinestrings, n_out);
-          break;
-        case GEOARROW_GEOMETRY_TYPE_MULTIPOLYGON:
-          result = MakeCollection(reader, offset, length, out, 1, 3, GEOS_MULTIPOLYGON,
-                                  &MakePolygons, n_out);
-          break;
-        default:
-          GeoArrowErrorSet(&reader->error,
-                           "GeoArrowGEOSArrayReaderRead not implemented for array type");
-          return ENOTSUP;
-      }
-  }
-
-  return result;
-}
-
-void GeoArrowGEOSArrayReaderDestroy(struct GeoArrowGEOSArrayReader* reader) {
-  if (reader->wkt_reader != NULL) {
-    GEOSWKTReader_destroy_r(reader->handle, reader->wkt_reader);
-  }
-
-  if (reader->wkb_reader != NULL) {
-    GEOSWKBReader_destroy_r(reader->handle, reader->wkb_reader);
-  }
-
-  GeoArrowGEOSArrayReaderResetScratch(reader);
-
-  for (int i = 0; i < 2; i++) {
-    if (reader->geoms[i] != NULL) {
-      free(reader->geoms[i]);
-    }
-  }
-
-  if (reader->wkt_temp != NULL) {
-    free(reader->wkt_temp);
-  }
-
-  free(reader);
-}
-
-struct GeoArrowGEOSSchemaCalculator {
-  int geometry_type;
-  int dimensions;
-};
-
-GeoArrowGEOSErrorCode GeoArrowGEOSSchemaCalculatorCreate(
-    struct GeoArrowGEOSSchemaCalculator** out) {
-  struct GeoArrowGEOSSchemaCalculator* calc =
-      (struct GeoArrowGEOSSchemaCalculator*)malloc(
-          sizeof(struct GeoArrowGEOSSchemaCalculator));
-  if (calc == NULL) {
-    *out = NULL;
-    return ENOMEM;
-  }
-
-  calc->geometry_type = -1;
-  calc->dimensions = GEOARROW_DIMENSIONS_UNKNOWN;
-  *out = calc;
-
-  return GEOARROW_OK;
-}
-
-static int GeometryType2(int x, int y) {
-  switch (x) {
-    case -1:
-      return y;
-    case GEOARROW_GEOMETRY_TYPE_GEOMETRY:
-      return x;
-    case GEOARROW_GEOMETRY_TYPE_POINT:
-      switch (y) {
-        case -1:
-          return x;
-        case GEOARROW_TYPE_POINT:
-        case GEOARROW_TYPE_MULTIPOINT:
-          return y;
-        default:
-          return GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-      }
-    case GEOARROW_GEOMETRY_TYPE_LINESTRING:
-      switch (y) {
-        case -1:
-          return x;
-        case GEOARROW_TYPE_LINESTRING:
-        case GEOARROW_TYPE_MULTILINESTRING:
-          return y;
-        default:
-          return GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-      }
-    case GEOARROW_GEOMETRY_TYPE_POLYGON:
-      switch (y) {
-        case -1:
-          return x;
-        case GEOARROW_TYPE_POLYGON:
-        case GEOARROW_TYPE_MULTIPOLYGON:
-          return y;
-        default:
-          return GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-      }
-    case GEOARROW_GEOMETRY_TYPE_MULTIPOINT:
-      switch (y) {
-        case -1:
-          return x;
-        case GEOARROW_TYPE_POINT:
-        case GEOARROW_TYPE_MULTIPOINT:
-          return x;
-        default:
-          return GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-      }
-    case GEOARROW_GEOMETRY_TYPE_MULTILINESTRING:
-      switch (y) {
-        case -1:
-          return x;
-        case GEOARROW_TYPE_LINESTRING:
-        case GEOARROW_TYPE_MULTILINESTRING:
-          return x;
-        default:
-          return GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-      }
-    case GEOARROW_GEOMETRY_TYPE_MULTIPOLYGON:
-      switch (y) {
-        case -1:
-          return x;
-        case GEOARROW_TYPE_POLYGON:
-        case GEOARROW_TYPE_MULTIPOLYGON:
-          return x;
-        default:
-          return GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-      }
-    case GEOARROW_GEOMETRY_TYPE_GEOMETRYCOLLECTION:
-      switch (y) {
-        case -1:
-          return x;
-        case GEOARROW_GEOMETRY_TYPE_GEOMETRYCOLLECTION:
-          return x;
-        default:
-          return GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-      }
-    default:
-      return GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-  }
-}
-
-static int Dimensions2(int x, int y) {
-  switch (x) {
-    case GEOARROW_DIMENSIONS_UNKNOWN:
-      return y;
-    case GEOARROW_DIMENSIONS_XY:
-      switch (y) {
-        case GEOARROW_DIMENSIONS_UNKNOWN:
-          return x;
-        default:
-          return y;
-      }
-    case GEOARROW_DIMENSIONS_XYZ:
-      switch (y) {
-        case GEOARROW_DIMENSIONS_UNKNOWN:
-          return x;
-        case GEOARROW_DIMENSIONS_XYM:
-          return GEOARROW_DIMENSIONS_XYZM;
-        default:
-          return y;
-      }
-    case GEOARROW_DIMENSIONS_XYM:
-      switch (y) {
-        case GEOARROW_DIMENSIONS_UNKNOWN:
-          return x;
-        case GEOARROW_DIMENSIONS_XYZ:
-          return GEOARROW_DIMENSIONS_XYZM;
-        default:
-          return y;
-      }
-    default:
-      return GEOARROW_DIMENSIONS_XYZM;
-  }
-}
-
-void GeoArrowGEOSSchemaCalculatorIngest(struct GeoArrowGEOSSchemaCalculator* calc,
-                                        const int32_t* wkb_type, size_t n) {
-  for (size_t i = 0; i < n; i++) {
-    if (wkb_type[i] == 0) {
-      continue;
-    }
-
-    calc->geometry_type = GeometryType2(calc->geometry_type, wkb_type[i] % 1000);
-    calc->dimensions = Dimensions2(calc->dimensions, wkb_type[i] / 1000);
-  }
-}
-
-GeoArrowGEOSErrorCode GeoArrowGEOSSchemaCalculatorFinish(
-    struct GeoArrowGEOSSchemaCalculator* calc, enum GeoArrowGEOSEncoding encoding,
-    struct ArrowSchema* out) {
-  enum GeoArrowCoordType coord_type;
-  switch (encoding) {
-    case GEOARROW_GEOS_ENCODING_WKT:
-    case GEOARROW_GEOS_ENCODING_WKB:
-      return GeoArrowGEOSMakeSchema(encoding, 0, out);
-    case GEOARROW_GEOS_ENCODING_GEOARROW:
-      coord_type = GEOARROW_COORD_TYPE_INTERLEAVED;
-      break;
-    case GEOARROW_GEOS_ENCODING_GEOARROW_INTERLEAVED:
-      coord_type = GEOARROW_COORD_TYPE_INTERLEAVED;
-      break;
-    default:
-      return EINVAL;
-  }
-
-  enum GeoArrowGeometryType geometry_type;
-  switch (calc->geometry_type) {
-    case GEOARROW_GEOMETRY_TYPE_POINT:
-    case GEOARROW_GEOMETRY_TYPE_LINESTRING:
-    case GEOARROW_GEOMETRY_TYPE_POLYGON:
-    case GEOARROW_GEOMETRY_TYPE_MULTIPOINT:
-    case GEOARROW_GEOMETRY_TYPE_MULTILINESTRING:
-    case GEOARROW_GEOMETRY_TYPE_MULTIPOLYGON:
-      geometry_type = (enum GeoArrowGeometryType)calc->geometry_type;
-      break;
-    case -1:
-      // We don't have an "empty"/"null" type to return, but "POINT" is also
-      // not quite right.
-    default:
-      return GeoArrowGEOSMakeSchema(GEOARROW_GEOS_ENCODING_WKB, 0, out);
-  }
-
-  enum GeoArrowDimensions dimensions;
-  switch (calc->dimensions) {
-    case GEOARROW_DIMENSIONS_UNKNOWN:
-      dimensions = GEOARROW_DIMENSIONS_XY;
-      break;
-    case GEOARROW_DIMENSIONS_XY:
-    case GEOARROW_DIMENSIONS_XYZ:
-    case GEOARROW_DIMENSIONS_XYM:
-    case GEOARROW_DIMENSIONS_XYZM:
-      dimensions = (enum GeoArrowDimensions)calc->dimensions;
-      break;
-    default:
-      return GeoArrowGEOSMakeSchema(GEOARROW_GEOS_ENCODING_WKB, 0, out);
-  }
-
-  enum GeoArrowType type = GeoArrowMakeType(geometry_type, dimensions, coord_type);
-  GEOARROW_RETURN_NOT_OK(GeoArrowSchemaInitExtension(out, type));
-  return GEOARROW_OK;
-}
-
-void GeoArrowGEOSSchemaCalculatorDestroy(struct GeoArrowGEOSSchemaCalculator* calc) {
-  free(calc);
-}
-
-GeoArrowGEOSErrorCode GeoArrowGEOSMakeSchema(int32_t encoding, int32_t wkb_type,
-                                             struct ArrowSchema* out) {
-  enum GeoArrowType type = GEOARROW_TYPE_UNINITIALIZED;
-  enum GeoArrowGeometryType geometry_type = GEOARROW_GEOMETRY_TYPE_GEOMETRY;
-  enum GeoArrowDimensions dimensions = GEOARROW_DIMENSIONS_UNKNOWN;
-  enum GeoArrowCoordType coord_type = GEOARROW_COORD_TYPE_UNKNOWN;
-
-  switch (encoding) {
-    case GEOARROW_GEOS_ENCODING_WKT:
-      type = GEOARROW_TYPE_WKT;
-      break;
-    case GEOARROW_GEOS_ENCODING_WKB:
-      type = GEOARROW_TYPE_WKB;
-      break;
-    case GEOARROW_GEOS_ENCODING_GEOARROW:
-      coord_type = GEOARROW_COORD_TYPE_SEPARATE;
-      break;
-    case GEOARROW_GEOS_ENCODING_GEOARROW_INTERLEAVED:
-      coord_type = GEOARROW_COORD_TYPE_INTERLEAVED;
-      break;
-    default:
-      return EINVAL;
-  }
-
-  if (type == GEOARROW_TYPE_UNINITIALIZED) {
-    geometry_type = wkb_type % 1000;
-    dimensions = wkb_type / 1000 + 1;
-    type = GeoArrowMakeType(geometry_type, dimensions, coord_type);
-  }
-
-  GEOARROW_RETURN_NOT_OK(GeoArrowSchemaInitExtension(out, type));
-  return GEOARROW_OK;
-}
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/geoarrow_geos.h b/c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/geoarrow_geos.h
deleted file mode 100644
index 35a36c538..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/geoarrow_geos.h
+++ /dev/null
@@ -1,176 +0,0 @@
-
-#ifndef GEOARROW_GEOS_H_INCLUDED
-#define GEOARROW_GEOS_H_INCLUDED
-
-#include <geos_c.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Extra guard for versions of Arrow without the canonical guard
-#ifndef ARROW_FLAG_DICTIONARY_ORDERED
-
-#ifndef ARROW_C_DATA_INTERFACE
-#define ARROW_C_DATA_INTERFACE
-
-#define ARROW_FLAG_DICTIONARY_ORDERED 1
-#define ARROW_FLAG_NULLABLE 2
-#define ARROW_FLAG_MAP_KEYS_SORTED 4
-
-struct ArrowSchema {
-  // Array type description
-  const char* format;
-  const char* name;
-  const char* metadata;
-  int64_t flags;
-  int64_t n_children;
-  struct ArrowSchema** children;
-  struct ArrowSchema* dictionary;
-
-  // Release callback
-  void (*release)(struct ArrowSchema*);
-  // Opaque producer-specific data
-  void* private_data;
-};
-
-struct ArrowArray {
-  // Array data description
-  int64_t length;
-  int64_t null_count;
-  int64_t offset;
-  int64_t n_buffers;
-  int64_t n_children;
-  const void** buffers;
-  struct ArrowArray** children;
-  struct ArrowArray* dictionary;
-
-  // Release callback
-  void (*release)(struct ArrowArray*);
-  // Opaque producer-specific data
-  void* private_data;
-};
-
-#endif  // ARROW_C_DATA_INTERFACE
-
-#endif
-
-#define GEOARROW_GEOS_OK 0
-
-enum GeoArrowGEOSEncoding {
-  GEOARROW_GEOS_ENCODING_UNKNOWN = 0,
-  GEOARROW_GEOS_ENCODING_WKT,
-  GEOARROW_GEOS_ENCODING_WKB,
-  GEOARROW_GEOS_ENCODING_GEOARROW,
-  GEOARROW_GEOS_ENCODING_GEOARROW_INTERLEAVED
-};
-
-typedef int GeoArrowGEOSErrorCode;
-
-const char* GeoArrowGEOSVersionGEOS(void);
-
-const char* GeoArrowGEOSVersionGeoArrow(void);
-
-struct GeoArrowGEOSArrayBuilder;
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayBuilderCreate(
-    GEOSContextHandle_t handle, struct ArrowSchema* schema,
-    struct GeoArrowGEOSArrayBuilder** out);
-
-void GeoArrowGEOSArrayBuilderDestroy(struct GeoArrowGEOSArrayBuilder* builder);
-
-const char* GeoArrowGEOSArrayBuilderGetLastError(
-    struct GeoArrowGEOSArrayBuilder* builder);
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayBuilderAppend(
-    struct GeoArrowGEOSArrayBuilder* builder, const GEOSGeometry** geom, size_t geom_size,
-    size_t* n_appended);
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayBuilderFinish(
-    struct GeoArrowGEOSArrayBuilder* builder, struct ArrowArray* out);
-
-struct GeoArrowGEOSArrayReader;
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayReaderCreate(GEOSContextHandle_t handle,
-                                                    struct ArrowSchema* schema,
-                                                    struct GeoArrowGEOSArrayReader** out);
-
-const char* GeoArrowGEOSArrayReaderGetLastError(struct GeoArrowGEOSArrayReader* reader);
-
-GeoArrowGEOSErrorCode GeoArrowGEOSArrayReaderRead(struct GeoArrowGEOSArrayReader* reader,
-                                                  struct ArrowArray* array, size_t offset,
-                                                  size_t length, GEOSGeometry** out,
-                                                  size_t* n_out);
-
-void GeoArrowGEOSArrayReaderDestroy(struct GeoArrowGEOSArrayReader* reader);
-
-struct GeoArrowGEOSSchemaCalculator;
-
-GeoArrowGEOSErrorCode GeoArrowGEOSSchemaCalculatorCreate(
-    struct GeoArrowGEOSSchemaCalculator** out);
-
-void GeoArrowGEOSSchemaCalculatorIngest(struct GeoArrowGEOSSchemaCalculator* calc,
-                                        const int32_t* wkb_type, size_t n);
-
-GeoArrowGEOSErrorCode GeoArrowGEOSSchemaCalculatorFinish(
-    struct GeoArrowGEOSSchemaCalculator* calc, enum GeoArrowGEOSEncoding encoding,
-    struct ArrowSchema* out);
-
-void GeoArrowGEOSSchemaCalculatorDestroy(struct GeoArrowGEOSSchemaCalculator* calc);
-
-GeoArrowGEOSErrorCode GeoArrowGEOSMakeSchema(int32_t encoding, int32_t wkb_type,
-                                             struct ArrowSchema* out);
-
-static inline int32_t GeoArrowGEOSWKBType(GEOSContextHandle_t handle,
-                                          const GEOSGeometry* geom) {
-  if (geom == NULL || GEOSGetNumCoordinates_r(handle, geom) == 0) {
-    return 0;
-  }
-
-  int n_dim = GEOSGeom_getCoordinateDimension_r(handle, geom);
-
-  // Not sure how GEOS handles M in newer versions
-  int32_t wkb_type;
-  if (n_dim == 3) {
-    wkb_type = 2000;
-  } else {
-    wkb_type = 0;
-  }
-
-  int type_id = GEOSGeomTypeId_r(handle, geom);
-  switch (type_id) {
-    case GEOS_POINT:
-      wkb_type += 1;
-      break;
-    case GEOS_LINEARRING:
-    case GEOS_LINESTRING:
-      wkb_type += 2;
-      break;
-    case GEOS_POLYGON:
-      wkb_type += 3;
-      break;
-    case GEOS_MULTIPOINT:
-      wkb_type += 4;
-      break;
-    case GEOS_MULTILINESTRING:
-      wkb_type += 5;
-      break;
-    case GEOS_MULTIPOLYGON:
-      wkb_type += 6;
-      break;
-    case GEOS_GEOMETRYCOLLECTION:
-      wkb_type += 7;
-      break;
-    default:
-      break;
-  }
-
-  return wkb_type;
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/geoarrow_geos.hpp b/c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/geoarrow_geos.hpp
deleted file mode 100644
index 29c768cf7..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/geoarrow_geos.hpp
+++ /dev/null
@@ -1,211 +0,0 @@
-
-#include <vector>
-
-#include "geoarrow_geos.h"
-
-namespace geoarrow {
-
-namespace geos {
-
-class GeometryVector {
- public:
-  GeometryVector(GEOSContextHandle_t handle) : handle_(handle) {}
-
-  GeometryVector(GeometryVector&& rhs)
-      : handle_(rhs.handle_), data_(std::move(rhs.data_)) {
-    rhs.data_.clear();
-  }
-
-  GeometryVector(GeometryVector& rhs) = delete;
-
-  void reset(size_t offset, size_t length = 1) {
-    for (size_t i = 0; i < length; i++) {
-      GEOSGeometry* item = data_[offset + i];
-      if (item != nullptr) {
-        GEOSGeom_destroy_r(handle_, item);
-      }
-    }
-  }
-
-  ~GeometryVector() { reset(0, data_.size()); }
-
-  void reserve(size_t n) { data_.reserve(n); }
-
-  size_t size() { return data_.size(); }
-
-  GEOSGeometry* take_ownership_of(size_t i) {
-    GEOSGeometry* item = data_[i];
-    data_[i] = nullptr;
-    return item;
-  }
-
-  const GEOSGeometry* borrow(size_t i) { return data_[i]; }
-
-  void set(size_t i, GEOSGeometry* value) {
-    reset(i);
-    data_[i] = value;
-  }
-
-  const GEOSGeometry** data() { return const_cast<const GEOSGeometry**>(data_.data()); }
-
-  GEOSGeometry** mutable_data() { return data_.data(); }
-
-  void resize(size_t n) {
-    size_t current_size = size();
-    if (n >= current_size) {
-      data_.resize(n);
-      for (size_t i = current_size; i < n; i++) {
-        data_[i] = nullptr;
-      }
-    } else {
-      reset(n, current_size - n);
-      data_.resize(n);
-    }
-  }
-
- private:
-  GEOSContextHandle_t handle_;
-  std::vector<GEOSGeometry*> data_;
-};
-
-class ArrayBuilder {
- public:
-  ArrayBuilder() : builder_(nullptr) {}
-
-  ArrayBuilder(ArrayBuilder&& rhs) : builder_(rhs.builder_) { rhs.builder_ = nullptr; }
-
-  ArrayBuilder(ArrayBuilder& rhs) = delete;
-
-  ~ArrayBuilder() {
-    if (builder_ != nullptr) {
-      GeoArrowGEOSArrayBuilderDestroy(builder_);
-    }
-  }
-
-  const char* GetLastError() {
-    if (builder_ == nullptr) {
-      return "";
-    } else {
-      return GeoArrowGEOSArrayBuilderGetLastError(builder_);
-    }
-  }
-
-  GeoArrowGEOSErrorCode InitFromEncoding(GEOSContextHandle_t handle,
-                                         GeoArrowGEOSEncoding encoding,
-                                         int wkb_type = 0) {
-    ArrowSchema tmp_schema;
-    tmp_schema.release = nullptr;
-    int result = GeoArrowGEOSMakeSchema(encoding, wkb_type, &tmp_schema);
-    if (result != GEOARROW_GEOS_OK) {
-      return result;
-    }
-
-    result = InitFromSchema(handle, &tmp_schema);
-    tmp_schema.release(&tmp_schema);
-    return result;
-  }
-
-  GeoArrowGEOSErrorCode InitFromSchema(GEOSContextHandle_t handle, ArrowSchema* schema) {
-    if (builder_ != nullptr) {
-      GeoArrowGEOSArrayBuilderDestroy(builder_);
-    }
-
-    return GeoArrowGEOSArrayBuilderCreate(handle, schema, &builder_);
-  }
-
-  GeoArrowGEOSErrorCode Append(const GEOSGeometry** geom, size_t geom_size,
-                               size_t* n_appended) {
-    return GeoArrowGEOSArrayBuilderAppend(builder_, geom, geom_size, n_appended);
-  }
-
-  GeoArrowGEOSErrorCode Finish(struct ArrowArray* out) {
-    return GeoArrowGEOSArrayBuilderFinish(builder_, out);
-  }
-
- private:
-  GeoArrowGEOSArrayBuilder* builder_;
-};
-
-class ArrayReader {
- public:
-  ArrayReader() : reader_(nullptr) {}
-
-  ArrayReader(ArrayReader&& rhs) : reader_(rhs.reader_) { rhs.reader_ = nullptr; }
-
-  ArrayReader(ArrayReader& rhs) = delete;
-
-  ~ArrayReader() {
-    if (reader_ != nullptr) {
-      GeoArrowGEOSArrayReaderDestroy(reader_);
-    }
-  }
-
-  const char* GetLastError() {
-    if (reader_ == nullptr) {
-      return "";
-    } else {
-      return GeoArrowGEOSArrayReaderGetLastError(reader_);
-    }
-  }
-
-  GeoArrowGEOSErrorCode InitFromEncoding(GEOSContextHandle_t handle,
-                                         GeoArrowGEOSEncoding encoding,
-                                         int wkb_type = 0) {
-    ArrowSchema tmp_schema;
-    tmp_schema.release = nullptr;
-    int result = GeoArrowGEOSMakeSchema(encoding, wkb_type, &tmp_schema);
-    if (result != GEOARROW_GEOS_OK) {
-      return result;
-    }
-
-    result = InitFromSchema(handle, &tmp_schema);
-    tmp_schema.release(&tmp_schema);
-    return result;
-  }
-
-  GeoArrowGEOSErrorCode InitFromSchema(GEOSContextHandle_t handle, ArrowSchema* schema) {
-    if (reader_ != nullptr) {
-      GeoArrowGEOSArrayReaderDestroy(reader_);
-    }
-
-    return GeoArrowGEOSArrayReaderCreate(handle, schema, &reader_);
-  }
-
-  GeoArrowGEOSErrorCode Read(ArrowArray* array, int64_t offset, int64_t length,
-                             GEOSGeometry** out, size_t* n_out) {
-    return GeoArrowGEOSArrayReaderRead(reader_, array, offset, length, out, n_out);
-  }
-
- private:
-  GeoArrowGEOSArrayReader* reader_;
-};
-
-class SchemaCalculator {
- public:
-  SchemaCalculator() : calc_(nullptr) { GeoArrowGEOSSchemaCalculatorCreate(&calc_); }
-
-  SchemaCalculator(SchemaCalculator&& rhs) : calc_(rhs.calc_) { rhs.calc_ = nullptr; }
-
-  SchemaCalculator(SchemaCalculator& rhs) = delete;
-
-  ~SchemaCalculator() {
-    if (calc_ != nullptr) {
-      GeoArrowGEOSSchemaCalculatorDestroy(calc_);
-    }
-  }
-
-  void Ingest(const int32_t* wkb_type, size_t n) {
-    GeoArrowGEOSSchemaCalculatorIngest(calc_, wkb_type, n);
-  }
-
-  GeoArrowGEOSErrorCode Finish(enum GeoArrowGEOSEncoding encoding, ArrowSchema* out) {
-    return GeoArrowGEOSSchemaCalculatorFinish(calc_, encoding, out);
-  }
-
- private:
-  GeoArrowGEOSSchemaCalculator* calc_;
-};
-
-}  // namespace geos
-
-}  // namespace geoarrow
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/index_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/index_test.cu
new file mode 100644
index 000000000..42f5769e2
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/index_test.cu
@@ -0,0 +1,300 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "array_stream.hpp"
+#include "gpuspatial/index/rt_spatial_index.cuh"
+#include "test_common.hpp"
+
+#include <geos/geom/Envelope.h>
+#include <geos/index/ItemVisitor.h>
+#include <geos/index/strtree/STRtree.h>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <numeric>  // For std::iota
+#include <random>
+#include <vector>
+
+namespace gpuspatial {
+// Typed fixture: owns an RTEngine and an RTSpatialIndex configured to use that engine.
+template <typename T>
+struct SpatialIndexTest : public ::testing::Test {
+  using index_t = RTSpatialIndex<typename T::scalar_t, T::n_dim>;
+  std::shared_ptr<RTEngine> rt_engine;
+  index_t index;
+
+  SpatialIndexTest() {
+    auto ptx_root = TestUtils::GetTestShaderPath();
+    rt_engine = std::make_shared<RTEngine>();
+    rt_engine->Init(get_default_rt_config(ptx_root));
+    RTSpatialIndexConfig config;
+    config.rt_engine = rt_engine;
+    index = index_t(config);  // already a prvalue; std::move would be redundant
+  }
+};
+using PointTypes = ::testing::Types<Point<float, 2>, Point<double, 2>>;
+TYPED_TEST_SUITE(SpatialIndexTest, PointTypes);
+
+// Returns n degenerate boxes (min == max) with each coordinate uniform in [-180, 180).
+template <typename POINT_T>
+std::vector<Box<POINT_T>> GeneratePoints(size_t n, std::mt19937& rng) {
+  using scalar_t = typename POINT_T::scalar_t;
+  std::uniform_real_distribution<scalar_t> coord_dist(-180.0, 180.0);
+  std::vector<Box<POINT_T>> points(n);
+  for (auto& box : points) {
+    POINT_T p;
+    for (int dim = 0; dim < POINT_T::n_dim; dim++) {
+      p.set_coordinate(dim, coord_dist(rng));
+    }
+    box = Box<POINT_T>(p, p);
+  }
+  return points;
+}
+
+// Returns n boxes: min corner uniform in [-180, 180), extent per axis in [0, 100).
+template <typename POINT_T>
+std::vector<Box<POINT_T>> GenerateRects(size_t n, std::mt19937& rng) {
+  using scalar_t = typename POINT_T::scalar_t;
+  std::vector<Box<POINT_T>> rects(n);
+  // Both distributions are loop-invariant, so they are constructed once up front.
+  std::uniform_real_distribution<scalar_t> dist(-180.0, 180.0);
+  std::uniform_real_distribution<scalar_t> distSize(0.0, 100);
+  for (size_t i = 0; i < n; ++i) {
+    POINT_T min_pt, max_pt, size_pt;
+    for (int dim = 0; dim < POINT_T::n_dim; dim++) {
+      min_pt.set_coordinate(dim, dist(rng));
+      size_pt.set_coordinate(dim, distSize(rng));
+    }
+    max_pt = min_pt + size_pt;
+    rects[i] = Box<POINT_T>(min_pt, max_pt);
+  }
+  return rects;
+}
+
+// Computes the expected join result on the CPU: every (build, probe) index pair whose
+// boxes intersect, using a GEOS STRtree as the filter and Box::intersects to refine.
+// Matching pairs are appended to build_indices / probe_indices at the same position.
+// Only the x and y coordinates of each corner are consulted.
+template <typename POINT_T>
+void ComputeReference(const std::vector<Box<POINT_T>>& build,
+                      const std::vector<Box<POINT_T>>& probe,
+                      std::vector<uint32_t>& build_indices,
+                      std::vector<uint32_t>& probe_indices) {
+  // Converts a box to a GEOS envelope, passed as (min x, max x, min y, max y).
+  auto to_envelope = [](const Box<POINT_T>& box) {
+    auto lo = box.get_min();
+    auto hi = box.get_max();
+    return geos::geom::Envelope(lo.x(), hi.x(), lo.y(), hi.y());
+  };
+
+  geos::index::strtree::STRtree tree;
+  // insert() is handed the address of each envelope, so the envelopes are kept in a
+  // vector that lives as long as the tree is used. Reserving the full size up front
+  // means push_back never reallocates and the addresses stay valid.
+  std::vector<geos::geom::Envelope> build_envelopes;
+  build_envelopes.reserve(build.size());
+
+  // Build phase: the item stored with each envelope is the build index itself,
+  // carried through the tree's void* item slot.
+  for (uint32_t j = 0; j < build.size(); j++) {
+    build_envelopes.push_back(to_envelope(build[j]));
+    tree.insert(&build_envelopes[j],
+                reinterpret_cast<void*>(static_cast<uintptr_t>(j)));
+  }
+  tree.build();
+
+  // Visitor invoked by the tree for each candidate; keeps only real intersections.
+  class PairCollector : public geos::index::ItemVisitor {
+   public:
+    PairCollector(const std::vector<Box<POINT_T>>& build_boxes,
+                  const std::vector<Box<POINT_T>>& probe_boxes,
+                  std::vector<uint32_t>& build_out, std::vector<uint32_t>& probe_out)
+        : build_boxes_(build_boxes),
+          probe_boxes_(probe_boxes),
+          build_out_(build_out),
+          probe_out_(probe_out) {}
+
+    // Index of the probe box currently being queried; set before each query().
+    uint32_t current_probe_idx = 0;
+
+    void visitItem(void* item) override {
+      // Recover the build index that was stored as the item pointer.
+      auto build_idx = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(item));
+      if (!build_boxes_[build_idx].intersects(probe_boxes_[current_probe_idx])) return;
+      build_out_.push_back(build_idx);
+      probe_out_.push_back(current_probe_idx);
+    }
+
+   private:
+    const std::vector<Box<POINT_T>>& build_boxes_;
+    const std::vector<Box<POINT_T>>& probe_boxes_;
+    std::vector<uint32_t>& build_out_;
+    std::vector<uint32_t>& probe_out_;
+  };
+  PairCollector collector(build, probe, build_indices, probe_indices);
+
+  // Probe phase: query the tree once per probe box.
+  for (uint32_t i = 0; i < probe.size(); i++) {
+    // A stack envelope is enough here: it outlives the query() call that reads it.
+    geos::geom::Envelope search_env = to_envelope(probe[i]);
+    collector.current_probe_idx = i;
+    tree.query(&search_env, collector);
+  }
+}
+
+// Sorts two parallel vectors in place as a sequence of (v1[i], v2[i]) pairs, ordered
+// by v1 and then by v2. If the sizes differ, both vectors are left untouched.
+template <typename T, typename U>
+void sort_vectors(std::vector<T>& v1, std::vector<U>& v2) {
+  if (v1.size() != v2.size()) return;
+
+  // Sort a permutation of [0, N) rather than the data, so both vectors move together.
+  std::vector<size_t> p(v1.size());
+  std::iota(p.begin(), p.end(), 0);
+  std::sort(p.begin(), p.end(), [&](size_t i, size_t j) {
+    if (v1[i] != v1[j]) return v1[i] < v1[j];  // Primary sort by v1
+    return v2[i] < v2[j];                      // Secondary sort by v2
+  });
+
+  // Apply the permutation out of place. Each output uses its own element type: v2
+  // holds U, so its buffer must be std::vector<U> for the move-assignment below to
+  // compile when T and U differ.
+  std::vector<T> sorted_v1;
+  std::vector<U> sorted_v2;
+  sorted_v1.reserve(v1.size());
+  sorted_v2.reserve(v2.size());
+  for (size_t i : p) {
+    sorted_v1.push_back(v1[i]);
+    sorted_v2.push_back(v2[i]);
+  }
+
+  v1 = std::move(sorted_v1);
+  v2 = std::move(sorted_v2);
+}
+
+// Point build side vs. point probe side. Every probe set additionally receives a copy
+// of the last 20% (rounded down) of the build points, so exact matches are exercised.
+TYPED_TEST(SpatialIndexTest, PointPoint) {
+  using point_t = TypeParam;
+  std::mt19937 rng(0);
+
+  for (int n_build = 1; n_build <= 10000; n_build *= 2) {
+    auto builds = GeneratePoints<point_t>(n_build, rng);
+    this->index.Clear();
+    this->index.PushBuild(builds.data(), builds.size());
+    this->index.FinishBuilding();
+
+    // Number of trailing build points duplicated into each probe set.
+    const auto n_shared = static_cast<size_t>(builds.size() * 0.2);
+
+    for (int n_probe = 1; n_probe <= 10000; n_probe *= 2) {
+      auto probes = GeneratePoints<point_t>(n_probe, rng);
+      probes.insert(probes.end(), builds.end() - n_shared, builds.end());
+
+      // Result from the index under test, sorted into a canonical order.
+      std::vector<uint32_t> build_indices, probe_indices;
+      this->index.Probe(probes.data(), probes.size(), &build_indices, &probe_indices);
+      sort_vectors(build_indices, probe_indices);
+
+      // Reference result, sorted the same way so the two can be compared directly.
+      std::vector<uint32_t> ref_build_indices, ref_probe_indices;
+      ComputeReference(builds, probes, ref_build_indices, ref_probe_indices);
+      sort_vectors(ref_build_indices, ref_probe_indices);
+
+      ASSERT_EQ(build_indices, ref_build_indices);
+      ASSERT_EQ(probe_indices, ref_probe_indices);
+    }
+  }
+}
+
+// Box build side vs. point probe side, cross-checked against ComputeReference.
+TYPED_TEST(SpatialIndexTest, BoxPoint) {
+  using point_t = TypeParam;
+  std::mt19937 rng(0);
+  for (int n_build = 1; n_build <= 10000; n_build *= 2) {
+    auto builds = GenerateRects<point_t>(n_build, rng);
+    this->index.Clear();
+    this->index.PushBuild(builds.data(), builds.size());
+    this->index.FinishBuilding();
+
+    for (int n_probe = 1; n_probe <= 10000; n_probe *= 2) {
+      auto probes = GeneratePoints<point_t>(n_probe, rng);
+      std::vector<uint32_t> build_indices, probe_indices;
+      this->index.Probe(probes.data(), probes.size(), &build_indices, &probe_indices);
+      sort_vectors(build_indices, probe_indices);
+      // Reference pairs, sorted the same way so the vectors compare element-wise.
+      std::vector<uint32_t> ref_build_indices, ref_probe_indices;
+      ComputeReference(builds, probes, ref_build_indices, ref_probe_indices);
+      sort_vectors(ref_build_indices, ref_probe_indices);
+
+      ASSERT_EQ(build_indices, ref_build_indices);
+      ASSERT_EQ(probe_indices, ref_probe_indices);
+    }
+  }
+}
+
+// Point build side vs. box probe side, cross-checked against ComputeReference.
+TYPED_TEST(SpatialIndexTest, PointBox) {
+  using point_t = TypeParam;
+  std::mt19937 rng(0);
+  for (int n_build = 1; n_build <= 10000; n_build *= 2) {
+    auto builds = GeneratePoints<point_t>(n_build, rng);
+    this->index.Clear();
+    this->index.PushBuild(builds.data(), builds.size());
+    this->index.FinishBuilding();
+
+    for (int n_probe = 1; n_probe <= 10000; n_probe *= 2) {
+      auto probes = GenerateRects<point_t>(n_probe, rng);
+      std::vector<uint32_t> build_indices, probe_indices;
+      this->index.Probe(probes.data(), probes.size(), &build_indices, &probe_indices);
+      sort_vectors(build_indices, probe_indices);
+      // Reference pairs, sorted the same way so the vectors compare element-wise.
+      std::vector<uint32_t> ref_build_indices, ref_probe_indices;
+      ComputeReference(builds, probes, ref_build_indices, ref_probe_indices);
+      sort_vectors(ref_build_indices, ref_probe_indices);
+
+      ASSERT_EQ(build_indices, ref_build_indices);
+      ASSERT_EQ(probe_indices, ref_probe_indices);
+    }
+  }
+}
+
+// Box build side vs. box probe side, cross-checked against ComputeReference.
+TYPED_TEST(SpatialIndexTest, BoxBox) {
+  using point_t = TypeParam;
+  std::mt19937 rng(0);
+  for (int n_build = 1; n_build <= 10000; n_build *= 2) {
+    auto builds = GenerateRects<point_t>(n_build, rng);
+    this->index.Clear();
+    this->index.PushBuild(builds.data(), builds.size());
+    this->index.FinishBuilding();
+
+    for (int n_probe = 1; n_probe <= 10000; n_probe *= 2) {
+      auto probes = GenerateRects<point_t>(n_probe, rng);
+      std::vector<uint32_t> build_indices, probe_indices;
+      this->index.Probe(probes.data(), probes.size(), &build_indices, &probe_indices);
+      sort_vectors(build_indices, probe_indices);
+      // Reference pairs, sorted the same way so the vectors compare element-wise.
+      std::vector<uint32_t> ref_build_indices, ref_probe_indices;
+      ComputeReference(builds, probes, ref_build_indices, ref_probe_indices);
+      sort_vectors(ref_build_indices, ref_probe_indices);
+
+      ASSERT_EQ(build_indices, ref_build_indices);
+      ASSERT_EQ(probe_indices, ref_probe_indices);
+    }
+  }
+}
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/joiner_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/joiner_test.cu
deleted file mode 100644
index bbf415592..000000000
--- a/c/sedona-libgpuspatial/libgpuspatial/test/joiner_test.cu
+++ /dev/null
@@ -1,438 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#include "array_stream.hpp"
-#include "gpuspatial/index/spatial_joiner.cuh"
-#include "gpuspatial/loader/device_geometries.cuh"
-#include "test_common.hpp"
-
-#include "geoarrow_geos/geoarrow_geos.hpp"
-#include "nanoarrow/nanoarrow.hpp"
-
-#include <geoarrow/geoarrow.h>
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include <numeric>  // For std::iota
-
-namespace gpuspatial {
-// Function to read a single Parquet file and extract a column.
-static arrow::Status ReadParquetFromFile(
-    arrow::fs::FileSystem* fs,     // 1. Filesystem pointer (e.g., LocalFileSystem)
-    const std::string& file_path,  // 2. Single file path instead of a folder
-    int64_t batch_size, const char* column_name,
-    std::vector<std::shared_ptr<arrow::Array>>& out_arrays) {
-  // 1. Get FileInfo for the single path
-  ARROW_ASSIGN_OR_RAISE(auto file_info, fs->GetFileInfo(file_path));
-
-  // Check if the path points to a file
-  if (file_info.type() != arrow::fs::FileType::File) {
-    return arrow::Status::Invalid("Path is not a file: ", file_path);
-  }
-
-  std::cout << "--- Processing Parquet file: " << file_path << " ---" << std::endl;
-
-  // 2. Open the input file
-  ARROW_ASSIGN_OR_RAISE(auto input_file, fs->OpenInputFile(file_info));
-
-  // 3. Open the Parquet file and create an Arrow reader
-  ARROW_ASSIGN_OR_RAISE(auto arrow_reader, parquet::arrow::OpenFile(
-                                               input_file, arrow::default_memory_pool()));
-
-  // 4. Set the batch size
-  arrow_reader->set_batch_size(batch_size);
-
-  // 5. Get the RecordBatchReader
-  auto rb_reader = arrow_reader->GetRecordBatchReader().ValueOrDie();
-  // 6. Read all record batches and extract the column
-  while (true) {
-    std::shared_ptr<arrow::RecordBatch> batch;
-
-    // Read the next batch
-    ARROW_THROW_NOT_OK(rb_reader->ReadNext(&batch));
-
-    // Check for end of stream
-    if (!batch) {
-      break;
-    }
-
-    // Extract the specified column and add to the output vector
-    std::shared_ptr<arrow::Array> column_array = batch->GetColumnByName(column_name);
-    if (!column_array) {
-      return arrow::Status::Invalid("Column not found: ", column_name);
-    }
-    out_arrays.push_back(column_array);
-  }
-
-  std::cout << "Finished reading. Total arrays extracted: " << out_arrays.size()
-            << std::endl;
-  return arrow::Status::OK();
-}
-
-using GeosBinaryPredicateFn = char (*)(GEOSContextHandle_t, const GEOSGeometry*,
-                                       const GEOSGeometry*);
-static GeosBinaryPredicateFn GetGeosPredicateFn(Predicate predicate) {
-  switch (predicate) {
-    case Predicate::kContains:
-      return &GEOSContains_r;
-    case Predicate::kIntersects:
-      return &GEOSIntersects_r;
-    case Predicate::kWithin:
-      return &GEOSWithin_r;
-    case Predicate::kEquals:
-      return &GEOSEquals_r;
-    case Predicate::kTouches:
-      return &GEOSTouches_r;
-    default:
-      throw std::out_of_range("Unsupported GEOS predicate enumeration value.");
-  }
-}
-
-void TestJoiner(const std::string& build_parquet_path,
-                const std::string& stream_parquet_path, Predicate predicate,
-                int batch_size = 10) {
-  using namespace TestUtils;
-  auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
-  SpatialJoiner::SpatialJoinerConfig config;
-  std::string ptx_root = TestUtils::GetTestShaderPath();
-
-  config.ptx_root = ptx_root.c_str();
-  SpatialJoiner spatial_joiner;
-
-  spatial_joiner.Init(&config);
-  spatial_joiner.Clear();
-
-  geoarrow::geos::ArrayReader reader;
-
-  class GEOSCppHandle {
-   public:
-    GEOSContextHandle_t handle;
-
-    GEOSCppHandle() { handle = GEOS_init_r(); }
-
-    ~GEOSCppHandle() { GEOS_finish_r(handle); }
-  };
-  GEOSCppHandle handle;
-
-  reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB);
-
-  geoarrow::geos::GeometryVector geom_build(handle.handle);
-
-  auto get_total_length = [](const std::vector<std::shared_ptr<arrow::Array>>& arrays) {
-    size_t total_length = 0;
-    for (const auto& array : arrays) {
-      total_length += array->length();
-    }
-    return total_length;
-  };
-
-  std::vector<std::shared_ptr<arrow::Array>> build_arrays;
-  ARROW_THROW_NOT_OK(ReadParquetFromFile(fs.get(), build_parquet_path, batch_size,
-                                         "geometry", build_arrays));
-
-  // Using GEOS for reference
-  geom_build.resize(get_total_length(build_arrays));
-  size_t tail_build = 0;
-  auto* tree = GEOSSTRtree_create_r(handle.handle, 10);
-
-  for (auto& array : build_arrays) {
-    nanoarrow::UniqueArray unique_array;
-    nanoarrow::UniqueSchema unique_schema;
-
-    ARROW_THROW_NOT_OK(
-        arrow::ExportArray(*array, unique_array.get(), unique_schema.get()));
-
-    spatial_joiner.PushBuild(unique_schema.get(), unique_array.get(), 0,
-                             unique_array->length);
-
-    // geos for reference
-    size_t n_build;
-
-    ASSERT_EQ(reader.Read(unique_array.get(), 0, unique_array->length,
-                          geom_build.mutable_data() + tail_build, &n_build),
-              GEOARROW_GEOS_OK);
-
-    for (size_t offset = tail_build; offset < tail_build + n_build; offset++) {
-      auto* geom = geom_build.borrow(offset);
-      auto* box = GEOSEnvelope_r(handle.handle, geom);
-      GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset);
-      GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom);
-      GEOSGeom_destroy_r(handle.handle, box);
-    }
-    tail_build += n_build;
-  }
-  spatial_joiner.FinishBuilding();
-  ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1);
-
-  std::vector<std::shared_ptr<arrow::Array>> stream_arrays;
-  ARROW_THROW_NOT_OK(ReadParquetFromFile(
-      fs.get(), stream_parquet_path, batch_size, "geometry", stream_arrays));
-  int array_index_offset = 0;
-  auto context = spatial_joiner.CreateContext();
-
-  for (auto& array : stream_arrays) {
-    nanoarrow::UniqueArray unique_array;
-    nanoarrow::UniqueSchema unique_schema;
-
-    ARROW_THROW_NOT_OK(
-        arrow::ExportArray(*array, unique_array.get(), unique_schema.get()));
-    std::vector<uint32_t> build_indices, stream_indices;
-
-    spatial_joiner.PushStream(context.get(), unique_schema.get(), unique_array.get(), 0,
-                              unique_array->length, predicate, &build_indices,
-                              &stream_indices, array_index_offset);
-
-    geoarrow::geos::GeometryVector geom_stream(handle.handle);
-    size_t n_stream;
-    geom_stream.resize(array->length());
-    ASSERT_EQ(reader.Read(unique_array.get(), 0, unique_array->length,
-                          geom_stream.mutable_data(), &n_stream),
-              GEOARROW_GEOS_OK);
-    struct Payload {
-      GEOSContextHandle_t handle;
-      const GEOSGeometry* geom;
-      int64_t stream_index_offset;
-      std::vector<uint32_t> build_indices;
-      std::vector<uint32_t> stream_indices;
-      Predicate predicate;
-    };
-
-    Payload payload;
-    payload.predicate = predicate;
-    payload.handle = handle.handle;
-
-    payload.stream_index_offset = array_index_offset;
-
-    for (size_t offset = 0; offset < n_stream; offset++) {
-      auto* geom = geom_stream.borrow(offset);
-      GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset);
-      payload.geom = geom;
-
-      GEOSSTRtree_query_r(
-          handle.handle, tree, geom,
-          [](void* item, void* data) {
-            auto* geom_build = (GEOSGeometry*)item;
-            auto* payload = (Payload*)data;
-            auto* geom_stream = payload->geom;
-
-            if (GetGeosPredicateFn(payload->predicate)(payload->handle, geom_build,
-                                                       geom_stream) == 1) {
-              auto build_id = (size_t)GEOSGeom_getUserData_r(payload->handle, geom_build);
-              auto stream_id =
-                  (size_t)GEOSGeom_getUserData_r(payload->handle, geom_stream);
-              payload->build_indices.push_back(build_id);
-              payload->stream_indices.push_back(payload->stream_index_offset + stream_id);
-            }
-          },
-          (void*)&payload);
-    }
-
-    ASSERT_EQ(payload.build_indices.size(), build_indices.size());
-    ASSERT_EQ(payload.stream_indices.size(), stream_indices.size());
-    sort_vectors_by_index(payload.build_indices, payload.stream_indices);
-    sort_vectors_by_index(build_indices, stream_indices);
-    for (size_t j = 0; j < build_indices.size(); j++) {
-      ASSERT_EQ(payload.build_indices[j], build_indices[j]);
-      ASSERT_EQ(payload.stream_indices[j], stream_indices[j]);
-    }
-    array_index_offset += array->length();
-  }
-  GEOSSTRtree_destroy_r(handle.handle, tree);
-}
-
-TEST(JoinerTest, PIPContainsParquet) {
-  using namespace TestUtils;
-  auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
-
-  std::vector<std::string> polys{
-      GetTestDataPath("cities/natural-earth_cities_geo.parquet"),
-      GetTestDataPath("countries/natural-earth_countries_geo.parquet")};
-  std::vector<std::string> points{GetTestDataPath("cities/generated_points.parquet"),
-                                  GetTestDataPath("countries/generated_points.parquet")};
-
-  for (int i = 0; i < polys.size(); i++) {
-    auto poly_path = TestUtils::GetTestDataPath(polys[i]);
-    auto point_path = TestUtils::GetCanonicalPath(points[i]);
-    TestJoiner(poly_path, point_path, Predicate::kContains, 10);
-  }
-}
-
-TEST(JoinerTest, PIPWithinParquet) {
-  using namespace TestUtils;
-  auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
-
-  std::vector<std::string> polys{
-      GetTestDataPath("cities/natural-earth_cities_geo.parquet"),
-      GetTestDataPath("countries/natural-earth_countries_geo.parquet")};
-  std::vector<std::string> points{GetTestDataPath("cities/generated_points.parquet"),
-                                  GetTestDataPath("countries/generated_points.parquet")};
-
-  for (int i = 0; i < polys.size(); i++) {
-    auto poly_path = TestUtils::GetTestDataPath(polys[i]);
-    auto point_path = TestUtils::GetCanonicalPath(points[i]);
-    TestJoiner(point_path, poly_path, Predicate::kWithin, 10);
-  }
-}
-
-TEST(JoinerTest, PolyPointIntersectsParquet) {
-  using namespace TestUtils;
-  auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
-
-  std::vector<std::string> polys{
-      GetTestDataPath("cities/natural-earth_cities_geo.parquet"),
-      GetTestDataPath("countries/natural-earth_countries_geo.parquet")};
-  std::vector<std::string> points{GetTestDataPath("cities/generated_points.parquet"),
-                                  GetTestDataPath("countries/generated_points.parquet")};
-
-  for (int i = 0; i < polys.size(); i++) {
-    auto poly_path = TestUtils::GetTestDataPath(polys[i]);
-    auto point_path = TestUtils::GetCanonicalPath(points[i]);
-    TestJoiner(point_path, poly_path, Predicate::kIntersects, 10);
-  }
-}
-
-TEST(JoinerTest, PolygonPolygonContains) {
-  SpatialJoiner::SpatialJoinerConfig config;
-  std::string ptx_root = TestUtils::GetTestShaderPath();
-  config.ptx_root = ptx_root.c_str();
-  SpatialJoiner spatial_joiner;
-
-  nanoarrow::UniqueArrayStream poly1_stream, poly2_stream;
-
-  auto poly1_path = TestUtils::GetTestDataPath("arrowipc/test_polygons1.arrows");
-  auto poly2_path = TestUtils::GetTestDataPath("arrowipc/test_polygons2.arrows");
-
-  ArrayStreamFromIpc(poly1_path, "geometry", poly1_stream.get());
-  ArrayStreamFromIpc(poly2_path, "geometry", poly2_stream.get());
-
-  nanoarrow::UniqueSchema build_schema, stream_schema;
-  nanoarrow::UniqueArray build_array, stream_array;
-  ArrowError error;
-  ArrowErrorSet(&error, "");
-  int n_row_groups = 100;
-  int array_index_offset = 0;
-  std::vector<uint32_t> build_indices, stream_indices;
-  geoarrow::geos::ArrayReader reader;
-
-  class GEOSCppHandle {
-   public:
-    GEOSContextHandle_t handle;
-
-    GEOSCppHandle() { handle = GEOS_init_r(); }
-
-    ~GEOSCppHandle() { GEOS_finish_r(handle); }
-  };
-  GEOSCppHandle handle;
-
-  reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB);
-
-  geoarrow::geos::GeometryVector geom_polygons1(handle.handle);
-  geoarrow::geos::GeometryVector geom_polygons2(handle.handle);
-  struct Payload {
-    GEOSContextHandle_t handle;
-    const GEOSGeometry* geom;
-    int64_t build_index_offset;
-    int64_t stream_index_offset;
-    std::vector<int64_t> build_indices;
-    std::vector<int64_t> stream_indices;
-  };
-
-  int64_t build_count = 0;
-  spatial_joiner.Init(&config);
-  for (int i = 0; i < n_row_groups; i++) {
-    ASSERT_EQ(ArrowArrayStreamGetNext(poly1_stream.get(), build_array.get(), &error),
-              NANOARROW_OK);
-    ASSERT_EQ(ArrowArrayStreamGetSchema(poly1_stream.get(), build_schema.get(), &error),
-              NANOARROW_OK);
-
-    ASSERT_EQ(ArrowArrayStreamGetNext(poly2_stream.get(), stream_array.get(), &error),
-              NANOARROW_OK);
-    ASSERT_EQ(ArrowArrayStreamGetSchema(poly2_stream.get(), stream_schema.get(), &error),
-              NANOARROW_OK);
-
-    spatial_joiner.Clear();
-    spatial_joiner.PushBuild(nullptr, build_array.get(), 0, build_array->length);
-    auto context = spatial_joiner.CreateContext();
-
-    build_indices.clear();
-    stream_indices.clear();
-    spatial_joiner.FinishBuilding();
-    spatial_joiner.PushStream(context.get(), nullptr, stream_array.get(), 0,
-                              stream_array->length, Predicate::kContains, &build_indices,
-                              &stream_indices, array_index_offset);
-    geom_polygons1.resize(build_array->length);
-    geom_polygons2.resize(stream_array->length);
-
-    size_t n_polygons1 = 0, n_polygons2 = 0;
-    ASSERT_EQ(reader.Read(build_array.get(), 0, build_array->length,
-                          geom_polygons1.mutable_data(), &n_polygons1),
-              GEOARROW_GEOS_OK);
-    ASSERT_EQ(reader.Read(stream_array.get(), 0, stream_array->length,
-                          geom_polygons2.mutable_data(), &n_polygons2),
-              GEOARROW_GEOS_OK);
-
-    auto* tree = GEOSSTRtree_create_r(handle.handle, 10);
-
-    for (size_t j = 0; j < n_polygons1; j++) {
-      auto* geom_polygon = geom_polygons1.borrow(j);
-      auto* box = GEOSEnvelope_r(handle.handle, geom_polygon);
-      GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom_polygon, (void*)j);
-      GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom_polygon);
-      GEOSGeom_destroy_r(handle.handle, box);
-    }
-    ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1);
-
-    Payload payload;
-    payload.handle = handle.handle;
-
-    payload.build_index_offset = build_count;
-    payload.stream_index_offset = array_index_offset;
-
-    for (size_t j = 0; j < n_polygons2; j++) {
-      auto* geom_poly2 = geom_polygons2.borrow(j);
-      GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom_poly2, (void*)j);
-
-      payload.geom = geom_poly2;
-
-      GEOSSTRtree_query_r(
-          handle.handle, tree, geom_poly2,
-          [](void* item, void* data) {
-            auto* polygon1 = (GEOSGeometry*)item;
-            auto* payload = (Payload*)data;
-            auto* polygon2 = payload->geom;
-
-            if (GEOSContains_r(payload->handle, polygon1, polygon2) == 1) {
-              auto polygon1_id =
-                  (size_t)GEOSGeom_getUserData_r(payload->handle, polygon1);
-              auto polygon2_id =
-                  (size_t)GEOSGeom_getUserData_r(payload->handle, polygon2);
-              payload->build_indices.push_back(payload->build_index_offset + polygon1_id);
-              payload->stream_indices.push_back(payload->stream_index_offset +
-                                                polygon2_id);
-            }
-          },
-          (void*)&payload);
-    }
-
-    GEOSSTRtree_destroy_r(handle.handle, tree);
-
-    ASSERT_EQ(payload.build_indices.size(), build_indices.size());
-
-    build_count += build_array->length;
-    array_index_offset += stream_array->length;
-  }
-}
-
-}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu
index f8a762974..2d2acf237 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu
@@ -15,13 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 #include "array_stream.hpp"
-#include "gpuspatial/geom/geometry_collection.cuh"
-#include "gpuspatial/geom/multi_polygon.cuh"
-#include "gpuspatial/loader/device_geometries.cuh"
-#include "gpuspatial/utils/pinned_vector.h"
+#include "gpuspatial/geom/geometry_collection.hpp"
+#include "gpuspatial/geom/multi_polygon.hpp"
+#include "gpuspatial/loader/device_geometries.hpp"
+#include "gpuspatial/utils/pinned_vector.hpp"
 #include "nanoarrow/nanoarrow.hpp"
 
-#include "gpuspatial/geom/multi_point.cuh"
+#include "gpuspatial/geom/multi_point.hpp"
 #include "test_common.hpp"
 
 #include <geoarrow/geoarrow.h>
@@ -34,7 +34,7 @@
 #include <iomanip>
 #include <iostream>
 #include <vector>
-#include "gpuspatial/loader/parallel_wkb_loader.h"
+#include "gpuspatial/loader/parallel_wkb_loader.hpp"
 namespace gpuspatial {
 
 template <typename T>
@@ -62,11 +62,20 @@ TYPED_TEST(WKBLoaderTest, Point) {
     nanoarrow::UniqueArray array;
+    nanoarrow::UniqueSchema schema;  // per batch: GetSchema overwrites its output
     ArrowError error;
     ArrowErrorSet(&error, "");
-    EXPECT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+    ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+        << error.message;
     if (array->length == 0) {
       break;
     }
-    loader.Parse(cuda_stream, array.get(), 0, array->length);
+    nanoarrow::UniqueArrayView array_view;
+    ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error), NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+        << error.message;
+    loader.Parse(cuda_stream, array_view.get(), 0, array_view->length);
   }
 
   auto geometries = loader.Finish(cuda_stream);
@@ -103,13 +112,22 @@ TYPED_TEST(WKBLoaderTest, MultiPoint) {
 
   while (1) {
     nanoarrow::UniqueArray array;
+    nanoarrow::UniqueSchema schema;
     ArrowError error;
     ArrowErrorSet(&error, "");
-    EXPECT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+    ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+        << error.message;
     if (array->length == 0) {
       break;
     }
-    loader.Parse(cuda_stream, array.get(), 0, array->length);
+    nanoarrow::UniqueArrayView array_view;
+    ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error), NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+        << error.message;
+    loader.Parse(cuda_stream, array_view.get(), 0, array_view->length);
   }
 
   auto geometries = loader.Finish(cuda_stream);
@@ -158,11 +176,20 @@ TYPED_TEST(WKBLoaderTest, PointMultiPoint) {
     nanoarrow::UniqueArray array;
+    nanoarrow::UniqueSchema schema;  // per batch: GetSchema overwrites its output
     ArrowError error;
     ArrowErrorSet(&error, "");
-    EXPECT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+    ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+        << error.message;
     if (array->length == 0) {
       break;
     }
-    loader.Parse(cuda_stream, array.get(), 0, array->length);
+    nanoarrow::UniqueArrayView array_view;
+    ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error), NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+        << error.message;
+    loader.Parse(cuda_stream, array_view.get(), 0, array_view->length);
   }
 
   auto geometries = loader.Finish(cuda_stream);
@@ -207,6 +234,7 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderWithHoles) {
       GEOARROW_TYPE_WKB, stream.get());
 
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
   ArrowError error;
   ArrowErrorSet(&error, "");
 
@@ -215,9 +243,16 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderWithHoles) {
 
   loader.Init();
 
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
-
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  nanoarrow::UniqueArrayView array_view;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  loader.Parse(cuda_stream, array_view.get(), 0, array_view->length);
   auto geometries = loader.Finish(cuda_stream);
 
   auto points = TestUtils::ToVector(cuda_stream, geometries.get_points());
@@ -327,17 +362,26 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderMultipolygon) {
       GEOARROW_TYPE_WKB, stream.get());
 
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
   ArrowError error;
   ArrowErrorSet(&error, "");
 
   rmm::cuda_stream cuda_stream;
 
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
 
+  nanoarrow::UniqueArrayView array_view;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
   ParallelWkbLoader<point_t, index_t> loader;
 
   loader.Init();
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  loader.Parse(cuda_stream, array_view.get(), 0, array_view->length);
 
   auto geometries = loader.Finish(cuda_stream);
   const auto& offsets = geometries.get_offsets();
@@ -431,6 +475,7 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderMultipolygonLocate) {
       GEOARROW_TYPE_WKB, stream.get());
 
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
   ArrowError error;
   ArrowErrorSet(&error, "");
 
@@ -438,9 +483,17 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderMultipolygonLocate) {
   rmm::cuda_stream cuda_stream;
 
   loader.Init();
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
-
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  nanoarrow::UniqueArrayView array_view;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error),
+            NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  loader.Parse(cuda_stream, array_view.get(), 0, array_view->length);
 
   auto geometries = loader.Finish(cuda_stream);
   const auto& offsets = geometries.get_offsets();
@@ -498,18 +551,25 @@ TYPED_TEST(WKBLoaderTest, MixTypes) {
       },
       GEOARROW_TYPE_WKB, stream.get());
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
   ArrowError error;
   ArrowErrorSet(&error, "");
 
   rmm::cuda_stream cuda_stream;
-
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
-
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  nanoarrow::UniqueArrayView array_view;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
   ParallelWkbLoader<point_t, index_t> loader;
 
   loader.Init();
 
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  loader.Parse(cuda_stream, array_view.get(), 0, array_view->length);
   auto geometries = loader.Finish(cuda_stream);
   const auto& offsets = geometries.get_offsets();
 
@@ -598,19 +658,26 @@ TYPED_TEST(WKBLoaderTest, GeomCollection) {
         "MULTIPOLYGON(((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 30, 15 5), (20 15, 35 15, 35 25, 20 25, 20 15)))"}},
       GEOARROW_TYPE_WKB, stream.get());
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
   ArrowError error;
   ArrowErrorSet(&error, "");
 
   rmm::cuda_stream cuda_stream;
-
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
-
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  nanoarrow::UniqueArrayView array_view;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
   ParallelWkbLoader<point_t, index_t> loader;
   typename ParallelWkbLoader<point_t, index_t>::Config config;
 
   loader.Init(config);
 
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  loader.Parse(cuda_stream, array_view.get(), 0, array_view->length);
   auto geometries = loader.Finish(cuda_stream);
 
   const auto& offsets = geometries.get_offsets();
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/main.cc b/c/sedona-libgpuspatial/libgpuspatial/test/main.cc
index a8b3c21f3..f89c68fcf 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/main.cc
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/main.cc
@@ -17,6 +17,8 @@
 #include <filesystem>  // Requires C++17
 #include <iostream>
 #include <string>
+
+#include "gpuspatial_testing.hpp"
 #include "gtest/gtest.h"
 
 namespace TestUtils {
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/refiner_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/refiner_test.cu
new file mode 100644
index 000000000..b724bdcb0
--- /dev/null
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/refiner_test.cu
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "array_stream.hpp"
+#include "test_common.hpp"
+
+#include "gpuspatial/index/rt_spatial_index.hpp"
+#include "gpuspatial/loader/device_geometries.hpp"
+#include "gpuspatial/refine/rt_spatial_refiner.hpp"
+
+#include "nanoarrow/nanoarrow.hpp"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "gpuspatial/index/rt_spatial_index.cuh"
+#include "gpuspatial/refine/rt_spatial_refiner.cuh"
+
+namespace gpuspatial {
+
+void TestJoiner(ArrowSchema* build_schema, std::vector<ArrowArray*>& build_arrays,
+                ArrowSchema* probe_schema, std::vector<ArrowArray*>& probe_arrays,
+                Predicate predicate, bool pipelined = false) {
+  using namespace TestUtils;
+  using coord_t = double;
+  using fpoint_t = Point<coord_t, 2>;
+  using box_t = Box<fpoint_t>;
+  // One RTEngine is shared by the spatial index and the refiner created below.
+  auto rt_engine = std::make_shared<RTEngine>();
+  {
+    std::string ptx_root = TestUtils::GetTestShaderPath();
+    auto config = get_default_rt_config(ptx_root);
+    rt_engine->Init(config);
+  }
+
+  RTSpatialIndexConfig idx_config;
+  idx_config.rt_engine = rt_engine;
+  auto rt_index = CreateRTSpatialIndex<coord_t, 2>(idx_config);
+
+  RTSpatialRefinerConfig refiner_config;
+  refiner_config.rt_engine = rt_engine;
+  if (pipelined) {
+    refiner_config.pipeline_batches = 10;
+  }
+  auto rt_refiner = CreateRTSpatialRefiner(refiner_config);
+
+  // Initialize GEOS C++ components
+  auto geos_factory = geos::geom::GeometryFactory::create();
+  geos::io::WKBReader wkb_reader(*geos_factory);
+  geos::index::strtree::STRtree tree(10);
+
+  // Storage for GEOS geometries to ensure they outlive the tree
+  // The STRtree stores raw pointers, so we must own the objects
+  std::vector<std::unique_ptr<geos::geom::Geometry>> build_geoms_storage;
+  size_t total_build_length = 0;
+  for (auto& array : build_arrays) {
+    total_build_length += array->length;
+  }
+  build_geoms_storage.reserve(total_build_length);
+
+  size_t tail_build = 0;
+  ArrowError error;
+
+  // --- Build Phase: push every build batch into the GPU index and the refiner ---
+  for (auto& array : build_arrays) {
+    nanoarrow::UniqueArrayView array_view;
+    ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), build_schema, &error),
+              NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array, &error), NANOARROW_OK)
+        << error.message;
+
+    std::vector<box_t> rects;
+    rects.reserve(array->length);
+
+    for (int64_t i = 0; i < array->length; i++) {
+      // Parse WKB
+      ArrowStringView wkb_view = ArrowArrayViewGetStringUnsafe(array_view.get(), i);
+      // WKBReader::read accepts a raw byte pointer plus a length, so the WKB is
+      // parsed straight from the Arrow value buffer; only the pointer type needs
+      // to be reinterpreted (char -> unsigned char), no intermediate copy is made.
+      auto geom = wkb_reader.read(reinterpret_cast<const unsigned char*>(wkb_view.data),
+                                  wkb_view.size_bytes);
+
+      // Envelope for the GPU index, kept in coord_t (double) so rounding cannot shrink it
+      const geos::geom::Envelope* env = geom->getEnvelopeInternal();
+
+      double xmin = 0, ymin = 0, xmax = -1, ymax = -1;  // inverted box if envelope is null
+      if (!env->isNull()) {
+        xmin = env->getMinX();
+        ymin = env->getMinY();
+        xmax = env->getMaxX();
+        ymax = env->getMaxY();
+      }
+
+      box_t bbox(fpoint_t(xmin, ymin), fpoint_t(xmax, ymax));
+      rects.push_back(bbox);
+
+      // Store User Data (global offset)
+      size_t global_offset = tail_build + i;
+      geom->setUserData((void*)global_offset);
+
+      // Insert into GEOS STRtree
+      tree.insert(env, geom.get());
+
+      // Transfer ownership to storage vector
+      build_geoms_storage.push_back(std::move(geom));
+    }
+
+    rt_index->PushBuild(rects.data(), rects.size());
+    tail_build += array->length;
+
+    rt_refiner->PushBuild(array_view.get());
+  }
+
+  rt_index->FinishBuilding();
+  rt_refiner->FinishBuilding();
+
+  // --- Probe Phase: filter on the GPU index, refine on the GPU, compare with GEOS ---
+  for (auto& probe_array : probe_arrays) {
+    nanoarrow::UniqueArrayView probe_view;
+    ASSERT_EQ(ArrowArrayViewInitFromSchema(probe_view.get(), probe_schema, &error),
+              NANOARROW_OK)
+        << error.message;
+    ASSERT_EQ(ArrowArrayViewSetArray(probe_view.get(), probe_array, &error), NANOARROW_OK)
+        << error.message;
+
+    std::vector<box_t> queries;
+    std::vector<std::unique_ptr<geos::geom::Geometry>> probe_geoms;
+    probe_geoms.reserve(probe_array->length);
+
+    for (int64_t i = 0; i < probe_array->length; i++) {
+      ArrowBufferView wkb_view = ArrowArrayViewGetBytesUnsafe(probe_view.get(), i);
+      auto geom = wkb_reader.read(wkb_view.data.as_uint8, wkb_view.size_bytes);
+
+      const geos::geom::Envelope* env = geom->getEnvelopeInternal();
+
+      double xmin = 0, ymin = 0, xmax = -1, ymax = -1;  // inverted box if envelope is null
+      if (!env->isNull()) {
+        xmin = env->getMinX();
+        ymin = env->getMinY();
+        xmax = env->getMaxX();
+        ymax = env->getMaxY();
+      }
+
+      box_t bbox(fpoint_t(xmin, ymin), fpoint_t(xmax, ymax));
+      queries.push_back(bbox);
+
+      // Store user data as local offset for verification logic
+      geom->setUserData((void*)i);
+      probe_geoms.push_back(std::move(geom));
+    }
+
+    std::vector<uint32_t> build_indices, stream_indices;
+
+    // GPU Probe
+    rt_index->Probe(queries.data(), queries.size(), &build_indices, &stream_indices);
+
+    // GPU Refine: the returned count is used to truncate both index vectors
+    auto new_size = rt_refiner->Refine(probe_view.get(), predicate, build_indices.data(),
+                                       stream_indices.data(), build_indices.size());
+
+    build_indices.resize(new_size);
+    stream_indices.resize(new_size);
+
+    // --- CPU Verification (GEOS C++) ---
+    std::vector<uint32_t> expected_build_indices;
+    std::vector<uint32_t> expected_stream_indices;
+
+    ComputeGeosJoin(build_schema, build_arrays, probe_schema,
+                    std::vector<ArrowArray*>{probe_array}, predicate,
+                    expected_build_indices, expected_stream_indices);
+
+    // Assertions
+    ASSERT_EQ(expected_build_indices.size(), build_indices.size());
+    ASSERT_EQ(expected_stream_indices.size(), stream_indices.size());
+
+    TestUtils::sort_vectors_by_index(expected_build_indices, expected_stream_indices);
+    TestUtils::sort_vectors_by_index(build_indices, stream_indices);
+
+    for (size_t j = 0; j < build_indices.size(); j++) {
+      ASSERT_EQ(expected_build_indices[j], build_indices[j]);
+      ASSERT_EQ(expected_stream_indices[j], stream_indices[j]);
+    }
+  }
+}
+
+TEST(JoinerTest, PIPContains) {
+  using namespace TestUtils;
+  // Point-in-polygon join: polygons form the build side, points are probed (Contains).
+
+  std::vector<std::string> polys{
+      GetTestDataPath("synthetic_pip/polygons.parquet"),
+      GetTestDataPath("countries/natural-earth_countries_geo.parquet")};
+  std::vector<std::string> points{GetTestDataPath("synthetic_pip/points.parquet"),
+                                  GetTestDataPath("countries/generated_points.parquet")};
+
+  for (size_t i = 0; i < polys.size(); i++) {
+    auto poly_path = TestUtils::GetCanonicalPath(polys[i]);
+    auto point_path = TestUtils::GetCanonicalPath(points[i]);
+    auto poly_arrays = ReadParquet(poly_path, 1000);
+    auto point_arrays = ReadParquet(point_path, 1000);
+    std::vector<nanoarrow::UniqueArray> poly_uniq_arrays, point_uniq_arrays;
+    std::vector<nanoarrow::UniqueSchema> poly_uniq_schema, point_uniq_schema;
+
+    for (auto& arr : poly_arrays) {
+      ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly_uniq_arrays.emplace_back().get(),
+                                            poly_uniq_schema.emplace_back().get()));
+    }
+    for (auto& arr : point_arrays) {
+      ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, point_uniq_arrays.emplace_back().get(),
+                                            point_uniq_schema.emplace_back().get()));
+    }
+
+    std::vector<ArrowArray*> poly_c_arrays, point_c_arrays;
+    for (auto& arr : poly_uniq_arrays) {
+      poly_c_arrays.push_back(arr.get());
+    }
+    for (auto& arr : point_uniq_arrays) {
+      point_c_arrays.push_back(arr.get());
+    }
+    TestJoiner(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(),
+               point_c_arrays, Predicate::kContains);
+  }
+}
+
+TEST(JoinerTest, PIPContainsPipelined) {
+  using namespace TestUtils;
+  // Same join as PIPContains, but run with TestJoiner's pipelined refiner setting.
+
+  std::vector<std::string> polys{
+      GetTestDataPath("synthetic_pip/polygons.parquet"),
+      GetTestDataPath("countries/natural-earth_countries_geo.parquet")};
+  std::vector<std::string> points{GetTestDataPath("synthetic_pip/points.parquet"),
+                                  GetTestDataPath("countries/generated_points.parquet")};
+
+  for (size_t i = 0; i < polys.size(); i++) {
+    auto poly_path = TestUtils::GetCanonicalPath(polys[i]);
+    auto point_path = TestUtils::GetCanonicalPath(points[i]);
+    auto poly_arrays = ReadParquet(poly_path, 1000);
+    auto point_arrays = ReadParquet(point_path, 1000);
+    std::vector<nanoarrow::UniqueArray> poly_uniq_arrays, point_uniq_arrays;
+    std::vector<nanoarrow::UniqueSchema> poly_uniq_schema, point_uniq_schema;
+
+    for (auto& arr : poly_arrays) {
+      ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly_uniq_arrays.emplace_back().get(),
+                                            poly_uniq_schema.emplace_back().get()));
+    }
+    for (auto& arr : point_arrays) {
+      ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, point_uniq_arrays.emplace_back().get(),
+                                            point_uniq_schema.emplace_back().get()));
+    }
+
+    std::vector<ArrowArray*> poly_c_arrays, point_c_arrays;
+    for (auto& arr : poly_uniq_arrays) {
+      poly_c_arrays.push_back(arr.get());
+    }
+    for (auto& arr : point_uniq_arrays) {
+      point_c_arrays.push_back(arr.get());
+    }
+    TestJoiner(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(),
+               point_c_arrays, Predicate::kContains, true);
+  }
+}
+
+TEST(JoinerTest, PIPWithin) {
+  using namespace TestUtils;
+  // Roles are swapped here: points form the build side, polygons are probed (Within).
+
+  std::vector<std::string> polys{
+      GetTestDataPath("synthetic_pip/polygons.parquet"),
+      GetTestDataPath("countries/natural-earth_countries_geo.parquet")};
+  std::vector<std::string> points{GetTestDataPath("synthetic_pip/points.parquet"),
+                                  GetTestDataPath("countries/generated_points.parquet")};
+
+  for (size_t i = 0; i < polys.size(); i++) {
+    auto poly_path = TestUtils::GetCanonicalPath(polys[i]);
+    auto point_path = TestUtils::GetCanonicalPath(points[i]);
+    auto poly_arrays = ReadParquet(poly_path, 1000);
+    auto point_arrays = ReadParquet(point_path, 1000);
+    std::vector<nanoarrow::UniqueArray> poly_uniq_arrays, point_uniq_arrays;
+    std::vector<nanoarrow::UniqueSchema> poly_uniq_schema, point_uniq_schema;
+
+    for (auto& arr : poly_arrays) {
+      ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly_uniq_arrays.emplace_back().get(),
+                                            poly_uniq_schema.emplace_back().get()));
+    }
+    for (auto& arr : point_arrays) {
+      ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, point_uniq_arrays.emplace_back().get(),
+                                            point_uniq_schema.emplace_back().get()));
+    }
+
+    std::vector<ArrowArray*> poly_c_arrays, point_c_arrays;
+    for (auto& arr : poly_uniq_arrays) {
+      poly_c_arrays.push_back(arr.get());
+    }
+    for (auto& arr : point_uniq_arrays) {
+      point_c_arrays.push_back(arr.get());
+    }
+    TestJoiner(point_uniq_schema[0].get(), point_c_arrays, poly_uniq_schema[0].get(),
+               poly_c_arrays, Predicate::kWithin);
+  }
+}
+
+TEST(JoinerTest, PolygonPolygonIntersects) {
+  using namespace TestUtils;
+  // Polygon/polygon join: polygons1 form the build side, polygons2 are probed (Intersects).
+
+  std::vector<std::string> polys1{GetTestDataPath("synthetic_poly/polygons1.parquet")};
+  std::vector<std::string> polys2{GetTestDataPath("synthetic_poly/polygons2.parquet")};
+
+  for (size_t i = 0; i < polys1.size(); i++) {
+    auto poly1_path = TestUtils::GetCanonicalPath(polys1[i]);
+    auto poly2_path = TestUtils::GetCanonicalPath(polys2[i]);
+    auto poly1_arrays = ReadParquet(poly1_path, 1000);
+    auto poly2_arrays = ReadParquet(poly2_path, 1000);
+    std::vector<nanoarrow::UniqueArray> poly1_uniq_arrays, poly2_uniq_arrays;
+    std::vector<nanoarrow::UniqueSchema> poly1_uniq_schema, poly2_uniq_schema;
+
+    for (auto& arr : poly1_arrays) {
+      ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly1_uniq_arrays.emplace_back().get(),
+                                            poly1_uniq_schema.emplace_back().get()));
+    }
+    for (auto& arr : poly2_arrays) {
+      ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly2_uniq_arrays.emplace_back().get(),
+                                            poly2_uniq_schema.emplace_back().get()));
+    }
+
+    std::vector<ArrowArray*> poly1_c_arrays, poly2_c_arrays;
+    for (auto& arr : poly1_uniq_arrays) {
+      poly1_c_arrays.push_back(arr.get());
+    }
+    for (auto& arr : poly2_uniq_arrays) {
+      poly2_c_arrays.push_back(arr.get());
+    }
+    TestJoiner(poly1_uniq_schema[0].get(), poly1_c_arrays, poly2_uniq_schema[0].get(),
+               poly2_c_arrays, Predicate::kIntersects);
+  }
+}
+}  // namespace gpuspatial
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/relate_test.cu
similarity index 92%
rename from c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu
rename to c/sedona-libgpuspatial/libgpuspatial/test/relate_test.cu
index fabcd3f5c..623ad59bf 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/relate_test.cu
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 #include "array_stream.hpp"
-#include "gpuspatial/loader/parallel_wkb_loader.h"
-#include "gpuspatial/relate/relate.cuh"
-#include "gpuspatial/utils/pinned_vector.h"
+#include "gpuspatial/loader/parallel_wkb_loader.hpp"
+#include "gpuspatial/relate/relate.hpp"
+#include "gpuspatial/utils/pinned_vector.hpp"
 
 #include "test_common.hpp"
 
@@ -58,15 +58,25 @@ void ParseWKTPoint(const char* wkt, POINT_T& point) {
   nanoarrow::UniqueArrayStream stream;
   ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get());
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
+  nanoarrow::UniqueArrayView array_view;
   ArrowError error;
-  ArrowErrorSet(&error, "");
 
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error),
+            NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+
   loader_t loader;
   auto cuda_stream = rmm::cuda_stream_default;
 
   loader.Init();
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  loader.Parse(cuda_stream, array_view.get(), 0, array->length);
   auto device_geometries = loader.Finish(cuda_stream);
   auto h_vec = TestUtils::ToVector(cuda_stream, device_geometries.get_points());
   cuda_stream.synchronize();
@@ -79,15 +89,24 @@ void ParseWKTMultiPoint(Context<POINT_T, INDEX_T>& ctx, const char* wkt,
   nanoarrow::UniqueArrayStream stream;
   ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get());
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
+  nanoarrow::UniqueArrayView array_view;
   ArrowError error;
-  ArrowErrorSet(&error, "");
 
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error),
+            NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
   loader_t loader;
   auto cuda_stream = rmm::cuda_stream_default;
 
   loader.Init();
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  loader.Parse(cuda_stream, array_view.get(), 0, array->length);
   auto device_geometries = loader.Finish(cuda_stream);
 
   ctx.prefix_sum1 = TestUtils::ToVector(
@@ -108,15 +127,24 @@ void ParseWKTLineString(Context<POINT_T, INDEX_T>& ctx, const char* wkt,
   nanoarrow::UniqueArrayStream stream;
   ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get());
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
+  nanoarrow::UniqueArrayView array_view;
   ArrowError error;
-  ArrowErrorSet(&error, "");
 
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error),
+            NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
   loader_t loader;
   auto cuda_stream = rmm::cuda_stream_default;
 
   loader.Init();
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  loader.Parse(cuda_stream, array_view.get(), 0, array->length);
   auto device_geometries = loader.Finish(cuda_stream);
   ctx.prefix_sum1 = TestUtils::ToVector(
       cuda_stream, device_geometries.get_offsets().line_string_offsets.ps_num_points);
@@ -136,15 +164,24 @@ void ParseWKTMultiLineString(Context<POINT_T, INDEX_T>& ctx, const char* wkt,
   nanoarrow::UniqueArrayStream stream;
   ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get());
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
+  nanoarrow::UniqueArrayView array_view;
   ArrowError error;
-  ArrowErrorSet(&error, "");
 
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error),
+            NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
   loader_t loader;
   auto cuda_stream = rmm::cuda_stream_default;
 
   loader.Init();
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  loader.Parse(cuda_stream, array_view.get(), 0, array->length);
   auto device_geometries = loader.Finish(cuda_stream);
   ctx.prefix_sum1 = TestUtils::ToVector(
       cuda_stream,
@@ -169,15 +206,25 @@ void ParseWKTPolygon(Context<POINT_T, INDEX_T>& ctx, const char* wkt,
   nanoarrow::UniqueArrayStream stream;
   ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get());
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
+  nanoarrow::UniqueArrayView array_view;
   ArrowError error;
-  ArrowErrorSet(&error, "");
 
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error),
+            NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+
   loader_t loader;
   auto cuda_stream = rmm::cuda_stream_default;
 
   loader.Init();
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  loader.Parse(cuda_stream, array_view.get(), 0, array->length);
   auto device_geometries = loader.Finish(cuda_stream);
   ctx.prefix_sum1 = TestUtils::ToVector(
       cuda_stream, device_geometries.get_offsets().polygon_offsets.ps_num_rings);
@@ -200,15 +247,24 @@ void ParseWKTMultiPolygon(Context<POINT_T, INDEX_T>& ctx, const char* wkt,
   nanoarrow::UniqueArrayStream stream;
   ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get());
   nanoarrow::UniqueArray array;
+  nanoarrow::UniqueSchema schema;
+  nanoarrow::UniqueArrayView array_view;
   ArrowError error;
-  ArrowErrorSet(&error, "");
 
-  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), &error),
+            NANOARROW_OK)
+      << error.message;
+  ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), &error), NANOARROW_OK)
+      << error.message;
   loader_t loader;
   auto cuda_stream = rmm::cuda_stream_default;
 
   loader.Init();
-  loader.Parse(cuda_stream, array.get(), 0, array->length);
+  loader.Parse(cuda_stream, array_view.get(), 0, array->length);
   auto device_geometries = loader.Finish(cuda_stream);
   ctx.prefix_sum1 = TestUtils::ToVector(
       cuda_stream, device_geometries.get_offsets().multi_polygon_offsets.ps_num_parts);
diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/test_common.hpp b/c/sedona-libgpuspatial/libgpuspatial/test/test_common.hpp
index ecd9fd460..bf30f1946 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/test/test_common.hpp
+++ b/c/sedona-libgpuspatial/libgpuspatial/test/test_common.hpp
@@ -16,15 +16,26 @@
 // under the License.
 #pragma once
 
-#include "gpuspatial/geom/point.cuh"
-#include "gpuspatial/utils/array_view.h"
-#include "gpuspatial/utils/pinned_vector.h"
+#include "gpuspatial/geom/point.hpp"
+#include "gpuspatial/relate/predicate.hpp"
+#include "gpuspatial/utils/array_view.hpp"
+#include "gpuspatial/utils/pinned_vector.hpp"
 
 #include "gtest/gtest.h"
 #include "rmm/cuda_stream_view.hpp"
 #include "rmm/device_uvector.hpp"
 #include "rmm/exec_policy.hpp"
 
+#include <geos/geom/Envelope.h>
+#include <geos/geom/Geometry.h>
+#include <geos/geom/GeometryFactory.h>
+#include <geos/index/ItemVisitor.h>
+#include <geos/index/strtree/STRtree.h>
+#include <geos/io/WKBReader.h>
+
+#include "nanoarrow/nanoarrow.h"
+#include "nanoarrow/nanoarrow.hpp"
+
 #include "arrow/api.h"
 #include "arrow/c/bridge.h"
 #include "arrow/filesystem/api.h"
@@ -74,7 +85,7 @@ gpuspatial::PinnedVector<T> ToVector(const rmm::cuda_stream_view& stream,
 }
 
 // Function to convert a relative path string to an absolute path string
-std::string GetCanonicalPath(const std::string& relative_path_str) {
+inline std::string GetCanonicalPath(const std::string& relative_path_str) {
   try {
     // 1. Create a path object from the relative string
     std::filesystem::path relative_path = relative_path_str;
@@ -91,6 +102,139 @@ std::string GetCanonicalPath(const std::string& relative_path_str) {
   }
 }
 
+// Evaluates `geom1 <predicate> geom2` with GEOS; throws std::out_of_range if unsupported.
+inline bool EvaluateGeosPredicate(gpuspatial::Predicate predicate,
+                                  const geos::geom::Geometry* geom1,
+                                  const geos::geom::Geometry* geom2) {
+  switch (predicate) {
+    case gpuspatial::Predicate::kContains:
+      return geom1->contains(geom2);
+    case gpuspatial::Predicate::kIntersects:
+      return geom1->intersects(geom2);
+    case gpuspatial::Predicate::kWithin:
+      return geom1->within(geom2);
+    case gpuspatial::Predicate::kEquals:
+      return geom1->equals(geom2);
+    case gpuspatial::Predicate::kTouches:
+      return geom1->touches(geom2);
+    default:  // any predicate value without a GEOS mapping above
+      throw std::out_of_range("Unsupported GEOS predicate enumeration value.");
+  }
+}
+
+// Mutable state shared between the join driver and the JoinVisitor callbacks.
+struct JoinVisitorContext {
+  const geos::geom::Geometry* probe_geom = nullptr;  // probe geometry being queried
+  std::vector<uint32_t>* build_indices = nullptr;    // output: matched build rows
+  std::vector<uint32_t>* probe_indices = nullptr;    // output: matched probe rows
+  size_t current_probe_index = 0;                    // row index recorded for probe_geom
+  gpuspatial::Predicate predicate{};                 // relation tested for each candidate
+};
+
+// ItemVisitor that records (build, probe) index pairs satisfying ctx->predicate.
+class JoinVisitor : public geos::index::ItemVisitor {
+ public:
+  JoinVisitorContext* ctx;
+  explicit JoinVisitor(JoinVisitorContext* c) : ctx(c) {}
+
+  void visitItem(void* item) override {
+    const auto* candidate = static_cast<const geos::geom::Geometry*>(item);
+    // Visited items are only candidates; drop those failing the exact predicate.
+    if (!EvaluateGeosPredicate(ctx->predicate, candidate, ctx->probe_geom)) {
+      return;
+    }
+    // The build-side row index travels in the geometry's user-data pointer.
+    const size_t build_row = (size_t)candidate->getUserData();
+    ctx->build_indices->push_back(static_cast<uint32_t>(build_row));
+    ctx->probe_indices->push_back(static_cast<uint32_t>(ctx->current_probe_index));
+  }
+};
+
+inline void ComputeGeosJoin(ArrowSchema* build_schema,
+                            const std::vector<ArrowArray*>& build_arrays,
+                            ArrowSchema* probe_schema,
+                            const std::vector<ArrowArray*>& probe_arrays,
+                            gpuspatial::Predicate predicate,
+                            std::vector<uint32_t>& out_build_indices,
+                            std::vector<uint32_t>& out_probe_indices) {
+  // CPU reference join: index all build rows in an STRtree, then query it per probe row.
+  auto factory = geos::geom::GeometryFactory::create();
+  geos::io::WKBReader wkb_reader(*factory);
+  geos::index::strtree::STRtree tree(10);
+
+  // Owns the parsed build geometries; the tree is only handed raw pointers to them.
+  std::vector<std::unique_ptr<geos::geom::Geometry>> build_geoms_storage;
+  ArrowError error;
+
+  // --- Build Phase: row indices are global, i.e. they keep counting across batches ---
+  size_t global_build_offset = 0;
+
+  for (auto* array : build_arrays) {
+    nanoarrow::UniqueArrayView array_view;
+    if (ArrowArrayViewInitFromSchema(array_view.get(), build_schema, &error) !=
+        NANOARROW_OK) {
+      throw std::runtime_error("GEOS Build: Failed to init view: " +
+                               std::string(error.message));
+    }
+    if (ArrowArrayViewSetArray(array_view.get(), array, &error) != NANOARROW_OK) {
+      throw std::runtime_error("GEOS Build: Failed to set array: " +
+                               std::string(error.message));
+    }
+
+    for (int64_t i = 0; i < array->length; i++) {
+      // Parse the i-th value as WKB (nulls are not checked for here)
+      ArrowStringView wkb_view = ArrowArrayViewGetStringUnsafe(array_view.get(), i);
+      auto geom = wkb_reader.read(reinterpret_cast<const unsigned char*>(wkb_view.data),
+                                  wkb_view.size_bytes);
+
+      // Stash the global row index in the user-data pointer; JoinVisitor reads it back
+      size_t current_idx = global_build_offset + i;
+      geom->setUserData((void*)current_idx);
+
+      // Insert into the tree, keyed by the geometry's envelope
+      tree.insert(geom->getEnvelopeInternal(), geom.get());
+
+      // Transfer ownership so the pointer held by the tree stays valid
+      build_geoms_storage.push_back(std::move(geom));
+    }
+    global_build_offset += array->length;
+  }
+
+  // --- Probe Phase: matches are appended to the output vectors (never cleared here) ---
+  size_t global_probe_offset = 0;
+  JoinVisitorContext ctx;
+  ctx.build_indices = &out_build_indices;
+  ctx.probe_indices = &out_probe_indices;
+  ctx.predicate = predicate;
+  JoinVisitor visitor(&ctx);
+
+  for (auto* array : probe_arrays) {
+    nanoarrow::UniqueArrayView array_view;
+    if (ArrowArrayViewInitFromSchema(array_view.get(), probe_schema, &error) !=
+        NANOARROW_OK) {
+      throw std::runtime_error("GEOS Probe: Failed to init view: " +
+                               std::string(error.message));
+    }
+    if (ArrowArrayViewSetArray(array_view.get(), array, &error) != NANOARROW_OK) {
+      throw std::runtime_error("GEOS Probe: Failed to set array: " +
+                               std::string(error.message));
+    }
+
+    for (int64_t i = 0; i < array->length; i++) {
+      ArrowStringView wkb_view = ArrowArrayViewGetStringUnsafe(array_view.get(), i);
+      auto geom = wkb_reader.read(reinterpret_cast<const unsigned char*>(wkb_view.data),
+                                  wkb_view.size_bytes);
+
+      ctx.probe_geom = geom.get();  // only valid for the duration of this iteration
+      ctx.current_probe_index = global_probe_offset + i;
+
+      // Query the tree; the visitor applies the exact predicate to each reported item
+      tree.query(geom->getEnvelopeInternal(), visitor);
+    }
+    global_probe_offset += array->length;
+  }
+}
+
 template <typename KeyType, typename ValueType>
 void sort_vectors_by_index(std::vector<KeyType>& keys, std::vector<ValueType>& values) {
   // 1. Create an index vector {0, 1, 2, ...}
diff --git a/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json b/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json
index b162d78e2..f593623e8 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json
+++ b/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json
@@ -7,6 +7,7 @@
       "dependencies": [
         "gtest",
         "geos",
+        "zstd",
         {
           "name": "arrow",
           "features": [
diff --git a/c/sedona-libgpuspatial/src/error.rs b/c/sedona-libgpuspatial/src/error.rs
deleted file mode 100644
index 3530e40e8..000000000
--- a/c/sedona-libgpuspatial/src/error.rs
+++ /dev/null
@@ -1,56 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-use arrow_schema::ArrowError;
-use std::fmt;
-use thiserror::Error;
-
-#[derive(Error, Debug)]
-pub enum GpuSpatialError {
-    Arrow(ArrowError),
-    Init(String),
-    PushBuild(String),
-    FinishBuild(String),
-    PushStream(String),
-}
-
-impl From<ArrowError> for GpuSpatialError {
-    fn from(value: ArrowError) -> Self {
-        GpuSpatialError::Arrow(value)
-    }
-}
-
-impl fmt::Display for GpuSpatialError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self {
-            GpuSpatialError::Arrow(error) => {
-                write!(f, "{error}")
-            }
-            GpuSpatialError::Init(errmsg) => {
-                write!(f, "Initialization failed: {}", errmsg)
-            }
-            GpuSpatialError::PushBuild(errmsg) => {
-                write!(f, "Push build failed: {}", errmsg)
-            }
-            GpuSpatialError::FinishBuild(errmsg) => {
-                write!(f, "Finish building failed: {}", errmsg)
-            }
-            GpuSpatialError::PushStream(errmsg) => {
-                write!(f, "Push stream failed: {}", errmsg)
-            }
-        }
-    }
-}
diff --git a/c/sedona-libgpuspatial/src/lib.rs b/c/sedona-libgpuspatial/src/lib.rs
index 1bcd4ef43..8d9fd61ce 100644
--- a/c/sedona-libgpuspatial/src/lib.rs
+++ b/c/sedona-libgpuspatial/src/lib.rs
@@ -17,257 +17,4 @@
 
 // Module declarations
 #[cfg(gpu_available)]
-pub mod error;
-#[cfg(gpu_available)]
-mod libgpuspatial;
-#[cfg(gpu_available)]
 mod libgpuspatial_glue_bindgen;
-
-// Import Array trait for len() method (used in gpu_available code)
-#[cfg(gpu_available)]
-use arrow_array::Array;
-
-// Re-exports for GPU functionality
-#[cfg(gpu_available)]
-pub use error::GpuSpatialError;
-#[cfg(gpu_available)]
-pub use libgpuspatial::{GpuSpatialJoinerWrapper, GpuSpatialPredicateWrapper};
-#[cfg(gpu_available)]
-pub use libgpuspatial_glue_bindgen::GpuSpatialJoinerContext;
-
-// Mark GPU types as Send for thread safety
-// SAFETY: The GPU library is designed to be used from multiple threads.
-// Each thread gets its own context, and the underlying GPU library handles thread safety.
-// The raw pointers inside are managed by the C++ library which ensures proper synchronization.
-#[cfg(gpu_available)]
-unsafe impl Send for GpuSpatialJoinerContext {}
-
-#[cfg(gpu_available)]
-unsafe impl Send for libgpuspatial_glue_bindgen::GpuSpatialJoiner {}
-
-#[cfg(gpu_available)]
-unsafe impl Send for GpuSpatialJoinerWrapper {}
-
-// Error type for non-GPU builds
-#[cfg(not(gpu_available))]
-#[derive(Debug, thiserror::Error)]
-pub enum GpuSpatialError {
-    #[error("GPU not available - CUDA not found during build")]
-    GpuNotAvailable,
-}
-
-pub type Result<T> = std::result::Result<T, GpuSpatialError>;
-
-/// High-level wrapper for GPU spatial operations
-pub struct GpuSpatialContext {
-    #[cfg(gpu_available)]
-    joiner: Option<GpuSpatialJoinerWrapper>,
-    #[cfg(gpu_available)]
-    context: Option<GpuSpatialJoinerContext>,
-    initialized: bool,
-}
-
-impl GpuSpatialContext {
-    pub fn new() -> Result<Self> {
-        #[cfg(not(gpu_available))]
-        {
-            Err(GpuSpatialError::GpuNotAvailable)
-        }
-
-        #[cfg(gpu_available)]
-        {
-            Ok(Self {
-                joiner: None,
-                context: None,
-                initialized: false,
-            })
-        }
-    }
-
-    pub fn init(&mut self) -> Result<()> {
-        #[cfg(not(gpu_available))]
-        {
-            Err(GpuSpatialError::GpuNotAvailable)
-        }
-
-        #[cfg(gpu_available)]
-        {
-            let mut joiner = GpuSpatialJoinerWrapper::new();
-
-            // Get PTX path from OUT_DIR
-            let out_path = std::path::PathBuf::from(env!("OUT_DIR"));
-            let ptx_root = out_path.join("share/gpuspatial/shaders");
-            let ptx_root_str = ptx_root
-                .to_str()
-                .ok_or_else(|| GpuSpatialError::Init("Invalid PTX path".to_string()))?;
-
-            // Initialize with concurrency of 1 for now
-            joiner.init(1, ptx_root_str)?;
-
-            // Create context
-            let mut ctx = GpuSpatialJoinerContext {
-                last_error: std::ptr::null(),
-                private_data: std::ptr::null_mut(),
-                build_indices: std::ptr::null_mut(),
-                stream_indices: std::ptr::null_mut(),
-            };
-            joiner.create_context(&mut ctx);
-
-            self.joiner = Some(joiner);
-            self.context = Some(ctx);
-            self.initialized = true;
-            Ok(())
-        }
-    }
-
-    #[cfg(gpu_available)]
-    pub fn get_joiner_mut(&mut self) -> Option<&mut GpuSpatialJoinerWrapper> {
-        self.joiner.as_mut()
-    }
-
-    #[cfg(gpu_available)]
-    pub fn get_context_mut(&mut self) -> Option<&mut GpuSpatialJoinerContext> {
-        self.context.as_mut()
-    }
-
-    pub fn is_initialized(&self) -> bool {
-        self.initialized
-    }
-
-    /// Perform spatial join between two geometry arrays
-    pub fn spatial_join(
-        &mut self,
-        left_geom: arrow_array::ArrayRef,
-        right_geom: arrow_array::ArrayRef,
-        predicate: SpatialPredicate,
-    ) -> Result<(Vec<u32>, Vec<u32>)> {
-        #[cfg(not(gpu_available))]
-        {
-            let _ = (left_geom, right_geom, predicate);
-            Err(GpuSpatialError::GpuNotAvailable)
-        }
-
-        #[cfg(gpu_available)]
-        {
-            if !self.initialized {
-                return Err(GpuSpatialError::Init("Context not initialized".into()));
-            }
-
-            let joiner = self
-                .joiner
-                .as_mut()
-                .ok_or_else(|| GpuSpatialError::Init("GPU joiner not available".into()))?;
-
-            // Clear previous build data
-            joiner.clear();
-
-            // Push build data (left side)
-            log::info!(
-                "DEBUG: Pushing {} geometries to GPU (build side)",
-                left_geom.len()
-            );
-            log::info!("DEBUG: Left array data type: {:?}", left_geom.data_type());
-            if let Some(binary_arr) = left_geom
-                .as_any()
-                .downcast_ref::<arrow_array::BinaryArray>()
-            {
-                log::info!("DEBUG: Left binary array has {} values", binary_arr.len());
-                if binary_arr.len() > 0 {
-                    let first_wkb = binary_arr.value(0);
-                    log::info!(
-                        "DEBUG: First left WKB length: {}, first bytes: {:?}",
-                        first_wkb.len(),
-                        &first_wkb[..8.min(first_wkb.len())]
-                    );
-                }
-            }
-
-            joiner.push_build(&left_geom, 0, left_geom.len() as i64)?;
-            joiner.finish_building()?;
-
-            // Recreate context after building (required by libgpuspatial)
-            let mut new_context = libgpuspatial_glue_bindgen::GpuSpatialJoinerContext {
-                last_error: std::ptr::null(),
-                private_data: std::ptr::null_mut(),
-                build_indices: std::ptr::null_mut(),
-                stream_indices: std::ptr::null_mut(),
-            };
-            joiner.create_context(&mut new_context);
-            self.context = Some(new_context);
-            let context = self.context.as_mut().unwrap();
-            // Push stream data (right side) and perform join
-            let gpu_predicate = predicate.into();
-            joiner.push_stream(
-                context,
-                &right_geom,
-                0,
-                right_geom.len() as i64,
-                gpu_predicate,
-                0, // array_index_offset
-            )?;
-
-            // Get results
-            let build_indices = joiner.get_build_indices_buffer(context).to_vec();
-            let stream_indices = joiner.get_stream_indices_buffer(context).to_vec();
-
-            Ok((build_indices, stream_indices))
-        }
-    }
-}
-
-/// Spatial predicates for GPU operations
-#[repr(u32)]
-#[derive(Debug, PartialEq, Copy, Clone)]
-pub enum SpatialPredicate {
-    Equals = 0,
-    Disjoint = 1,
-    Touches = 2,
-    Contains = 3,
-    Covers = 4,
-    Intersects = 5,
-    Within = 6,
-    CoveredBy = 7,
-}
-
-#[cfg(gpu_available)]
-impl From<SpatialPredicate> for GpuSpatialPredicateWrapper {
-    fn from(pred: SpatialPredicate) -> Self {
-        match pred {
-            SpatialPredicate::Equals => GpuSpatialPredicateWrapper::Equals,
-            SpatialPredicate::Disjoint => GpuSpatialPredicateWrapper::Disjoint,
-            SpatialPredicate::Touches => GpuSpatialPredicateWrapper::Touches,
-            SpatialPredicate::Contains => GpuSpatialPredicateWrapper::Contains,
-            SpatialPredicate::Covers => GpuSpatialPredicateWrapper::Covers,
-            SpatialPredicate::Intersects => GpuSpatialPredicateWrapper::Intersects,
-            SpatialPredicate::Within => GpuSpatialPredicateWrapper::Within,
-            SpatialPredicate::CoveredBy => GpuSpatialPredicateWrapper::CoveredBy,
-        }
-    }
-}
-
-// Cleanup implementation
-impl Drop for GpuSpatialContext {
-    fn drop(&mut self) {
-        #[cfg(gpu_available)]
-        {
-            if let (Some(mut joiner), Some(mut ctx)) = (self.joiner.take(), self.context.take()) {
-                joiner.destroy_context(&mut ctx);
-                joiner.release();
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_context_creation() {
-        let ctx = GpuSpatialContext::new();
-        #[cfg(gpu_available)]
-        assert!(ctx.is_ok());
-        #[cfg(not(gpu_available))]
-        assert!(ctx.is_err());
-    }
-}
diff --git a/c/sedona-libgpuspatial/src/libgpuspatial.rs b/c/sedona-libgpuspatial/src/libgpuspatial.rs
deleted file mode 100644
index 414b92e09..000000000
--- a/c/sedona-libgpuspatial/src/libgpuspatial.rs
+++ /dev/null
@@ -1,509 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use crate::error::GpuSpatialError;
-use crate::libgpuspatial_glue_bindgen::*;
-use arrow_array::{ffi::FFI_ArrowArray, ArrayRef};
-use std::convert::TryFrom;
-use std::ffi::CString;
-use std::mem::transmute;
-use std::os::raw::{c_uint, c_void};
-
-pub struct GpuSpatialJoinerWrapper {
-    joiner: GpuSpatialJoiner,
-}
-
-#[repr(u32)]
-#[derive(Debug, PartialEq, Copy, Clone)]
-pub enum GpuSpatialPredicateWrapper {
-    Equals = 0,
-    Disjoint = 1,
-    Touches = 2,
-    Contains = 3,
-    Covers = 4,
-    Intersects = 5,
-    Within = 6,
-    CoveredBy = 7,
-}
-
-impl TryFrom<c_uint> for GpuSpatialPredicateWrapper {
-    type Error = &'static str;
-
-    fn try_from(v: c_uint) -> Result<Self, Self::Error> {
-        match v {
-            0 => Ok(GpuSpatialPredicateWrapper::Equals),
-            1 => Ok(GpuSpatialPredicateWrapper::Disjoint),
-            2 => Ok(GpuSpatialPredicateWrapper::Touches),
-            3 => Ok(GpuSpatialPredicateWrapper::Contains),
-            4 => Ok(GpuSpatialPredicateWrapper::Covers),
-            5 => Ok(GpuSpatialPredicateWrapper::Intersects),
-            6 => Ok(GpuSpatialPredicateWrapper::Within),
-            7 => Ok(GpuSpatialPredicateWrapper::CoveredBy),
-            _ => Err("Invalid GpuSpatialPredicate value"),
-        }
-    }
-}
-
-impl Default for GpuSpatialJoinerWrapper {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl GpuSpatialJoinerWrapper {
-    pub fn new() -> Self {
-        GpuSpatialJoinerWrapper {
-            joiner: GpuSpatialJoiner {
-                init: None,
-                clear: None,
-                create_context: None,
-                destroy_context: None,
-                push_build: None,
-                finish_building: None,
-                push_stream: None,
-                get_build_indices_buffer: None,
-                get_stream_indices_buffer: None,
-                release: None,
-                private_data: std::ptr::null_mut(),
-                last_error: std::ptr::null(),
-            },
-        }
-    }
-
-    /// # Initializes the GpuSpatialJoiner
-    /// This function should only be called once per joiner instance.
-    ///
-    /// # Arguments
-    /// * `concurrency` - How many threads will call the joiner concurrently.
-    /// * `ptx_root` - The root directory for PTX files.
-    pub fn init(&mut self, concurrency: u32, ptx_root: &str) -> Result<(), GpuSpatialError> {
-        let joiner_ptr: *mut GpuSpatialJoiner = &mut self.joiner;
-
-        unsafe {
-            // Set function pointers to the C functions
-            GpuSpatialJoinerCreate(joiner_ptr);
-        }
-
-        if let Some(init_fn) = self.joiner.init {
-            let c_ptx_root = CString::new(ptx_root).expect("CString::new failed");
-
-            let mut config = GpuSpatialJoinerConfig {
-                concurrency,
-                ptx_root: c_ptx_root.as_ptr(),
-            };
-
-            // This is an unsafe call because it's calling a C function from the bindings.
-            unsafe {
-                if init_fn(&self.joiner as *const _ as *mut _, &mut config) != 0 {
-                    let error_message = self.joiner.last_error;
-                    let c_str = std::ffi::CStr::from_ptr(error_message);
-                    let error_string = c_str.to_string_lossy().into_owned();
-                    return Err(GpuSpatialError::Init(error_string));
-                }
-            }
-        }
-        Ok(())
-    }
-
-    /// # Clears the GpuSpatialJoiner
-    /// This function clears the internal state of the joiner.
-    /// By calling this function, the pushed build data will be cleared.
-    /// You should call this function to reuse the joiner
-    /// instead of building a new one because creating a new joiner is expensive.
-    /// **This method is not thread-safe and should be called from a single thread.**
-    pub fn clear(&mut self) {
-        if let Some(clear_fn) = self.joiner.clear {
-            unsafe {
-                clear_fn(&mut self.joiner as *mut _);
-            }
-        }
-    }
-
-    /// # Pushes an array of WKBs to the build side of the joiner
-    /// This function can be called multiple times to push multiple arrays.
-    /// The joiner will internally parse the WKBs and build a spatial index.
-    /// After pushing all build data, you must call `finish_building()` to build the
-    /// spatial index.
-    /// **This method is not thread-safe and should be called from a single thread.**
-    /// # Arguments
-    /// * `array` - The array of WKBs to push.
-    /// * `offset` - The offset of the array to push.
-    /// * `length` - The length of the array to push.
-    pub fn push_build(
-        &mut self,
-        array: &ArrayRef,
-        offset: i64,
-        length: i64,
-    ) -> Result<(), GpuSpatialError> {
-        log::info!(
-            "DEBUG FFI: push_build called with offset={}, length={}",
-            offset,
-            length
-        );
-        log::info!(
-            "DEBUG FFI: Array length={}, null_count={}",
-            array.len(),
-            array.null_count()
-        );
-
-        // 1. Convert the single ArrayRef to its FFI representation
-        let (ffi_array, _) = arrow_array::ffi::to_ffi(&array.to_data())?;
-
-        log::info!("DEBUG FFI: FFI conversion successful");
-        log::info!("DEBUG FFI: FFI array null_count={}", ffi_array.null_count());
-
-        // 2. Get the raw pointer to the FFI_ArrowArray struct
-        // let arrow_ptr = &mut ffi_array as *mut FFI_ArrowArray as *mut ArrowArray;
-
-        if let Some(push_build_fn) = self.joiner.push_build {
-            unsafe {
-                let ffi_array_ptr: *const ArrowArray =
-                    transmute(&ffi_array as *const FFI_ArrowArray);
-                log::info!("DEBUG FFI: Calling C++ push_build function");
-                if push_build_fn(
-                    &mut self.joiner as *mut _,
-                    std::ptr::null_mut(), // schema is unused currently
-                    ffi_array_ptr as *mut _,
-                    offset,
-                    length,
-                ) != 0
-                {
-                    let error_message = self.joiner.last_error;
-                    let c_str = std::ffi::CStr::from_ptr(error_message);
-                    let error_string = c_str.to_string_lossy().into_owned();
-                    log::error!("DEBUG FFI: push_build failed: {}", error_string);
-                    return Err(GpuSpatialError::PushBuild(error_string));
-                }
-                log::info!("DEBUG FFI: push_build C++ call succeeded");
-            }
-        }
-        Ok(())
-    }
-
-    /// # Finishes building the spatial index
-    /// This function must be called after all build data has been pushed
-    /// using `push_build()`. It builds the spatial index internally on the GPU.
-    /// After calling this function, the joiner is ready to accept stream data
-    /// for spatial join operations.
-    /// **This method is not thread-safe and should be called from a single thread.**
-    pub fn finish_building(&mut self) -> Result<(), GpuSpatialError> {
-        if let Some(finish_building_fn) = self.joiner.finish_building {
-            unsafe {
-                if finish_building_fn(&mut self.joiner as *mut _) != 0 {
-                    let error_message = self.joiner.last_error;
-                    let c_str = std::ffi::CStr::from_ptr(error_message);
-                    let error_string = c_str.to_string_lossy().into_owned();
-                    return Err(GpuSpatialError::FinishBuild(error_string));
-                }
-            }
-        }
-        Ok(())
-    }
-
-    /// # Creates a context for a thread to perform spatial joins
-    /// This function initializes a context that holds thread-specific data for spatial joins and
-    /// pointers to buffers that store the results of spatial joins.
-    /// Each thread that performs spatial joins should have its own context.
-    /// The context is passed to PushStream calls to perform spatial joins.
-    /// The context must be created after the joiner has been initialized.
-    /// It is encouraged to create reuse the context within the same thread to reduce resource allocation overhead.
-    /// The context can be destroyed by calling the `destroy_context` function pointer in the `GpuSpatialJoiner` struct.
-    /// The context should be destroyed before destroying the joiner.
-    /// **This method is thread-safe.**
-    pub fn create_context(&mut self, ctx: &mut GpuSpatialJoinerContext) {
-        if let Some(create_context_fn) = self.joiner.create_context {
-            unsafe {
-                create_context_fn(&mut self.joiner as *mut _, ctx as *mut _);
-            }
-        }
-    }
-
-    pub fn destroy_context(&mut self, ctx: &mut GpuSpatialJoinerContext) {
-        if let Some(destroy_context_fn) = self.joiner.destroy_context {
-            unsafe {
-                destroy_context_fn(ctx as *mut _);
-            }
-        }
-    }
-
-    pub fn push_stream(
-        &mut self,
-        ctx: &mut GpuSpatialJoinerContext,
-        array: &ArrayRef,
-        offset: i64,
-        length: i64,
-        predicate: GpuSpatialPredicateWrapper,
-        array_index_offset: i32,
-    ) -> Result<(), GpuSpatialError> {
-        log::info!(
-            "DEBUG FFI: push_stream called with offset={}, length={}, predicate={:?}",
-            offset,
-            length,
-            predicate
-        );
-        log::info!(
-            "DEBUG FFI: Array length={}, null_count={}",
-            array.len(),
-            array.null_count()
-        );
-
-        // 1. Convert the single ArrayRef to its FFI representation
-        let (ffi_array, _) = arrow_array::ffi::to_ffi(&array.to_data())?;
-
-        log::info!("DEBUG FFI: FFI conversion successful");
-        log::info!("DEBUG FFI: FFI array null_count={}", ffi_array.null_count());
-
-        // 2. Get the raw pointer to the FFI_ArrowArray struct
-        // let arrow_ptr = &mut ffi_array as *mut FFI_ArrowArray as *mut ArrowArray;
-
-        if let Some(push_stream_fn) = self.joiner.push_stream {
-            unsafe {
-                let ffi_array_ptr: *const ArrowArray =
-                    transmute(&ffi_array as *const FFI_ArrowArray);
-                log::info!("DEBUG FFI: Calling C++ push_stream function");
-                if push_stream_fn(
-                    &mut self.joiner as *mut _,
-                    ctx as *mut _,
-                    std::ptr::null_mut(), // schema is unused currently
-                    ffi_array_ptr as *mut _,
-                    offset,
-                    length,
-                    predicate as c_uint,
-                    array_index_offset,
-                ) != 0
-                {
-                    let error_message = ctx.last_error;
-                    let c_str = std::ffi::CStr::from_ptr(error_message);
-                    let error_string = c_str.to_string_lossy().into_owned();
-                    log::error!("DEBUG FFI: push_stream failed: {}", error_string);
-                    return Err(GpuSpatialError::PushStream(error_string));
-                }
-                log::info!("DEBUG FFI: push_stream C++ call succeeded");
-            }
-        }
-        Ok(())
-    }
-
-    pub fn get_build_indices_buffer(&self, ctx: &mut GpuSpatialJoinerContext) -> &[u32] {
-        if let Some(get_build_indices_buffer_fn) = self.joiner.get_build_indices_buffer {
-            let mut build_indices_ptr: *mut c_void = std::ptr::null_mut();
-            let mut build_indices_len: u32 = 0;
-
-            unsafe {
-                get_build_indices_buffer_fn(
-                    ctx as *mut _,
-                    &mut build_indices_ptr as *mut *mut c_void,
-                    &mut build_indices_len as *mut u32,
-                );
-
-                // Check length first - empty vectors return empty slice
-                if build_indices_len == 0 {
-                    return &[];
-                }
-
-                // Validate pointer (should not be null if length > 0)
-                if build_indices_ptr.is_null() {
-                    return &[];
-                }
-
-                // Convert the raw pointer to a slice. This is safe to do because
-                // we've validated the pointer is non-null and length is valid.
-                let typed_ptr = build_indices_ptr as *const u32;
-
-                // Safety: We've checked ptr is non-null and len > 0
-                return std::slice::from_raw_parts(typed_ptr, build_indices_len as usize);
-            }
-        }
-        &[]
-    }
-
-    pub fn get_stream_indices_buffer(&self, ctx: &mut GpuSpatialJoinerContext) -> &[u32] {
-        if let Some(get_stream_indices_buffer_fn) = self.joiner.get_stream_indices_buffer {
-            let mut stream_indices_ptr: *mut c_void = std::ptr::null_mut();
-            let mut stream_indices_len: u32 = 0;
-
-            unsafe {
-                get_stream_indices_buffer_fn(
-                    ctx as *mut _,
-                    &mut stream_indices_ptr as *mut *mut c_void,
-                    &mut stream_indices_len as *mut u32,
-                );
-
-                // Check length first - empty vectors return empty slice
-                if stream_indices_len == 0 {
-                    return &[];
-                }
-
-                // Validate pointer (should not be null if length > 0)
-                if stream_indices_ptr.is_null() {
-                    return &[];
-                }
-
-                // Convert the raw pointer to a slice. This is safe to do because
-                // we've validated the pointer is non-null and length is valid.
-                let typed_ptr = stream_indices_ptr as *const u32;
-
-                // Safety: We've checked ptr is non-null and len > 0
-                return std::slice::from_raw_parts(typed_ptr, stream_indices_len as usize);
-            }
-        }
-        &[]
-    }
-
-    pub fn release(&mut self) {
-        // Call the release function if it exists
-        if let Some(release_fn) = self.joiner.release {
-            unsafe {
-                release_fn(&mut self.joiner as *mut _);
-            }
-        }
-    }
-}
-
-impl Drop for GpuSpatialJoinerWrapper {
-    fn drop(&mut self) {
-        // Call the release function if it exists
-        if let Some(release_fn) = self.joiner.release {
-            unsafe {
-                release_fn(&mut self.joiner as *mut _);
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-    use sedona_expr::scalar_udf::SedonaScalarUDF;
-    use sedona_geos::register::scalar_kernels;
-    use sedona_schema::crs::lnglat;
-    use sedona_schema::datatypes::{Edges, SedonaType, WKB_GEOMETRY};
-    use sedona_testing::create::create_array_storage;
-    use sedona_testing::testers::ScalarUdfTester;
-    use std::env;
-    use std::path::PathBuf;
-
-    #[test]
-    fn test_gpu_joiner_end2end() {
-        let mut joiner = GpuSpatialJoinerWrapper::new();
-
-        let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
-        let ptx_root = out_path.join("share/gpuspatial/shaders");
-
-        joiner
-            .init(
-                1,
-                ptx_root.to_str().expect("Failed to convert path to string"),
-            )
-            .expect("Failed to init GpuSpatialJoiner");
-
-        let polygon_values =  &[
-            Some("POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"),
-            Some("POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))"),
-            Some("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 3 2, 3 3, 2 3, 2 2), (6 6, 8 6, 8 8, 6 8, 6 6))"),
-            Some("POLYGON ((30 0, 60 20, 50 50, 10 50, 0 20, 30 0), (20 30, 25 40, 15 40, 20 30), (30 30, 35 40, 25 40, 30 30), (40 30, 45 40, 35 40, 40 30))"),
-            Some("POLYGON ((40 0, 50 30, 80 20, 90 70, 60 90, 30 80, 20 40, 40 0), (50 20, 65 30, 60 50, 45 40, 50 20), (30 60, 50 70, 45 80, 30 60))"),
-        ];
-        let polygons = create_array_storage(polygon_values, &WKB_GEOMETRY);
-
-        // Let the gpusaptial joiner to parse WKBs and get building boxes
-        joiner
-            .push_build(&polygons, 0, polygons.len().try_into().unwrap())
-            .expect("Failed to push building");
-        // Build a spatial index for Build internally on GPU
-        joiner.finish_building().expect("Failed to finish building");
-
-        // Each thread that performs spatial joins should have its own context.
-        // The context is passed to PushStream calls to perform spatial joins.
-        let mut ctx = GpuSpatialJoinerContext {
-            last_error: std::ptr::null(),
-            private_data: std::ptr::null_mut(),
-            build_indices: std::ptr::null_mut(),
-            stream_indices: std::ptr::null_mut(),
-        };
-
-        joiner.create_context(&mut ctx);
-
-        let point_values = &[
-            Some("POINT (30 20)"), // poly0
-            Some("POINT (20 20)"), // poly1
-            Some("POINT (1 1)"),   // poly2
-            Some("POINT (70 70)"),
-            Some("POINT (55 35)"), // poly4
-        ];
-        let points = create_array_storage(point_values, &WKB_GEOMETRY);
-
-        // array_index_offset offsets the result of stream indices
-        let array_index_offset = 0;
-        joiner
-            .push_stream(
-                &mut ctx,
-                &points,
-                0,
-                points.len().try_into().unwrap(),
-                GpuSpatialPredicateWrapper::Intersects,
-                array_index_offset,
-            )
-            .expect("Failed to push building");
-
-        let build_indices = joiner.get_build_indices_buffer(&mut ctx);
-        let stream_indices = joiner.get_stream_indices_buffer(&mut ctx);
-
-        let mut result_pairs: Vec<(u32, u32)> = Vec::new();
-
-        for (build_index, stream_index) in build_indices.iter().zip(stream_indices.iter()) {
-            result_pairs.push((*build_index, *stream_index));
-        }
-
-        let kernels = scalar_kernels();
-
-        // Iterate through the vector and find the one named "st_intersects"
-        let st_intersects = kernels
-            .into_iter()
-            .find(|(name, _)| *name == "st_intersects")
-            .map(|(_, kernel_ref)| kernel_ref)
-            .unwrap();
-
-        let sedona_type = SedonaType::Wkb(Edges::Planar, lnglat());
-        let udf = SedonaScalarUDF::from_kernel("st_intersects", st_intersects);
-        let tester =
-            ScalarUdfTester::new(udf.into(), vec![sedona_type.clone(), sedona_type.clone()]);
-
-        let mut answer_pairs: Vec<(u32, u32)> = Vec::new();
-
-        for (poly_index, poly) in polygon_values.iter().enumerate() {
-            for (point_index, point) in point_values.iter().enumerate() {
-                let result = tester
-                    .invoke_scalar_scalar(poly.unwrap(), point.unwrap())
-                    .unwrap();
-                if result == true.into() {
-                    answer_pairs.push((poly_index as u32, point_index as u32));
-                }
-            }
-        }
-
-        // Sort both vectors. The default sort on tuples compares element by element.
-        result_pairs.sort();
-        answer_pairs.sort();
-
-        // Assert that the two sorted vectors are equal.
-        assert_eq!(result_pairs, answer_pairs);
-
-        joiner.destroy_context(&mut ctx);
-        joiner.release();
-    }
-}
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index bad4cb5c6..d89d3b9a8 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -9,14 +9,9 @@ c/sedona-geoarrow-c/src/geoarrow/ryu/*
 c/sedona-geoarrow-c/src/nanoarrow/*
 c/sedona-s2geography/s2geography/*
 c/sedona-s2geography/s2geometry/*
-c/sedona-libgpuspatial/libgpuspatial/test/geoarrow_geos/*
 c/sedona-libgpuspatial/libgpuspatial/cmake/RAPIDS_VERSION
-c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/im.cuh
-c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.cuh
-c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.h
-c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.h
-c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.h
-c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.h
+c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.hpp
+c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.hpp
 c/sedona-tg/src/tg/*
 Cargo.lock
 ci/scripts/windows/Cargo.lock