Skip to content

Commit c217bf7

Browse files
committed
Providing a SpatialIndexBuilder trait and DefaultSpatialIndexBuilder
1 parent 085a047 commit c217bf7

8 files changed

Lines changed: 161 additions & 112 deletions

File tree

rust/sedona-spatial-join/src/index.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,14 @@ mod knn_adapter;
2222
pub(crate) mod memory_plan;
2323
pub(crate) mod partitioned_index_provider;
2424
pub(crate) mod spatial_index;
25+
pub(crate) mod spatial_index_builder;
2526

2627
pub(crate) use build_side_collector::{
2728
BuildPartition, BuildSideBatchesCollector, CollectBuildSideMetrics,
2829
};
2930
pub(crate) use spatial_index::SpatialIndex;
3031

31-
pub use default_spatial_index_builder::{SpatialIndexBuilder, SpatialJoinBuildMetrics};
32+
pub use default_spatial_index_builder::DefaultSpatialIndexBuilder;
3233
use wkb::reader::Wkb;
3334

3435
/// The result of a spatial index query

rust/sedona-spatial-join/src/index/build_side_collector.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ use sedona_expr::statistics::GeoStatistics;
3232
use sedona_functions::st_analyze_agg::AnalyzeAccumulator;
3333
use sedona_schema::datatypes::WKB_GEOMETRY;
3434

35+
use crate::index::spatial_index_builder::SpatialIndexBuilder;
36+
use crate::index::DefaultSpatialIndexBuilder;
3537
use crate::{
3638
evaluated_batch::{
3739
evaluated_batch_stream::{
@@ -41,7 +43,6 @@ use crate::{
4143
spill::EvaluatedBatchSpillWriter,
4244
EvaluatedBatch,
4345
},
44-
index::SpatialIndexBuilder,
4546
operand_evaluator::{create_operand_evaluator, OperandEvaluator},
4647
spatial_predicate::SpatialPredicate,
4748
utils::bbox_sampler::{BoundingBoxSampler, BoundingBoxSamples},
@@ -217,7 +218,7 @@ impl BuildSideBatchesCollector {
217218
}
218219

219220
let geo_statistics = analyzer.finish();
220-
let extra_mem = SpatialIndexBuilder::estimate_extra_memory_usage(
221+
let extra_mem = DefaultSpatialIndexBuilder::estimate_extra_memory_usage(
221222
&geo_statistics,
222223
&self.spatial_predicate,
223224
&self.spatial_join_options,

rust/sedona-spatial-join/src/index/default_spatial_index.rs

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -705,13 +705,14 @@ impl SpatialIndex for DefaultSpatialIndex {
705705
#[cfg(test)]
706706
mod tests {
707707
use crate::{
708-
index::{SpatialIndexBuilder, SpatialJoinBuildMetrics},
709708
operand_evaluator::EvaluatedGeometryArray,
710709
spatial_predicate::{KNNPredicate, RelationPredicate, SpatialRelationType},
711710
};
712711

713712
use super::*;
714713
use crate::index::spatial_index::SpatialIndexRef;
714+
use crate::index::spatial_index_builder::{SpatialIndexBuilder, SpatialJoinBuildMetrics};
715+
use crate::index::DefaultSpatialIndexBuilder;
715716
use arrow_array::RecordBatch;
716717
use arrow_schema::{DataType, Field};
717718
use datafusion_common::JoinSide;
@@ -737,7 +738,7 @@ mod tests {
737738
SpatialRelationType::Intersects,
738739
));
739740

740-
let builder = SpatialIndexBuilder::new(
741+
let builder = DefaultSpatialIndexBuilder::new(
741742
schema.clone(),
742743
spatial_predicate,
743744
options,
@@ -774,7 +775,7 @@ mod tests {
774775
true,
775776
)]));
776777

777-
let mut builder = SpatialIndexBuilder::new(
778+
let mut builder = DefaultSpatialIndexBuilder::new(
778779
schema.clone(),
779780
spatial_predicate,
780781
options,
@@ -829,7 +830,7 @@ mod tests {
829830
true,
830831
)]));
831832

832-
let mut builder = SpatialIndexBuilder::new(
833+
let mut builder = DefaultSpatialIndexBuilder::new(
833834
schema.clone(),
834835
spatial_predicate,
835836
options,
@@ -929,7 +930,7 @@ mod tests {
929930
true,
930931
)]));
931932

932-
let mut builder = SpatialIndexBuilder::new(
933+
let mut builder = DefaultSpatialIndexBuilder::new(
933934
schema.clone(),
934935
spatial_predicate,
935936
options,
@@ -1028,7 +1029,7 @@ mod tests {
10281029
true,
10291030
)]));
10301031

1031-
let mut builder = SpatialIndexBuilder::new(
1032+
let mut builder = DefaultSpatialIndexBuilder::new(
10321033
schema.clone(),
10331034
spatial_predicate,
10341035
options,
@@ -1128,7 +1129,7 @@ mod tests {
11281129
true,
11291130
)]));
11301131

1131-
let mut builder = SpatialIndexBuilder::new(
1132+
let mut builder = DefaultSpatialIndexBuilder::new(
11321133
schema.clone(),
11331134
spatial_predicate,
11341135
options,
@@ -1219,7 +1220,7 @@ mod tests {
12191220
JoinSide::Left,
12201221
));
12211222

1222-
let builder = SpatialIndexBuilder::new(
1223+
let builder = DefaultSpatialIndexBuilder::new(
12231224
schema.clone(),
12241225
spatial_predicate,
12251226
options,
@@ -1279,7 +1280,7 @@ mod tests {
12791280
true,
12801281
)]));
12811282

1282-
let mut builder = SpatialIndexBuilder::new(
1283+
let mut builder = DefaultSpatialIndexBuilder::new(
12831284
schema.clone(),
12841285
spatial_predicate,
12851286
options,
@@ -1398,7 +1399,7 @@ mod tests {
13981399
true,
13991400
)]));
14001401

1401-
let mut builder = SpatialIndexBuilder::new(
1402+
let mut builder = DefaultSpatialIndexBuilder::new(
14021403
schema.clone(),
14031404
spatial_predicate,
14041405
options,
@@ -1485,7 +1486,7 @@ mod tests {
14851486
true,
14861487
)]));
14871488

1488-
let mut builder = SpatialIndexBuilder::new(
1489+
let mut builder = DefaultSpatialIndexBuilder::new(
14891490
schema.clone(),
14901491
spatial_predicate,
14911492
options,
@@ -1565,7 +1566,7 @@ mod tests {
15651566
true,
15661567
)]));
15671568

1568-
let mut builder = SpatialIndexBuilder::new(
1569+
let mut builder = DefaultSpatialIndexBuilder::new(
15691570
schema.clone(),
15701571
spatial_predicate,
15711572
options,
@@ -1691,7 +1692,7 @@ mod tests {
16911692
true,
16921693
)]));
16931694

1694-
let mut builder = SpatialIndexBuilder::new(
1695+
let mut builder = DefaultSpatialIndexBuilder::new(
16951696
schema.clone(),
16961697
spatial_predicate,
16971698
options,
@@ -1760,7 +1761,7 @@ mod tests {
17601761
true,
17611762
)]));
17621763

1763-
let mut builder = SpatialIndexBuilder::new(
1764+
let mut builder = DefaultSpatialIndexBuilder::new(
17641765
schema,
17651766
spatial_predicate,
17661767
options,

rust/sedona-spatial-join/src/index/default_spatial_index_builder.rs

Lines changed: 62 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,12 @@
1717

1818
use arrow::array::BooleanBufferBuilder;
1919
use arrow_schema::SchemaRef;
20-
use datafusion_physical_plan::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder};
2120
use sedona_common::SpatialJoinOptions;
2221
use sedona_expr::statistics::GeoStatistics;
2322
use std::sync::Arc;
2423

25-
use datafusion_common::{utils::proxy::VecAllocExt, Result};
26-
use datafusion_expr::JoinType;
27-
use futures::StreamExt;
28-
use geo_index::rtree::{sort::HilbertSort, RTree, RTreeBuilder, RTreeIndex};
29-
use parking_lot::Mutex;
30-
use std::sync::atomic::AtomicUsize;
31-
3224
use crate::index::spatial_index::SpatialIndexRef;
25+
use crate::index::spatial_index_builder::{SpatialIndexBuilder, SpatialJoinBuildMetrics};
3326
use crate::{
3427
evaluated_batch::{evaluated_batch_stream::SendableEvaluatedBatchStream, EvaluatedBatch},
3528
index::{default_spatial_index::DefaultSpatialIndex, knn_adapter::KnnComponents},
@@ -38,6 +31,13 @@ use crate::{
3831
spatial_predicate::SpatialPredicate,
3932
utils::join_utils::need_produce_result_in_final,
4033
};
34+
use async_trait::async_trait;
35+
use datafusion_common::{utils::proxy::VecAllocExt, Result};
36+
use datafusion_expr::JoinType;
37+
use futures::StreamExt;
38+
use geo_index::rtree::{sort::HilbertSort, RTree, RTreeBuilder, RTreeIndex};
39+
use parking_lot::Mutex;
40+
use std::sync::atomic::AtomicUsize;
4141

4242
// Type aliases for better readability
4343
type SpatialRTree = RTree<f32>;
@@ -54,7 +54,7 @@ const RTREE_MEMORY_ESTIMATE_PER_RECT: usize = 60;
5454
/// 2. Building the spatial R-tree index
5555
/// 3. Setting up memory tracking and visited bitmaps
5656
/// 4. Configuring prepared geometries based on execution mode
57-
pub struct SpatialIndexBuilder {
57+
pub struct DefaultSpatialIndexBuilder {
5858
schema: SchemaRef,
5959
spatial_predicate: SpatialPredicate,
6060
options: SpatialJoinOptions,
@@ -72,25 +72,7 @@ pub struct SpatialIndexBuilder {
7272
memory_used: usize,
7373
}
7474

75-
/// Metrics for the build phase of the spatial join.
76-
#[derive(Clone, Debug, Default)]
77-
pub struct SpatialJoinBuildMetrics {
78-
/// Total time for collecting build-side of join
79-
pub(crate) build_time: metrics::Time,
80-
/// Memory used by the spatial-index in bytes
81-
pub(crate) build_mem_used: metrics::Gauge,
82-
}
83-
84-
impl SpatialJoinBuildMetrics {
85-
pub fn new(partition: usize, metrics: &ExecutionPlanMetricsSet) -> Self {
86-
Self {
87-
build_time: MetricBuilder::new(metrics).subset_time("build_time", partition),
88-
build_mem_used: MetricBuilder::new(metrics).gauge("build_mem_used", partition),
89-
}
90-
}
91-
}
92-
93-
impl SpatialIndexBuilder {
75+
impl DefaultSpatialIndexBuilder {
9476
/// Create a new builder with the given configuration.
9577
pub fn new(
9678
schema: SchemaRef,
@@ -113,55 +95,6 @@ impl SpatialIndexBuilder {
11395
})
11496
}
11597

116-
/// Estimate the amount of memory required by the R-tree index and evaluating spatial predicates.
117-
/// The estimated memory usage does not include the memory required for holding the build side
118-
/// batches.
119-
pub fn estimate_extra_memory_usage(
120-
geo_stats: &GeoStatistics,
121-
spatial_predicate: &SpatialPredicate,
122-
options: &SpatialJoinOptions,
123-
) -> usize {
124-
// Estimate the amount of memory needed by the refiner
125-
let num_geoms = geo_stats.total_geometries().unwrap_or(0) as usize;
126-
let refiner = create_refiner(
127-
options.spatial_library,
128-
spatial_predicate,
129-
options.clone(),
130-
num_geoms,
131-
geo_stats.clone(),
132-
);
133-
let refiner_mem_usage = refiner.estimate_max_memory_usage(geo_stats);
134-
135-
let knn_components_mem_usage =
136-
if matches!(spatial_predicate, SpatialPredicate::KNearestNeighbors(_)) {
137-
KnnComponents::estimate_max_memory_usage(geo_stats)
138-
} else {
139-
0
140-
};
141-
142-
// Estimate the amount of memory needed for the R-tree
143-
let rtree_mem_usage = num_geoms * RTREE_MEMORY_ESTIMATE_PER_RECT;
144-
145-
// The final estimation is the sum of all above
146-
refiner_mem_usage + knn_components_mem_usage + rtree_mem_usage
147-
}
148-
149-
/// Add a geometry batch to be indexed.
150-
///
151-
/// This method accumulates geometry batches that will be used to build the spatial index.
152-
/// Each batch contains processed geometry data along with memory usage information.
153-
pub fn add_batch(&mut self, indexed_batch: EvaluatedBatch) -> Result<()> {
154-
let in_mem_size = indexed_batch.in_mem_size()?;
155-
self.indexed_batches.push(indexed_batch);
156-
self.record_memory_usage(in_mem_size);
157-
Ok(())
158-
}
159-
160-
pub fn merge_stats(&mut self, stats: GeoStatistics) -> &mut Self {
161-
self.stats.merge(&stats);
162-
self
163-
}
164-
16598
/// Build the spatial R-tree index from collected geometry batches.
16699
fn build_rtree(&mut self) -> Result<RTreeBuildResult> {
167100
let build_timer = self.metrics.build_time.timer();
@@ -244,8 +177,57 @@ impl SpatialIndexBuilder {
244177
geom_idx_vec
245178
}
246179

247-
/// Finish building and return the completed SpatialIndex.
248-
pub fn finish(mut self) -> Result<SpatialIndexRef> {
180+
fn record_memory_usage(&mut self, bytes: usize) {
181+
self.memory_used += bytes;
182+
self.metrics.build_mem_used.set_max(self.memory_used);
183+
}
184+
}
185+
186+
#[async_trait]
187+
impl SpatialIndexBuilder for DefaultSpatialIndexBuilder {
188+
fn estimate_extra_memory_usage(
189+
geo_stats: &GeoStatistics,
190+
spatial_predicate: &SpatialPredicate,
191+
options: &SpatialJoinOptions,
192+
) -> usize {
193+
// Estimate the amount of memory needed by the refiner
194+
let num_geoms = geo_stats.total_geometries().unwrap_or(0) as usize;
195+
let refiner = create_refiner(
196+
options.spatial_library,
197+
spatial_predicate,
198+
options.clone(),
199+
num_geoms,
200+
geo_stats.clone(),
201+
);
202+
let refiner_mem_usage = refiner.estimate_max_memory_usage(geo_stats);
203+
204+
let knn_components_mem_usage =
205+
if matches!(spatial_predicate, SpatialPredicate::KNearestNeighbors(_)) {
206+
KnnComponents::estimate_max_memory_usage(geo_stats)
207+
} else {
208+
0
209+
};
210+
211+
// Estimate the amount of memory needed for the R-tree
212+
let rtree_mem_usage = num_geoms * RTREE_MEMORY_ESTIMATE_PER_RECT;
213+
214+
// The final estimation is the sum of all above
215+
refiner_mem_usage + knn_components_mem_usage + rtree_mem_usage
216+
}
217+
218+
fn add_batch(&mut self, indexed_batch: EvaluatedBatch) -> Result<()> {
219+
let in_mem_size = indexed_batch.in_mem_size()?;
220+
self.indexed_batches.push(indexed_batch);
221+
self.record_memory_usage(in_mem_size);
222+
Ok(())
223+
}
224+
225+
fn merge_stats(&mut self, stats: GeoStatistics) -> &mut Self {
226+
self.stats.merge(&stats);
227+
self
228+
}
229+
230+
fn finish(mut self) -> Result<SpatialIndexRef> {
249231
if self.indexed_batches.is_empty() {
250232
return Ok(Arc::new(DefaultSpatialIndex::empty(
251233
self.spatial_predicate,
@@ -309,7 +291,7 @@ impl SpatialIndexBuilder {
309291
)))
310292
}
311293

312-
pub async fn add_stream(
294+
async fn add_stream(
313295
&mut self,
314296
mut stream: SendableEvaluatedBatchStream,
315297
geo_statistics: GeoStatistics,
@@ -321,9 +303,4 @@ impl SpatialIndexBuilder {
321303
self.merge_stats(geo_statistics);
322304
Ok(())
323305
}
324-
325-
fn record_memory_usage(&mut self, bytes: usize) {
326-
self.memory_used += bytes;
327-
self.metrics.build_mem_used.set_max(self.memory_used);
328-
}
329306
}

0 commit comments

Comments
 (0)