Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 53 additions & 1 deletion python/sedonadb/tests/test_sjoin.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,16 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest

import json
import warnings

import geopandas as gpd
import numpy as np
import pandas as pd
import pytest
from sedonadb.testing import PostGIS, SedonaDB
from shapely.geometry import Point


@pytest.mark.parametrize(
Expand Down Expand Up @@ -227,3 +234,48 @@ def test_non_optimizable_subquery():
sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
assert len(sedonadb_results) > 0
eng_postgis.assert_query_result(sql, sedonadb_results)


def test_spatial_join_with_pandas_metadata(con):
# Previous versions of SedonaDB failed to execute this because of a mismatched
# schema. Attempts to simplify this reproducer weren't able to recreate the
# initial error (PhysicalOptimizer rule 'join_selection' failed).
# https://github.com/apache/sedona-db/issues/477

# 1. Generate Data
n_points = 1000
n_polys = 10

# Points
rng = np.random.Generator(np.random.MT19937(49791))
lons = rng.uniform(-6, 2, n_points)
lats = rng.uniform(50, 59, n_points)
pts_df = pd.DataFrame(
{"idx": range(n_points), "geometry": [Point(x, y) for x, y in zip(lons, lats)]}
)
pts_gdf = gpd.GeoDataFrame(pts_df, crs="EPSG:4326")

# Polygons (Centers buffered)
plons = rng.uniform(-6, 2, n_polys)
plats = rng.uniform(50, 59, n_polys)
poly_centers = gpd.GeoDataFrame(
{"geometry": [Point(x, y) for x, y in zip(plons, plats)]}, crs="EPSG:4326"
)
# Simple buffer in degrees (test data so we don't need the GeoPandas warning here)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
polys_gdf = poly_centers.buffer(0.1).to_frame(name="geometry")

# 2. Load
con.create_data_frame(pts_gdf).to_view("points", overwrite=True)
con.create_data_frame(polys_gdf).to_view("polygons", overwrite=True)

# 3. Intersection
query = """
SELECT p.idx
FROM points AS p, polygons AS poly
WHERE ST_Intersects(p.geometry, poly.geometry)
"""

res = con.sql(query).to_pandas()
pd.testing.assert_frame_equal(res, pd.DataFrame({"idx": [304, 342, 490, 705]}))
23 changes: 21 additions & 2 deletions rust/sedona/src/record_batch_reader_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use std::{any::Any, fmt::Debug, sync::Arc};

use std::{any::Any, collections::HashMap, fmt::Debug, sync::Arc};

use arrow_array::RecordBatchReader;
use arrow_schema::SchemaRef;
Expand Down Expand Up @@ -49,8 +50,13 @@ pub struct RecordBatchReaderProvider {
unsafe impl Sync for RecordBatchReaderProvider {}

impl RecordBatchReaderProvider {
/// Create a new RecordBatchReaderProvider from an existing RecordBatchReader
///
/// Schema metadata is stripped if provided. While schema metadata is supported
/// in theory in DataFusion, it causes issues with schema equivalence in some
/// corner cases: https://github.com/apache/sedona-db/issues/477.
pub fn new(reader: Box<dyn RecordBatchReader + Send>) -> Self {
let schema = reader.schema();
let schema = schema_ref_strip_metadata(reader.schema());
Self {
reader: Mutex::new(Some(reader)),
schema,
Expand Down Expand Up @@ -297,6 +303,19 @@ impl ExecutionPlan for RecordBatchReaderExec {
}
}

/// Strips metadata from a SchemaRef if needed
fn schema_ref_strip_metadata(schema_ref: SchemaRef) -> SchemaRef {
if schema_ref.metadata().is_empty() {
schema_ref
} else {
schema_ref
.as_ref()
.clone()
.with_metadata(HashMap::new())
.into()
}
}

#[cfg(test)]
mod test {

Expand Down