From ee1e1667605b8192deea5a92bb31df05d79d1ca2 Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Thu, 11 Jun 2026 16:47:19 -0400 Subject: [PATCH] feat(vortex-geo): native Geometry extension type with point support Add a single logical extension type `vortex.geo.geometry` for GeoArrow-native geometry. The geometry kind (point, linestring, ...) and the CRS live in the extension metadata; the storage dtype is the kind's GeoArrow separated coordinate layout, and the coordinate dimension is recovered from the storage field names. Only point columns are supported end to end so far; other kinds are rejected at dtype construction until their scalar unpacking and kernels exist. Add a `vortex.geo.distance` scalar function computing planar (Euclidean) distance. The signature takes geometry operands and execution dispatches on their kinds, with a point x point kernel; operands are type-checked at construction to be non-nullable point columns sharing a CRS. Signed-off-by: Nemo Yu --- Cargo.lock | 1 + vortex-geo/Cargo.toml | 1 + .../src/extension/geometry/coordinate.rs | 270 ++++++++++++ vortex-geo/src/extension/geometry/mod.rs | 413 ++++++++++++++++++ vortex-geo/src/extension/mod.rs | 85 +++- vortex-geo/src/extension/wkb.rs | 5 +- vortex-geo/src/lib.rs | 13 +- vortex-geo/src/scalar_fn/distance.rs | 361 +++++++++++++++ vortex-geo/src/scalar_fn/mod.rs | 6 + 9 files changed, 1148 insertions(+), 7 deletions(-) create mode 100644 vortex-geo/src/extension/geometry/coordinate.rs create mode 100644 vortex-geo/src/extension/geometry/mod.rs create mode 100644 vortex-geo/src/scalar_fn/distance.rs create mode 100644 vortex-geo/src/scalar_fn/mod.rs diff --git a/Cargo.lock b/Cargo.lock index b6f67682c4a..4bc788f48d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10269,6 +10269,7 @@ dependencies = [ "geo-types", "geoarrow", "prost 0.14.4", + "rstest", "vortex-array", "vortex-error", "vortex-session", diff --git a/vortex-geo/Cargo.toml b/vortex-geo/Cargo.toml index 4e76e2eefcb..3b09cb77cf8 100644 --- a/vortex-geo/Cargo.toml +++ b/vortex-geo/Cargo.toml @@ -26,6 +26,7 @@ wkb = { workspace = true } [dev-dependencies] geo-traits = { workspace = true } geo-types = { workspace = true } +rstest = { workspace = true } [lints] workspace = true diff --git a/vortex-geo/src/extension/geometry/coordinate.rs b/vortex-geo/src/extension/geometry/coordinate.rs new file mode 100644 index 00000000000..6fc555a6bab --- /dev/null +++ b/vortex-geo/src/extension/geometry/coordinate.rs @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Coordinate building blocks for the [`Geometry`](crate::extension::Geometry) extension type: +//! the `Struct` storage, its [`Dimension`], and the decoded +//! [`Coordinate`] value. +//! +//! The coordinate fields, where `{}` marks a field that may be absent (every field present is +//! non-nullable), are: +//! - `x` — longitude or easting +//! - `y` — latitude or northing +//! - `z` — elevation +//! - `m` — measure: an arbitrary per-point value such as distance along a route or a timestamp + +use std::fmt::Display; +use std::fmt::Formatter; + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrays::struct_::StructArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldNames; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::scalar::Scalar; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; + +use crate::extension::Geometry; +use crate::extension::GeometryKind; + +/// Coordinate dimensions, matching GeoArrow. Field order is fixed: `x`, `y`, then `z` before `m`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum Dimension { + /// 2D: `x`, `y`. + Xy, + /// 3D with elevation: `x`, `y`, `z`. + Xyz, + /// 3D with a measure: `x`, `y`, `m`. + Xym, + /// 4D: `x`, `y`, `z`, `m`. + Xyzm, +} + +impl Dimension { + /// Recover the dimension from a coordinate's field names, in GeoArrow order. + pub(crate) fn from_field_names(names: &FieldNames) -> VortexResult { + let mut strs = [""; 4]; + vortex_ensure!( + names.len() <= strs.len(), + "not a valid GeoArrow coordinate dimension: {names:?}" + ); + for (slot, name) in strs.iter_mut().zip(names.iter()) { + *slot = name.as_ref(); + } + Ok(match &strs[..names.len()] { + ["x", "y"] => Dimension::Xy, + ["x", "y", "z"] => Dimension::Xyz, + ["x", "y", "m"] => Dimension::Xym, + ["x", "y", "z", "m"] => Dimension::Xyzm, + _ => vortex_bail!("not a valid GeoArrow coordinate dimension: {names:?}"), + }) + } +} + +/// A decoded coordinate. `z`/`m` are `Some` iff the storage dimension includes them. +/// +/// This is the native value produced when unpacking a point-kind +/// [`Geometry`](crate::extension::Geometry) scalar; the rest of the coordinate machinery is +/// crate-internal. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct Coordinate { + /// The x (longitude/easting) ordinate. + pub x: f64, + /// The y (latitude/northing) ordinate. + pub y: f64, + /// The optional `z` (elevation) ordinate. + pub z: Option, + /// The optional `m` (measure) ordinate. + pub m: Option, +} + +impl Coordinate { + /// A 2D coordinate (`z`/`m` unset). + pub fn xy(x: f64, y: f64) -> Self { + Coordinate { + x, + y, + z: None, + m: None, + } + } +} + +impl Display for Coordinate { + fn fmt(&self, fmt: &mut Formatter<'_>) -> std::fmt::Result { + match (self.z, self.m) { + (None, None) => write!(fmt, "POINT({} {})", self.x, self.y), + (Some(z), None) => write!(fmt, "POINT Z ({} {} {})", self.x, self.y, z), + (None, Some(m)) => write!(fmt, "POINT M ({} {} {})", self.x, self.y, m), + (Some(z), Some(m)) => write!(fmt, "POINT ZM ({} {} {} {})", self.x, self.y, z, m), + } + } +} + +/// Validate that `dtype` is a coordinate struct of non-nullable `f64` fields, returning its +/// [`Dimension`]. Any of the four GeoArrow dimensions validates. +pub(crate) fn coordinate_dimension(dtype: &DType) -> VortexResult { + let DType::Struct(fields, _) = dtype else { + vortex_bail!("coordinate storage must be a Struct, was {dtype}"); + }; + for (name, field) in fields.names().iter().zip(fields.fields()) { + vortex_ensure!( + matches!( + field, + DType::Primitive(PType::F64, Nullability::NonNullable) + ), + "coordinate field {name} must be non-nullable f64, was {field}" + ); + } + Dimension::from_field_names(fields.names()) +} + +/// Decode a [`Coordinate`] from a coordinate `Struct` scalar (`z`/`m` read iff +/// present, so the same decoder serves every dimension). +pub(crate) fn coordinate_from_struct(scalar: &Scalar) -> VortexResult { + let fields = scalar.as_struct(); + let required = |name: &str| -> VortexResult { + f64::try_from( + &fields + .field(name) + .ok_or_else(|| vortex_err!("coordinate missing {name}"))?, + ) + }; + let optional = |name: &str| -> VortexResult> { + fields + .field(name) + .map(|value| f64::try_from(&value)) + .transpose() + }; + Ok(Coordinate { + x: required("x")?, + y: required("y")?, + z: optional("z")?, + m: optional("m")?, + }) +} + +/// Decode a [`Coordinate`] from an extension-typed point scalar (unwrapped to its coordinate +/// storage) or a bare coordinate `Struct` scalar. The per-row decode used by the distance fns. +pub(crate) fn coordinate_from_scalar(scalar: &Scalar) -> VortexResult { + match scalar.as_extension_opt() { + Some(ext_scalar) => coordinate_from_struct(&ext_scalar.to_storage_scalar()), + None => coordinate_from_struct(scalar), + } +} + +/// Validated, executed `x`/`y` columns of a point array. The bulk counterpart to [`Coordinate`]; +/// `z`/`m` are not executed. +pub(crate) struct ParsedCoordinates { + /// The flat `f64` `x` column. + pub(crate) xs: PrimitiveArray, + /// The flat `f64` `y` column. + pub(crate) ys: PrimitiveArray, +} + +/// Validate a point column's geometry kind and coordinate storage (layout and non-nullability), +/// then execute its `x`/`y` columns. +pub(crate) fn parse_storage( + points: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult { + if let Some(ext) = points.dtype().as_extension_opt() + && ext.is::() + { + let kind = ext.metadata::().kind()?; + vortex_ensure!( + kind == GeometryKind::Point, + "expected a point column, was {kind}" + ); + } + let storage = points + .clone() + .execute::(ctx)? + .storage_array() + .clone() + .execute::(ctx)?; + coordinate_dimension(storage.dtype())?; + vortex_ensure!( + !storage.dtype().is_nullable(), + "coordinate storage must be non-nullable to read unmasked ordinates, was {}", + storage.dtype() + ); + let xs = storage + .unmasked_field_by_name("x")? + .clone() + .execute::(ctx)?; + let ys = storage + .unmasked_field_by_name("y")? + .clone() + .execute::(ctx)?; + Ok(ParsedCoordinates { xs, ys }) +} + +#[cfg(test)] +mod tests { + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::StructArray; + use vortex_array::dtype::FieldNames; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::Coordinate; + use super::parse_storage; + use crate::extension::Geometry; + use crate::extension::GeometryKind; + + /// Display emits WKT, including `z`/`m` when present. + #[test] + fn display_is_wkt() { + let coordinate = |z, m| Coordinate { + x: 1.0, + y: 2.0, + z, + m, + }; + assert_eq!(coordinate(None, None).to_string(), "POINT(1 2)"); + assert_eq!(coordinate(Some(3.0), None).to_string(), "POINT Z (1 2 3)"); + assert_eq!(coordinate(None, Some(4.0)).to_string(), "POINT M (1 2 4)"); + assert_eq!( + coordinate(Some(3.0), Some(4.0)).to_string(), + "POINT ZM (1 2 3 4)" + ); + } + + /// [`parse_storage`] reads the coordinate fields unmasked, so a nullable point column must + /// be rejected at parse time rather than decoding null rows as garbage ordinates. + #[test] + fn parse_rejects_nullable_points() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let storage = StructArray::try_new( + FieldNames::from(["x", "y"]), + vec![ + PrimitiveArray::from_iter(vec![1.0f64]).into_array(), + PrimitiveArray::from_iter(vec![2.0f64]).into_array(), + ], + 1, + Validity::AllValid, + )? + .into_array(); + let dtype = Geometry::dtype(GeometryKind::Point, None, storage.dtype().clone())?; + let points = ExtensionArray::new(dtype.erased(), storage).into_array(); + + assert!(parse_storage(&points, &mut ctx).is_err()); + Ok(()) + } +} diff --git a/vortex-geo/src/extension/geometry/mod.rs b/vortex-geo/src/extension/geometry/mod.rs new file mode 100644 index 00000000000..2a1604c4e07 --- /dev/null +++ b/vortex-geo/src/extension/geometry/mod.rs @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! The [`Geometry`] extension type (`vortex.geo.geometry`): one logical type for all +//! GeoArrow-native geometry kinds. + +pub(crate) mod coordinate; + +use std::fmt::Display; +use std::fmt::Formatter; + +use prost::Message; +use vortex_array::dtype::DType; +use vortex_array::dtype::extension::ExtDType; +use vortex_array::dtype::extension::ExtId; +use vortex_array::dtype::extension::ExtVTable; +use vortex_array::scalar::Scalar; +use vortex_array::scalar::ScalarValue; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; + +use self::coordinate::Coordinate; +use self::coordinate::Dimension; +use self::coordinate::coordinate_dimension; +use self::coordinate::coordinate_from_struct; +use super::GeoMetadata; +use super::GeometryKind; + +/// The `vortex.geo.geometry` extension type: native geometry of a single [`GeometryKind`]. +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] +pub struct Geometry; + +impl Geometry { + /// The extension dtype for a column of `kind` geometries over `storage`. + pub fn dtype( + kind: GeometryKind, + crs: Option, + storage: DType, + ) -> VortexResult> { + ExtDType::try_new( + GeoMetadata { + crs, + geometry_type: kind as i32, + }, + storage, + ) + } + + /// The [`GeometryKind`] of a geometry-typed `dtype`; errors on non-geometry dtypes. + pub fn kind_of(dtype: &DType) -> VortexResult { + let Some(ext) = dtype.as_extension_opt() else { + vortex_bail!("expected a geometry column, was {dtype}"); + }; + vortex_ensure!( + ext.is::(), + "expected a geometry column, was {dtype}" + ); + ext.metadata::().kind() + } +} + +/// Validate that `storage` is `kind`'s GeoArrow layout. +/// +/// Only point columns are supported end to end; other kinds are rejected here until their +/// scalar unpacking and kernels exist, so that a valid dtype never produces arrays whose +/// scalars fail to unpack. +fn validate_storage(kind: GeometryKind, storage: &DType) -> VortexResult { + match kind { + GeometryKind::Unspecified => { + vortex_bail!("geometry kind must be specified; mixed columns are not yet supported") + } + GeometryKind::Point => coordinate_dimension(storage), + kind => vortex_bail!("{kind} geometry columns are not yet supported"), + } +} + +impl ExtVTable for Geometry { + type Metadata = GeoMetadata; + type NativeValue<'a> = GeometryValue; + + fn id(&self) -> ExtId { + ExtId::new_static("vortex.geo.geometry") + } + + fn serialize_metadata(&self, metadata: &Self::Metadata) -> VortexResult> { + Ok(metadata.encode_to_vec()) + } + + fn deserialize_metadata(&self, metadata: &[u8]) -> VortexResult { + Ok(GeoMetadata::decode(metadata)?) + } + + fn validate_dtype(ext_dtype: &ExtDType) -> VortexResult<()> { + validate_storage(ext_dtype.metadata().kind()?, ext_dtype.storage_dtype()).map(|_| ()) + } + + fn unpack_native<'a>( + ext_dtype: &'a ExtDType, + storage_value: &'a ScalarValue, + ) -> VortexResult { + match ext_dtype.metadata().kind()? { + GeometryKind::Point => { + let storage = Scalar::try_new( + ext_dtype.storage_dtype().clone(), + Some(storage_value.clone()), + )?; + coordinate_from_struct(&storage).map(GeometryValue::Point) + } + kind => vortex_bail!("unpacking {kind} scalars is not yet implemented"), + } + } +} + +/// A decoded native geometry scalar value. +#[derive(Debug, Clone, PartialEq)] +#[non_exhaustive] +pub enum GeometryValue { + /// A single coordinate. + Point(Coordinate), + /// A line of coordinates. + LineString(Vec), + /// An outer ring plus zero or more interior rings (holes). + Polygon(Vec>), + /// A collection of points. + MultiPoint(Vec), + /// A collection of linestrings. + MultiLineString(Vec>), + /// A collection of polygons. + MultiPolygon(Vec>>), +} + +/// The WKT dimension tag (`" Z"`, `" M"`, `" ZM"`, or empty), read from `first`. +fn dimension_tag(first: Option<&Coordinate>) -> &'static str { + match first.map(|coordinate| (coordinate.z, coordinate.m)) { + None | Some((None, None)) => "", + Some((Some(_), None)) => " Z", + Some((None, Some(_))) => " M", + Some((Some(_), Some(_))) => " ZM", + } +} + +/// Write the bare ordinates of one coordinate: `x y {z} {m}`. +fn fmt_ordinates(f: &mut Formatter<'_>, coordinate: &Coordinate) -> std::fmt::Result { + write!(f, "{} {}", coordinate.x, coordinate.y)?; + if let Some(z) = coordinate.z { + write!(f, " {z}")?; + } + if let Some(m) = coordinate.m { + write!(f, " {m}")?; + } + Ok(()) +} + +/// Write `(…, …)`, formatting each element with `item`. +fn fmt_seq( + f: &mut Formatter<'_>, + items: &[T], + item: impl Fn(&mut Formatter<'_>, &T) -> std::fmt::Result, +) -> std::fmt::Result { + write!(f, "(")?; + for (idx, value) in items.iter().enumerate() { + if idx > 0 { + write!(f, ", ")?; + } + item(f, value)?; + } + write!(f, ")") +} + +/// Write a WKT body: ` EMPTY` if empty, otherwise ` {tag} (…)`. +fn fmt_body( + f: &mut Formatter<'_>, + items: &[T], + first: Option<&Coordinate>, + item: impl Fn(&mut Formatter<'_>, &T) -> std::fmt::Result, +) -> std::fmt::Result { + if items.is_empty() { + return write!(f, " EMPTY"); + } + write!(f, "{} ", dimension_tag(first))?; + fmt_seq(f, items, item) +} + +impl Display for GeometryValue { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let coords = + |f: &mut Formatter<'_>, line: &Vec| fmt_seq(f, line, fmt_ordinates); + let rings = |f: &mut Formatter<'_>, rings: &Vec>| fmt_seq(f, rings, coords); + match self { + GeometryValue::Point(coordinate) => Display::fmt(coordinate, f), + GeometryValue::LineString(line) => { + write!(f, "LINESTRING")?; + fmt_body(f, line, line.first(), fmt_ordinates) + } + GeometryValue::MultiPoint(points) => { + write!(f, "MULTIPOINT")?; + fmt_body(f, points, points.first(), fmt_ordinates) + } + GeometryValue::Polygon(polygon) => { + write!(f, "POLYGON")?; + let first = polygon.first().and_then(|ring| ring.first()); + fmt_body(f, polygon, first, coords) + } + GeometryValue::MultiLineString(lines) => { + write!(f, "MULTILINESTRING")?; + let first = lines.first().and_then(|line| line.first()); + fmt_body(f, lines, first, coords) + } + GeometryValue::MultiPolygon(polygons) => { + write!(f, "MULTIPOLYGON")?; + let first = polygons + .first() + .and_then(|polygon| polygon.first()) + .and_then(|ring| ring.first()); + fmt_body(f, polygons, first, rings) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use rstest::rstest; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::StructArray; + use vortex_array::dtype::DType; + use vortex_array::dtype::FieldNames; + use vortex_array::dtype::Nullability; + use vortex_array::dtype::PType; + use vortex_array::dtype::StructFields; + use vortex_array::session::ArraySession; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::Geometry; + use super::GeometryValue; + use super::coordinate::Coordinate; + use super::coordinate::Dimension; + use super::coordinate::coordinate_dimension; + use super::coordinate::coordinate_from_scalar; + use crate::extension::GeometryKind; + + /// A coordinate storage dtype with the given field names, non-nullable `f64` per field. + fn coordinate_dtype(names: &[&'static str]) -> DType { + let fields = std::iter::repeat_n( + DType::Primitive(PType::F64, Nullability::NonNullable), + names.len(), + ) + .collect::>(); + DType::Struct( + StructFields::new(FieldNames::from(names), fields), + Nullability::NonNullable, + ) + } + + /// `storage` wrapped in `depth` non-nullable `List` layers. + fn nested_list(storage: DType, depth: usize) -> DType { + let mut dtype = storage; + for _ in 0..depth { + dtype = DType::List(Arc::new(dtype), Nullability::NonNullable); + } + dtype + } + + /// All four GeoArrow dimensions validate as point storage and round-trip via field names. + #[test] + fn point_validates_every_dimension() -> VortexResult<()> { + let cases = [ + (Dimension::Xy, ["x", "y"].as_slice()), + (Dimension::Xyz, ["x", "y", "z"].as_slice()), + (Dimension::Xym, ["x", "y", "m"].as_slice()), + (Dimension::Xyzm, ["x", "y", "z", "m"].as_slice()), + ]; + for (dim, names) in cases { + let storage = coordinate_dtype(names); + assert_eq!(coordinate_dimension(&storage)?, dim); + Geometry::dtype(GeometryKind::Point, Some("EPSG:4326".to_string()), storage)?; + } + Ok(()) + } + + /// Non-point kinds are rejected at dtype construction — even with their correct GeoArrow + /// layout — until their scalar unpacking and kernels exist. + #[rstest] + #[case(GeometryKind::LineString, 1)] + #[case(GeometryKind::MultiPoint, 1)] + #[case(GeometryKind::Polygon, 2)] + #[case(GeometryKind::MultiLineString, 2)] + #[case(GeometryKind::MultiPolygon, 3)] + fn non_point_kinds_are_rejected(#[case] kind: GeometryKind, #[case] depth: usize) { + let storage = nested_list(coordinate_dtype(&["x", "y"]), depth); + assert!(Geometry::dtype(kind, None, storage).is_err()); + } + + /// Construction rejects non-struct storage, non-coordinate fields, and the `Unspecified` + /// kind. + #[test] + fn rejects_invalid_storage() -> VortexResult<()> { + let primitive = DType::Primitive(PType::F64, Nullability::NonNullable); + assert!(Geometry::dtype(GeometryKind::Point, None, primitive).is_err()); + + let wrong_fields = StructArray::from_fields(&[ + ("a", PrimitiveArray::from_iter(vec![0.0f64]).into_array()), + ("b", PrimitiveArray::from_iter(vec![0.0f64]).into_array()), + ])? + .into_array(); + assert!(Geometry::dtype(GeometryKind::Point, None, wrong_fields.dtype().clone()).is_err()); + + assert!( + Geometry::dtype( + GeometryKind::Unspecified, + None, + coordinate_dtype(&["x", "y"]) + ) + .is_err() + ); + Ok(()) + } + + /// A point column round-trips through scalar execution back to its coordinates. + #[test] + fn point_unpacks_coordinates() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let storage = StructArray::from_fields(&[ + ( + "x", + PrimitiveArray::from_iter(vec![1.0f64, -111.7610]).into_array(), + ), + ( + "y", + PrimitiveArray::from_iter(vec![2.0f64, 34.8697]).into_array(), + ), + ])? + .into_array(); + let dtype = Geometry::dtype( + GeometryKind::Point, + Some("EPSG:4326".to_string()), + storage.dtype().clone(), + )?; + let points = ExtensionArray::new(dtype.erased(), storage).into_array(); + + assert_eq!( + coordinate_from_scalar(&points.execute_scalar(0, &mut ctx)?)?, + Coordinate::xy(1.0, 2.0) + ); + assert_eq!( + coordinate_from_scalar(&points.execute_scalar(1, &mut ctx)?)?, + Coordinate::xy(-111.7610, 34.8697) + ); + Ok(()) + } + + /// `GeometryValue` displays as WKT for every kind, including dimension tags and `EMPTY`. + #[test] + fn display_is_wkt() { + let xy = Coordinate::xy; + assert_eq!(GeometryValue::Point(xy(1.0, 2.0)).to_string(), "POINT(1 2)"); + assert_eq!( + GeometryValue::LineString(vec![xy(0.0, 0.0), xy(1.0, 1.0)]).to_string(), + "LINESTRING (0 0, 1 1)" + ); + assert_eq!( + GeometryValue::LineString(vec![]).to_string(), + "LINESTRING EMPTY" + ); + assert_eq!( + GeometryValue::MultiPoint(vec![xy(0.0, 0.0), xy(1.0, 1.0)]).to_string(), + "MULTIPOINT (0 0, 1 1)" + ); + assert_eq!( + GeometryValue::Polygon(vec![vec![xy(0.0, 0.0), xy(1.0, 0.0), xy(0.0, 0.0)]]) + .to_string(), + "POLYGON ((0 0, 1 0, 0 0))" + ); + assert_eq!( + GeometryValue::MultiLineString(vec![vec![xy(0.0, 0.0), xy(1.0, 1.0)]]).to_string(), + "MULTILINESTRING ((0 0, 1 1))" + ); + assert_eq!( + GeometryValue::MultiPolygon(vec![vec![vec![xy(0.0, 0.0), xy(1.0, 0.0), xy(0.0, 0.0)]]]) + .to_string(), + "MULTIPOLYGON (((0 0, 1 0, 0 0)))" + ); + let zm = Coordinate { + x: 1.0, + y: 2.0, + z: Some(3.0), + m: Some(4.0), + }; + assert_eq!( + GeometryValue::LineString(vec![zm, zm]).to_string(), + "LINESTRING ZM (1 2 3 4, 1 2 3 4)" + ); + let z = Coordinate { m: None, ..zm }; + assert_eq!( + GeometryValue::LineString(vec![z]).to_string(), + "LINESTRING Z (1 2 3)" + ); + let m = Coordinate { z: None, ..zm }; + assert_eq!( + GeometryValue::MultiPoint(vec![m]).to_string(), + "MULTIPOINT M (1 2 4)" + ); + } +} diff --git a/vortex-geo/src/extension/mod.rs b/vortex-geo/src/extension/mod.rs index f08b76ae83d..b5edacb6b3d 100644 --- a/vortex-geo/src/extension/mod.rs +++ b/vortex-geo/src/extension/mod.rs @@ -1,42 +1,103 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +pub(crate) mod geometry; mod wkb; use std::fmt::Display; +pub use geometry::*; +use vortex_error::VortexResult; +use vortex_error::vortex_err; pub use wkb::*; /// Extension metadata that is common to all the geospatial extension types. /// -/// Currently, this is just the coordinate reference system (CRS). -/// We may wish to add a second field for edges interpretation in the future similar to -/// the GeoArrow standard. +/// This carries the coordinate reference system (CRS) and, for native [`Geometry`] columns, the +/// [`GeometryKind`] held by the column. We may wish to add a field for edges interpretation in +/// the future similar to the GeoArrow standard. #[derive(Clone, PartialEq, Eq, Hash, prost::Message)] pub struct GeoMetadata { + /// The coordinate reference system, or `None` for unreferenced geometry. #[prost(optional, string, tag = "1")] pub crs: Option, + + /// The [`GeometryKind`] held by a native [`Geometry`] column. + #[prost(enumeration = "GeometryKind", tag = "2")] + pub geometry_type: i32, +} + +impl GeoMetadata { + /// The decoded [`GeometryKind`] of this metadata. + pub fn kind(&self) -> VortexResult { + GeometryKind::try_from(self.geometry_type) + .map_err(|_| vortex_err!("unknown geometry kind {}", self.geometry_type)) + } } impl Display for GeoMetadata { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Geometry(")?; + match GeometryKind::try_from(self.geometry_type) { + Ok(GeometryKind::Unspecified) => {} + Ok(kind) => write!(f, "{kind}, ")?, + Err(_) => write!(f, "kind={}, ", self.geometry_type)?, + } match self.crs.as_ref() { - Some(crs) => write!(f, "Geometry(crs={crs})"), - None => write!(f, "Geometry(unreferenced)"), + Some(crs) => write!(f, "crs={crs})"), + None => write!(f, "unreferenced)"), } } } +/// The kind of geometry held in a native [`Geometry`] column, matching the GeoArrow native +/// geometry types. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, prost::Enumeration)] +#[repr(i32)] +pub enum GeometryKind { + /// The wire default when no kind was set (e.g. WKB); rejected by [`Geometry`]. + Unspecified = 0, + /// A single location. + Point = 1, + /// A sequence of locations connected into a line. + LineString = 2, + /// One outer ring with zero or more interior rings (holes). + Polygon = 3, + /// A collection of points. + MultiPoint = 4, + /// A collection of linestrings. + MultiLineString = 5, + /// A collection of polygons. + MultiPolygon = 6, +} + +impl Display for GeometryKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let name = match self { + GeometryKind::Unspecified => "unspecified", + GeometryKind::Point => "point", + GeometryKind::LineString => "linestring", + GeometryKind::Polygon => "polygon", + GeometryKind::MultiPoint => "multipoint", + GeometryKind::MultiLineString => "multilinestring", + GeometryKind::MultiPolygon => "multipolygon", + }; + write!(f, "{name}") + } +} + #[cfg(test)] mod tests { use prost::Message; use crate::extension::GeoMetadata; + use crate::extension::GeometryKind; #[test] fn test_metadata() { let meta = GeoMetadata { crs: Some("EPSG:4326".to_string()), + ..Default::default() }; assert_eq!(meta.to_string(), "Geometry(crs=EPSG:4326)"); @@ -45,4 +106,18 @@ mod tests { let decoded = GeoMetadata::decode(bytes.as_slice()).unwrap(); assert_eq!(decoded, meta); } + + #[test] + fn test_metadata_with_kind() { + let meta = GeoMetadata { + crs: Some("EPSG:4326".to_string()), + geometry_type: GeometryKind::Point as i32, + }; + + assert_eq!(meta.to_string(), "Geometry(point, crs=EPSG:4326)"); + let bytes = meta.encode_to_vec(); + let decoded = GeoMetadata::decode(bytes.as_slice()).unwrap(); + assert_eq!(decoded, meta); + assert_eq!(decoded.kind().unwrap(), GeometryKind::Point); + } } diff --git a/vortex-geo/src/extension/wkb.rs b/vortex-geo/src/extension/wkb.rs index 172da41b93b..e49c8587f86 100644 --- a/vortex-geo/src/extension/wkb.rs +++ b/vortex-geo/src/extension/wkb.rs @@ -313,5 +313,8 @@ fn geo_metadata(wkb_type: &WkbType) -> GeoMetadata { .map(str::to_string) .unwrap_or_else(|| value.to_string()) }); - GeoMetadata { crs } + GeoMetadata { + crs, + ..Default::default() + } } diff --git a/vortex-geo/src/lib.rs b/vortex-geo/src/lib.rs index 513caf85d92..1f4e401c573 100644 --- a/vortex-geo/src/lib.rs +++ b/vortex-geo/src/lib.rs @@ -5,17 +5,26 @@ use std::sync::Arc; use vortex_array::arrow::ArrowSessionExt; use vortex_array::dtype::session::DTypeSessionExt; +use vortex_array::scalar_fn::session::ScalarFnSessionExt; use vortex_session::VortexSession; +use crate::extension::Geometry; use crate::extension::WellKnownBinary; +use crate::scalar_fn::distance::GeoDistance; pub mod extension; +pub mod scalar_fn; + /// Set up a session with support for geospatial extension types, encodings and layouts. pub fn initialize(session: &VortexSession) { - // register geospatial extension types + // Register the geospatial extension types. session.dtypes().register(WellKnownBinary); session.arrow().register_exporter(Arc::new(WellKnownBinary)); session.arrow().register_importer(Arc::new(WellKnownBinary)); + session.dtypes().register(Geometry); + + // Register the geometry scalar functions. + session.scalar_fns().register(GeoDistance); } #[cfg(test)] @@ -95,6 +104,7 @@ mod tests { let dtype = ExtDType::::try_new( GeoMetadata { crs: Some("EPSG:4326".to_string()), + ..Default::default() }, DType::Binary(Nullability::NonNullable), )?; @@ -135,6 +145,7 @@ mod tests { let dtype = ExtDType::::try_new( GeoMetadata { crs: Some("EPSG:4326".to_string()), + ..Default::default() }, DType::Binary(Nullability::NonNullable), )?; diff --git a/vortex-geo/src/scalar_fn/distance.rs b/vortex-geo/src/scalar_fn/distance.rs new file mode 100644 index 00000000000..cbec9f48081 --- /dev/null +++ b/vortex-geo/src/scalar_fn/distance.rs @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Straight-line (Euclidean) distance between geometries; "planar" distance in GIS terms. + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::Constant; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::ScalarFnArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::Arity; +use vortex_array::scalar_fn::ChildName; +use vortex_array::scalar_fn::EmptyOptions; +use vortex_array::scalar_fn::ExecutionArgs; +use vortex_array::scalar_fn::ScalarFnId; +use vortex_array::scalar_fn::ScalarFnVTable; +use vortex_array::scalar_fn::TypedScalarFnInstance; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_session::VortexSession; + +use crate::extension::Geometry; +use crate::extension::GeometryKind; +use crate::extension::geometry::coordinate::coordinate_from_scalar; +use crate::extension::geometry::coordinate::parse_storage; + +/// Straight-line (L2) distance between `(ax, ay)` and `(bx, by)`. +fn euclidean_distance(ax: f64, ay: f64, bx: f64, by: f64) -> f64 { + let dx = ax - bx; + let dy = ay - by; + (dx * dx + dy * dy).sqrt() +} + +/// Planar (Euclidean) distance between two geometry columns, like PostGIS `ST_Distance`; +/// `z`/`m` are ignored. +/// +/// Operands are type-checked at construction: point kind (the only kernel so far), +/// non-nullable, and sharing a CRS. Execution dispatches on the operands' [`GeometryKind`]s. +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] +pub struct GeoDistance; + +impl GeoDistance { + /// A lazy `ScalarFnArray` of per-row distances between `a` and `b`, with `a`'s length. + pub fn try_new_array(a: ArrayRef, b: ArrayRef) -> VortexResult { + let len = a.len(); + ScalarFnArray::try_new( + TypedScalarFnInstance::new(GeoDistance, EmptyOptions).erased(), + vec![a, b], + len, + ) + } +} + +impl ScalarFnVTable for GeoDistance { + type Options = EmptyOptions; + + fn id(&self) -> ScalarFnId { + ScalarFnId::new("vortex.geo.distance") + } + + fn serialize(&self, _: &Self::Options) -> VortexResult>> { + Ok(Some(vec![])) + } + + fn deserialize(&self, _: &[u8], _: &VortexSession) -> VortexResult { + Ok(EmptyOptions) + } + + fn arity(&self, _: &Self::Options) -> Arity { + Arity::Exact(2) + } + + fn child_name(&self, _: &Self::Options, child_idx: usize) -> ChildName { + match child_idx { + 0 => ChildName::from("a"), + 1 => ChildName::from("b"), + _ => unreachable!("distance has exactly two children"), + } + } + + fn return_dtype(&self, _: &Self::Options, arg_dtypes: &[DType]) -> VortexResult { + for dtype in arg_dtypes { + let kind = Geometry::kind_of(dtype)?; + vortex_ensure!( + kind == GeometryKind::Point, + "distance over {kind} geometries is not yet implemented" + ); + vortex_ensure!( + !dtype.is_nullable(), + "distance over nullable geometry is not yet supported, was {dtype}" + ); + } + if let [a, b] = arg_dtypes { + let a_crs = &a.as_extension().metadata::().crs; + let b_crs = &b.as_extension().metadata::().crs; + vortex_ensure!( + a_crs == b_crs, + "distance operands must share a CRS, was {} and {}", + a_crs.as_deref().unwrap_or("unreferenced"), + b_crs.as_deref().unwrap_or("unreferenced") + ); + } + Ok(DType::Primitive(PType::F64, Nullability::NonNullable)) + } + + fn execute( + &self, + _: &Self::Options, + args: &dyn ExecutionArgs, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let a = args.get(0)?; + let b = args.get(1)?; + match (Geometry::kind_of(a.dtype())?, Geometry::kind_of(b.dtype())?) { + (GeometryKind::Point, GeometryKind::Point) => point_to_point_distance(&a, &b, ctx), + (a_kind, b_kind) => { + vortex_bail!("distance({a_kind}, {b_kind}) is not yet implemented") + } + } + } +} + +/// Per-row distance between two point columns; either operand (or both) may be constant. +fn point_to_point_distance( + a: &ArrayRef, + b: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult { + match (a.as_opt::(), b.as_opt::()) { + (Some(qa), Some(qb)) => { + let qa = coordinate_from_scalar(qa.scalar())?; + let qb = coordinate_from_scalar(qb.scalar())?; + let distance = euclidean_distance(qa.x, qa.y, qb.x, qb.y); + Ok(ConstantArray::new( + Scalar::primitive(distance, Nullability::NonNullable), + a.len(), + ) + .into_array()) + } + (Some(query), None) => point_to_constant_distance(b, query.scalar(), ctx), + (None, Some(query)) => point_to_constant_distance(a, query.scalar(), ctx), + (None, None) => { + let a_coords = parse_storage(a, ctx)?; + let b_coords = parse_storage(b, ctx)?; + let distances = a_coords + .xs + .as_slice::() + .iter() + .zip(a_coords.ys.as_slice::()) + .zip( + b_coords + .xs + .as_slice::() + .iter() + .zip(b_coords.ys.as_slice::()), + ) + .map(|((&ax, &ay), (&bx, &by))| euclidean_distance(ax, ay, bx, by)); + Ok(PrimitiveArray::from_iter(distances).into_array()) + } + } +} + +/// Per-row distance from `points` to the constant `query` point, decoded once. Distance is +/// symmetric, so this serves a constant on either side. +fn point_to_constant_distance( + points: &ArrayRef, + query: &Scalar, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let query = coordinate_from_scalar(query)?; + let coords = parse_storage(points, ctx)?; + let distances = coords + .xs + .as_slice::() + .iter() + .zip(coords.ys.as_slice::()) + .map(|(&x, &y)| euclidean_distance(x, y, query.x, query.y)); + Ok(PrimitiveArray::from_iter(distances).into_array()) +} + +#[cfg(test)] +mod tests { + use vortex_array::ArrayRef; + use vortex_array::Canonical; + use vortex_array::ExecutionCtx; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::ConstantArray; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::StructArray; + use vortex_array::dtype::FieldNames; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::GeoDistance; + use super::euclidean_distance; + use crate::extension::Geometry; + use crate::extension::GeometryKind; + + /// A point `Geometry` column with the given CRS over the given x/y coordinates. + fn point_column_with_crs( + xs: Vec, + ys: Vec, + crs: Option, + ) -> VortexResult { + let storage = StructArray::from_fields(&[ + ("x", PrimitiveArray::from_iter(xs).into_array()), + ("y", PrimitiveArray::from_iter(ys).into_array()), + ])? + .into_array(); + let dtype = Geometry::dtype(GeometryKind::Point, crs, storage.dtype().clone())?; + Ok(ExtensionArray::new(dtype.erased(), storage).into_array()) + } + + /// A point `Geometry` column (CRS `EPSG:4326`) over the given x/y coordinates. + fn point_column(xs: Vec, ys: Vec) -> VortexResult { + point_column_with_crs(xs, ys, Some("EPSG:4326".to_string())) + } + + /// A constant point column of length `len`, every row at `(x, y)`. + fn point_constant( + x: f64, + y: f64, + len: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let single = point_column(vec![x], vec![y])?.execute_scalar(0, ctx)?; + Ok(ConstantArray::new(single, len).into_array()) + } + + /// Execute a `GeoDistance` array and read back its per-row `f64` distances. + fn distances(distance: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult> { + Ok(distance + .execute::(ctx)? + .into_primitive() + .as_slice::() + .to_vec()) + } + + /// The kernel computes straight-line distance (the 3–4–5 triangle). + #[test] + fn euclidean_distance_is_straight_line() { + assert_eq!(euclidean_distance(0.0, 0.0, 3.0, 4.0), 5.0); + assert_eq!(euclidean_distance(1.5, -1.5, 1.5, -1.5), 0.0); + } + + /// Per-row distance between a point column and a constant query point. + #[test] + fn distance_over_points() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let a = point_column(vec![0.0, 3.0, 0.0, 3.0], vec![0.0, 0.0, 4.0, 4.0])?; + let b = point_constant(0.0, 0.0, 4, &mut ctx)?; + let distance = GeoDistance::try_new_array(a, b)?.into_array(); + + assert_eq!(distances(distance, &mut ctx)?, vec![0.0, 3.0, 4.0, 5.0]); + Ok(()) + } + + /// Column-to-column distance pairs corresponding rows of the two columns. + #[test] + fn distance_between_columns() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let a = point_column(vec![0.0, 1.0], vec![0.0, 1.0])?; + let b = point_column(vec![3.0, 1.0], vec![4.0, 1.0])?; + let distance = GeoDistance::try_new_array(a, b)?.into_array(); + + assert_eq!(distances(distance, &mut ctx)?, vec![5.0, 0.0]); + Ok(()) + } + + /// The constant query point may be either operand; distance is symmetric. + #[test] + fn distance_with_constant_first_operand() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let a = point_constant(0.0, 0.0, 4, &mut ctx)?; + let b = point_column(vec![0.0, 3.0, 0.0, 3.0], vec![0.0, 0.0, 4.0, 4.0])?; + let distance = GeoDistance::try_new_array(a, b)?.into_array(); + + assert_eq!(distances(distance, &mut ctx)?, vec![0.0, 3.0, 4.0, 5.0]); + Ok(()) + } + + /// Two constant operands: every row has the same distance. + #[test] + fn distance_between_two_constants() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let a = point_constant(0.0, 0.0, 3, &mut ctx)?; + let b = point_constant(3.0, 4.0, 3, &mut ctx)?; + let distance = GeoDistance::try_new_array(a, b)?.into_array(); + + assert_eq!(distances(distance, &mut ctx)?, vec![5.0, 5.0, 5.0]); + Ok(()) + } + + /// Distance is signed `(geometry, geometry)`: a non-geometry operand is rejected at + /// construction. + #[test] + fn distance_rejects_non_geometry_operands() -> VortexResult<()> { + let a = point_column(vec![0.0], vec![0.0])?; + let b = PrimitiveArray::from_iter(vec![1.0f64]).into_array(); + assert!(GeoDistance::try_new_array(a, b).is_err()); + Ok(()) + } + + /// Operands in different (or missing vs. set) CRS are rejected at construction: their raw + /// ordinates are not comparable. + #[test] + fn distance_rejects_mismatched_crs() -> VortexResult<()> { + let a = point_column(vec![0.0], vec![0.0])?; + let b = point_column_with_crs(vec![1.0], vec![1.0], Some("EPSG:3857".to_string()))?; + assert!(GeoDistance::try_new_array(a, b).is_err()); + + let a = point_column(vec![0.0], vec![0.0])?; + let unreferenced = point_column_with_crs(vec![1.0], vec![1.0], None)?; + assert!(GeoDistance::try_new_array(a, unreferenced).is_err()); + Ok(()) + } + + /// Nullable point columns are rejected at construction until validity propagation exists. + #[test] + fn distance_rejects_nullable_points() -> VortexResult<()> { + let storage = StructArray::try_new( + FieldNames::from(["x", "y"]), + vec![ + PrimitiveArray::from_iter(vec![1.0f64]).into_array(), + PrimitiveArray::from_iter(vec![2.0f64]).into_array(), + ], + 1, + Validity::AllValid, + )? + .into_array(); + let dtype = Geometry::dtype( + GeometryKind::Point, + Some("EPSG:4326".to_string()), + storage.dtype().clone(), + )?; + let nullable = ExtensionArray::new(dtype.erased(), storage).into_array(); + + let b = point_column(vec![2.0], vec![2.0])?; + assert!(GeoDistance::try_new_array(nullable, b).is_err()); + Ok(()) + } +} diff --git a/vortex-geo/src/scalar_fn/mod.rs b/vortex-geo/src/scalar_fn/mod.rs new file mode 100644 index 00000000000..ef9f220218a --- /dev/null +++ b/vortex-geo/src/scalar_fn/mod.rs @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Geometry scalar functions over the [`Geometry`](crate::extension::Geometry) type. + +pub mod distance;