Skip to content

Commit 3d7bbfb

Browse files
authored
WKB import/export VTables to Arrow (#8325)
## Summary Part of #7686. Following up on #7722, which added the WellKnownBinary extension type and handling for exporting to DuckDB vectors. This PR adds support for import/export to Arrow for the extension type. ## Testing Unit tests are added to exercise both code paths. --------- Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent a0203f0 commit 3d7bbfb

4 files changed

Lines changed: 461 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 55 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-geo/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ rust-version.workspace = true
1414
version.workspace = true
1515

1616
[dependencies]
17+
arrow-array = { workspace = true }
18+
arrow-schema = { workspace = true }
19+
geoarrow = { workspace = true }
1720
prost = { workspace = true }
1821
vortex-array = { workspace = true }
1922
vortex-error = { workspace = true }

vortex-geo/src/extension/wkb.rs

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,32 @@
33

44
use std::fmt::Display;
55
use std::ops::Deref;
6+
use std::sync::Arc;
67

8+
use arrow_array::ArrayRef as ArrowArrayRef;
9+
use arrow_schema::DataType;
10+
use arrow_schema::Field;
11+
use arrow_schema::extension::ExtensionType;
12+
use geoarrow::array::GenericWkbArray;
13+
use geoarrow::array::IntoArrow;
14+
use geoarrow::array::WkbViewArray;
15+
use geoarrow::datatypes::Crs;
16+
use geoarrow::datatypes::Metadata;
17+
use geoarrow::datatypes::WkbType;
718
use prost::Message;
819
use vortex_array::ArrayRef;
20+
use vortex_array::ExecutionCtx;
21+
use vortex_array::IntoArray;
922
use vortex_array::arrays::ExtensionArray;
1023
use vortex_array::arrays::extension::ExtensionArrayExt;
24+
use vortex_array::arrow::ArrowExport;
25+
use vortex_array::arrow::ArrowExportVTable;
26+
use vortex_array::arrow::ArrowImport;
27+
use vortex_array::arrow::ArrowImportVTable;
28+
use vortex_array::arrow::ArrowSession;
29+
use vortex_array::arrow::ArrowSessionExt;
30+
use vortex_array::arrow::FromArrowArray;
31+
use vortex_array::dtype::DType;
1132
use vortex_array::dtype::extension::ExtDType;
1233
use vortex_array::dtype::extension::ExtId;
1334
use vortex_array::dtype::extension::ExtVTable;
@@ -17,6 +38,8 @@ use vortex_error::VortexResult;
1738
use vortex_error::vortex_bail;
1839
use vortex_error::vortex_ensure;
1940
use vortex_error::vortex_err;
41+
use vortex_session::registry::CachedId;
42+
use vortex_session::registry::Id;
2043
use wkb::reader::GeometryType;
2144

2245
use crate::extension::GeoMetadata;
@@ -140,3 +163,155 @@ impl ExtVTable for WellKnownBinary {
140163
Wkb::try_from_bytes(storage_value.as_binary().as_slice())
141164
}
142165
}
166+
167+
/// Registry id built from the GeoArrow WKB extension name (`WkbType::NAME`).
///
/// Both the Arrow export and import vtables for `WellKnownBinary` return this value from
/// `arrow_ext_id`, so the two directions are keyed by the same Arrow extension name.
static ARROW_WKB: CachedId = CachedId::new(WkbType::NAME);
168+
169+
impl ArrowExportVTable for WellKnownBinary {
170+
fn arrow_ext_id(&self) -> Id {
171+
*ARROW_WKB
172+
}
173+
174+
fn vortex_id(&self) -> Id {
175+
self.id()
176+
}
177+
178+
fn to_arrow_field(
179+
&self,
180+
name: &str,
181+
dtype: &DType,
182+
session: &ArrowSession,
183+
) -> VortexResult<Option<Field>> {
184+
let ext_type = dtype.as_extension();
185+
let geo_metadata = ext_type.metadata::<WellKnownBinary>();
186+
187+
let mut field = session.to_arrow_field(name, ext_type.storage_dtype())?;
188+
field.try_with_extension_type(wkb_type(geo_metadata))?;
189+
190+
Ok(Some(field))
191+
}
192+
193+
fn execute_arrow(
194+
&self,
195+
array: ArrayRef,
196+
target: &Field,
197+
ctx: &mut ExecutionCtx,
198+
) -> VortexResult<ArrowExport> {
199+
let is_wkb = array
200+
.dtype()
201+
.as_extension_opt()
202+
.map(|ext| ext.is::<WellKnownBinary>())
203+
.unwrap_or(false);
204+
if !is_wkb {
205+
return Ok(ArrowExport::Unsupported(array));
206+
}
207+
208+
let Ok(wkb_meta) = target.try_extension_type::<WkbType>() else {
209+
return Ok(ArrowExport::Unsupported(array));
210+
};
211+
212+
let executed = array.execute::<ExtensionArray>(ctx)?;
213+
let storage = executed.storage_array().clone();
214+
215+
let storage_field = Field::new(
216+
String::new(),
217+
target.data_type().clone(),
218+
target.is_nullable(),
219+
);
220+
let session = ctx.session().clone();
221+
let arrow_storage = session
222+
.arrow()
223+
.execute_arrow(storage, Some(&storage_field), ctx)?;
224+
225+
// Round-trip through the GeoArrow WKB array types: this validates that the storage
226+
// is a binary-family Arrow array and produces the canonical physical representation
227+
// expected for a `WkbType` extension field.
228+
let arrow_ref: ArrowArrayRef = match target.data_type() {
229+
DataType::Binary => Arc::new(
230+
GenericWkbArray::<i32>::try_from((arrow_storage.as_ref(), wkb_meta))
231+
.map_err(|e| vortex_err!("failed to construct WkbArray: {e}"))?
232+
.into_arrow(),
233+
),
234+
DataType::LargeBinary => Arc::new(
235+
GenericWkbArray::<i64>::try_from((arrow_storage.as_ref(), wkb_meta))
236+
.map_err(|e| vortex_err!("failed to construct LargeWkbArray: {e}"))?
237+
.into_arrow(),
238+
),
239+
DataType::BinaryView => Arc::new(
240+
WkbViewArray::try_from((arrow_storage.as_ref(), wkb_meta))
241+
.map_err(|e| vortex_err!("failed to construct WkbViewArray: {e}"))?
242+
.into_arrow(),
243+
),
244+
_ => unreachable!("target data type was validated above"),
245+
};
246+
247+
Ok(ArrowExport::Exported(arrow_ref))
248+
}
249+
}
250+
251+
impl ArrowImportVTable for WellKnownBinary {
252+
fn arrow_ext_id(&self) -> Id {
253+
*ARROW_WKB
254+
}
255+
256+
fn from_arrow_field(&self, field: &Field) -> VortexResult<Option<DType>> {
257+
let Ok(wkb_meta) = field.try_extension_type::<WkbType>() else {
258+
return Ok(None);
259+
};
260+
261+
let storage_dtype = DType::Binary(field.is_nullable().into());
262+
Ok(Some(DType::Extension(
263+
ExtDType::try_with_vtable(WellKnownBinary, geo_metadata(&wkb_meta), storage_dtype)?
264+
.erased(),
265+
)))
266+
}
267+
268+
fn from_arrow_array(
269+
&self,
270+
array: ArrowArrayRef,
271+
field: &Field,
272+
dtype: &DType,
273+
) -> VortexResult<ArrowImport> {
274+
let Some(ext_dtype) = dtype.as_extension_opt() else {
275+
return Ok(ArrowImport::Unsupported(array));
276+
};
277+
if !ext_dtype.is::<WellKnownBinary>()
278+
|| field.try_extension_type::<WkbType>().is_err()
279+
|| !matches!(
280+
array.data_type(),
281+
DataType::Binary | DataType::LargeBinary | DataType::BinaryView
282+
)
283+
{
284+
return Ok(ArrowImport::Unsupported(array));
285+
}
286+
287+
let storage = ArrayRef::from_arrow(array.as_ref(), field.is_nullable())?;
288+
Ok(ArrowImport::Imported(
289+
ExtensionArray::new(ext_dtype.clone(), storage).into_array(),
290+
))
291+
}
292+
}
293+
294+
fn wkb_type(geo_metadata: &GeoMetadata) -> WkbType {
295+
let metadata = Metadata::new(
296+
geo_metadata
297+
.crs
298+
.as_ref()
299+
.map(|crs| Crs::from_unknown_crs_type(crs.to_string()))
300+
.unwrap_or_default(),
301+
None,
302+
);
303+
WkbType::new(Arc::new(metadata))
304+
}
305+
306+
fn geo_metadata(wkb_type: &WkbType) -> GeoMetadata {
307+
let crs = wkb_type.metadata().crs().crs_value().map(|value| {
308+
// `Crs::from_unknown_crs_type` stores the user's string verbatim as a JSON string
309+
// value, so prefer the raw string when available to round-trip cleanly. For other
310+
// CRS encodings (PROJJSON object, etc.), fall back to the JSON-encoded form.
311+
value
312+
.as_str()
313+
.map(str::to_string)
314+
.unwrap_or_else(|| value.to_string())
315+
});
316+
GeoMetadata { crs }
317+
}

0 commit comments

Comments
 (0)