diff --git a/.github/workflows/python-wheels.yml b/.github/workflows/python-wheels.yml index 9e389f29d..2833bd41f 100644 --- a/.github/workflows/python-wheels.yml +++ b/.github/workflows/python-wheels.yml @@ -36,9 +36,13 @@ concurrency: group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} cancel-in-progress: true +env: + # At GEOS updated to 3.14.0 + VCPKG_REF: 5a01de756c28279ddfdd2b061d1c75710a6255fa + jobs: windows-x86_64: - runs-on: windows-latest + runs-on: windows-2022 steps: - uses: actions/checkout@v4 @@ -61,18 +65,9 @@ jobs: uses: actions/checkout@v4 with: repository: microsoft/vcpkg - ref: "2025.06.13" + ref: ${{ env.VCPKG_REF }} path: vcpkg - - name: Bootstrap vcpkg - shell: bash - env: - VCPKG_ROOT: ${{ github.workspace }}/vcpkg - VCPKG_DEFAULT_TRIPLET: x64-windows-dynamic-release - run: | - cd ci/scripts - ./wheels-bootstrap-vcpkg.sh - - name: Build and test wheels (sedonadb) run: | cd ci/scripts @@ -80,6 +75,7 @@ jobs: env: VCPKG_ROOT: ${{ github.workspace }}/vcpkg VCPKG_DEFAULT_TRIPLET: x64-windows-dynamic-release + CMAKE_TOOLCHAIN_FILE: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake CIBW_BUILD: "*-win_amd64" CIBW_TEST_SKIP: "cp314* cp38*" CIBW_TEST_REQUIRES: pytest adbc_driver_manager geoarrow-pyarrow geopandas @@ -114,7 +110,7 @@ jobs: uses: actions/checkout@v4 with: repository: microsoft/vcpkg - ref: "2025.06.13" + ref: ${{ env.VCPKG_REF }} path: vcpkg - name: Build and test wheels (sedonadb) @@ -139,8 +135,7 @@ jobs: matrix: config: - {os: "ubuntu-latest", label: "linux-x86_64", arch: "x86_64"} - # We can't include this in our CI config until the repository is public - # - {os: "ubuntu-24.04-arm", label: "linux-arm64", arch: "aarch64"} + - {os: "ubuntu-24.04-arm", label: "linux-arm64", arch: "aarch64"} steps: - uses: actions/checkout@v4 @@ -168,3 +163,31 @@ jobs: with: name: release-wheels-${{ matrix.config.label }} path: python/sedonadb/dist/*.whl + + upload_nightly: + needs: ["wheels-linux", "macOS-arm64",
"windows-x86_64"] + name: Upload nightly packages + runs-on: "macos-latest" + steps: + - uses: actions/download-artifact@v4 + with: + pattern: release-* + merge-multiple: true + path: dist + + - name: Install gemfury client + run: | + brew tap gemfury/tap + brew install fury-cli + fury --version + + - name: Upload packages to Gemfury + if: github.repository == 'apache/sedona-db' && github.ref == 'refs/heads/main' + shell: bash + run: | + fury push \ + --api-token=${GEMFURY_PUSH_TOKEN} \ + --as="sedona-nightlies" \ + dist/* + env: + GEMFURY_PUSH_TOKEN: ${{ secrets.GEMFURY_PUSH_TOKEN }} diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 9ef7d1633..aca83b2f0 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -21,6 +21,13 @@ on: pull_request: branches: - main + paths: + - 'Cargo.toml' + - 'Cargo.lock' + - '.github/workflows/python.yml' + - 'rust/**' + - 'c/**' + - 'python/**' push: branches: - main @@ -36,6 +43,10 @@ defaults: run: shell: bash -l -eo pipefail {0} +env: + # At GEOS updated to 3.14.0 + VCPKG_REF: 5a01de756c28279ddfdd2b061d1c75710a6255fa + jobs: test: strategy: @@ -55,6 +66,39 @@ jobs: python-version: '3.x' cache: 'pip' + - name: Clone vcpkg + uses: actions/checkout@v4 + with: + repository: microsoft/vcpkg + ref: ${{ env.VCPKG_REF }} + path: vcpkg + + - name: Set up environment variables and bootstrap vcpkg + env: + VCPKG_ROOT: ${{ github.workspace }}/vcpkg + CMAKE_TOOLCHAIN_FILE: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake + run: | + cd vcpkg + ./bootstrap-vcpkg.sh + cd ..
+ + echo "VCPKG_ROOT=$VCPKG_ROOT" >> $GITHUB_ENV + echo "PATH=$VCPKG_ROOT:$PATH" >> $GITHUB_ENV + echo "CMAKE_TOOLCHAIN_FILE=$CMAKE_TOOLCHAIN_FILE" >> $GITHUB_ENV + + - name: Cache vcpkg binaries + id: cache-vcpkg + uses: actions/cache@v4 + with: + path: vcpkg/packages + # Bump the number at the end of this line to force a new dependency build + key: vcpkg-installed-${{ runner.os }}-${{ runner.arch }}-${{ env.VCPKG_REF }}-1 + + - name: Install vcpkg dependencies + if: steps.cache-vcpkg.outputs.cache-hit != 'true' + run: | + ./vcpkg/vcpkg install abseil openssl + - name: Use stable Rust id: rust run: | @@ -72,6 +116,7 @@ jobs: - name: Install run: | + export MATURIN_PEP517_ARGS="--features s2geography" pip install -e "python/sedonadb/[test]" -vv - name: Download minimal geoarrow-data assets diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 27b28858e..1c5721857 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,6 +41,10 @@ defaults: run: shell: bash -l -eo pipefail {0} +env: + # At GEOS updated to 3.14.0 + VCPKG_REF: 5a01de756c28279ddfdd2b061d1c75710a6255fa + jobs: rust: strategy: @@ -65,7 +69,7 @@ jobs: uses: actions/checkout@v4 with: repository: microsoft/vcpkg - ref: "2025.06.13" + ref: ${{ env.VCPKG_REF }} path: vcpkg - name: Set up environment variables and bootstrap vcpkg @@ -87,7 +91,7 @@ jobs: with: path: vcpkg/packages # Bump the number at the end of this line to force a new dependency build - key: vcpkg-installed-${{ runner.os }}-${{ runner.arch }}-2 + key: vcpkg-installed-${{ runner.os }}-${{ runner.arch }}-${{ env.VCPKG_REF }}-1 - name: Install vcpkg dependencies if: steps.cache-vcpkg.outputs.cache-hit != 'true' diff --git a/c/.gitignore b/c/.gitignore index e3e7ca16d..c46e252aa 100644 --- a/c/.gitignore +++ b/c/.gitignore @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+ build/ dist/ .cache +CMakeUserPresets.json diff --git a/c/sedona-s2geography/CMakeLists.txt b/c/sedona-s2geography/CMakeLists.txt index b6625dde1..06cd06352 100644 --- a/c/sedona-s2geography/CMakeLists.txt +++ b/c/sedona-s2geography/CMakeLists.txt @@ -251,32 +251,60 @@ install(FILES "${CMAKE_BINARY_DIR}/openssl_libraries.txt" # .a file (but this might not work if the absl libraries weren't static, # as they aren't on Homebrew and linux distributions). -if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(LINK_CXX_STANDARD_LIB "-lc++") -elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU") - set(LINK_CXX_STANDARD_LIB "-lstdc++") -elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - # set MSVC-specific flags if we need them -else() - set(LINK_CXX_STANDARD_LIB "") -endif() +if(NOT WIN32) + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(LINK_CXX_STANDARD_LIB "-lc++") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + set(LINK_CXX_STANDARD_LIB "-lstdc++") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + # set MSVC-specific flags if we need them + set(LINK_CXX_STANDARD_LIB "") + else() + set(LINK_CXX_STANDARD_LIB "") + endif() -set(CMAKE_ECHO_STANDARD_LIBRARIES ${CMAKE_CXX_STANDARD_LIBRARIES}) -set(CMAKE_ECHO_FLAGS ${CMAKE_CXX_FLAGS}) -set(CMAKE_ECHO_LINK_FLAGS ${CMAKE_CXX_LINK_FLAGS}) -set(CMAKE_ECHO_IMPLICIT_LINK_DIRECTORIES ${CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES}) -# TODO: This won't work on Windows -set(CMAKE_ECHO_LINK_EXECUTABLE - "sh -c \"echo ${LINK_CXX_STANDARD_LIB} > \"" -) + set(CMAKE_ECHO_STANDARD_LIBRARIES ${CMAKE_CXX_STANDARD_LIBRARIES}) + set(CMAKE_ECHO_FLAGS ${CMAKE_CXX_FLAGS}) + set(CMAKE_ECHO_LINK_FLAGS ${CMAKE_CXX_LINK_FLAGS}) + set(CMAKE_ECHO_IMPLICIT_LINK_DIRECTORIES ${CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES}) -add_executable(linker_flags "CMakeLists.txt") -target_link_libraries(linker_flags - OpenSSL::SSL - OpenSSL::Crypto - ${S2_EXTRA_OPENSSL_LIBS} - ${ABSL_LIBRARIES}) + set(CMAKE_ECHO_LINK_EXECUTABLE + "sh -c \"echo ${LINK_CXX_STANDARD_LIB} > \"" + ) 
-set_target_properties(linker_flags PROPERTIES LINKER_LANGUAGE ECHO SUFFIX ".txt") + add_executable(linker_flags "CMakeLists.txt") + target_link_libraries(linker_flags + OpenSSL::SSL + OpenSSL::Crypto + ${S2_EXTRA_OPENSSL_LIBS} + ${ABSL_LIBRARIES}) -install(TARGETS linker_flags DESTINATION "${CMAKE_INSTALL_LIBDIR}") + set_target_properties(linker_flags PROPERTIES LINKER_LANGUAGE ECHO SUFFIX ".txt") + + install(TARGETS linker_flags DESTINATION "${CMAKE_INSTALL_LIBDIR}") + +else() + # On Windows, MSBuild will write this file for us, but we have to look in a very specific place + # to find it. This is possibly brittle but makes it possible to build this on Windows at all. + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/linker_flags.cc" + " +int main(int argc, const char** args) { + return 0; +}") + add_executable(linker_flags "${CMAKE_CURRENT_BINARY_DIR}/linker_flags.cc") + target_link_libraries(linker_flags + OpenSSL::SSL + OpenSSL::Crypto + ${S2_EXTRA_OPENSSL_LIBS} + ${ABSL_LIBRARIES}) + + add_custom_command(TARGET linker_flags + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + "${CMAKE_CURRENT_BINARY_DIR}/linker_flags.dir/$/linker_flags.tlog/link.command.1.tlog" + "${CMAKE_CURRENT_BINARY_DIR}/$/linker_flags.txt" + COMMENT "Copying linker command file for configuration $") + + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/$/linker_flags.txt" + DESTINATION "${CMAKE_INSTALL_LIBDIR}") +endif() diff --git a/c/sedona-s2geography/build.rs b/c/sedona-s2geography/build.rs index 49d1485be..6eadb11d7 100644 --- a/c/sedona-s2geography/build.rs +++ b/c/sedona-s2geography/build.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. use std::{ + collections::HashSet, env, path::{Path, PathBuf}, }; @@ -31,10 +32,24 @@ fn main() { // Link the libraries that are easy to enumerate by hand and whose location // we control in CMakeLists.txt. 
- println!( - "cargo:rustc-link-search=native={}", - dst.join("lib").display() - ); + let mut lib_dirs = [ + "geography_glue", + "s2geography", + "s2", + "geoarrow", + "nanoarrow_static", + ] + .map(|lib| find_lib_dir(&dst, lib)) + .into_iter() + .collect::>() + .into_iter() + .collect::>(); + + lib_dirs.sort(); + for lib_dir in lib_dirs { + println!("cargo:rustc-link-search=native={}", lib_dir.display()); + } + println!("cargo:rustc-link-lib=static=geography_glue"); println!("cargo:rustc-link-lib=static=s2geography"); println!("cargo:rustc-link-lib=static=s2"); @@ -62,23 +77,28 @@ fn main() { fn parse_cmake_linker_flags(binary_dir: &Path) { // e.g., libabsl_base.a - let re_lib = Regex::new("^(lib|)([^.]+).*?(lib|a|dylib|so|dll)$").unwrap(); + let re_lib = Regex::new("^(lib|)([^.]+).*?(LIB|lib|a|dylib|so|dll)$").unwrap(); // e.g., -L/path/to/lib (CMake doesn't usually output this, preferring instead // to pass the full path to the library) let re_linker_dir = Regex::new("^-L(.*)").unwrap(); // e.g., -lstdc++ let re_linker_lib = Regex::new("^-l(.*)").unwrap(); - let path = binary_dir.join("lib").join("linker_flags.txt"); - let values = std::fs::read_to_string(path).expect("Read linker_flags.txt"); + + let path = find_cmake_linker_flags(binary_dir); + let linker_flags_string = read_file_maybe_utf16(&path); // Print out the whole thing for debugging failures - println!("Parsing CMake linker flags: {values}"); + println!("Parsing CMake linker flags: {linker_flags_string}"); let mut last_lib_dir = "".to_string(); // Split flags on whitespace. This probably won't work if library paths // contain spaces.
- for item in values.split_whitespace() { + for item in linker_flags_string.split_whitespace() { + if item.is_empty() { + continue; + } + if let Some(dir_match) = re_linker_dir.captures(item) { let (_, [dir]) = dir_match.extract(); println!("cargo:rustc-link-search=native={dir}"); @@ -89,8 +109,12 @@ fn parse_cmake_linker_flags(binary_dir: &Path) { continue; } - // Try to interpret as a path to a library. CMake loves to do this. - let mut path = PathBuf::from(item); + // Try to interpret as a path to a library. CMake loves to do this. It might be quoted (Windows) + let mut path = if item.starts_with('"') && item.ends_with('"') { + PathBuf::from(item[1..(item.len() - 1)].to_string()) + } else { + PathBuf::from(item) + }; // If it's a relative path, it's relative to the binary directory if path.is_relative() { @@ -108,7 +132,8 @@ fn parse_cmake_linker_flags(binary_dir: &Path) { } match suffix { - "a" | "lib" => println!("cargo:rustc-link-lib=static={lib}"), + "lib" | "LIB" => println!("cargo:rustc-link-lib={lib}"), + "a" => println!("cargo:rustc-link-lib=static={lib}"), _ => println!("cargo:rustc-link-lib=dylib={lib}"), } } @@ -119,3 +144,69 @@ fn parse_cmake_linker_flags(binary_dir: &Path) { } } } + +fn find_cmake_linker_flags(binary_dir: &Path) -> PathBuf { + // Usually lib but could be lib64 (e.g., the Linux used for wheel builds) + let possible_lib_dirs = ["lib", "lib64", "build/Release"]; + for possible_lib in possible_lib_dirs { + let path = binary_dir.join(possible_lib).join("linker_flags.txt"); + if path.exists() { + return path; + } + } + + panic!( + "Can't find linker_flags.txt output at {}", + binary_dir.to_string_lossy() + ) +} + +fn find_lib_dir(binary_dir: &Path, lib_file: &str) -> PathBuf { + // Usually lib but could be lib64 (e.g., the Linux used for wheel builds) + let possible_lib_dirs = ["lib", "lib64", "build/Release"]; + for possible_lib in possible_lib_dirs { + let path = binary_dir.join(possible_lib); + let static_lib_posix = 
path.join(format!("lib{lib_file}.a")); + let static_lib_windows = path.join(format!("{lib_file}.lib")); + if static_lib_posix.exists() || static_lib_windows.exists() { + return path; + } + } + + panic!( + "Can't find library dir for static library '{lib_file}' output at {}", + binary_dir.to_string_lossy() + ) +} + +// Linker flags scraped from MSBuild are UTF-16 with a byte order mark; linker flags scraped otherwise +// are system encoding (likely UTF-8 or compatible). +fn read_file_maybe_utf16(path: &PathBuf) -> String { + let linker_flags_bytes = std::fs::read(path).expect("Read linker_flags.txt"); + + // Check if the first two bytes are UTF-16 BOM (0xFF 0xFE or 0xFE 0xFF) + if linker_flags_bytes.len() >= 2 + && ((linker_flags_bytes[0] == 0xFF && linker_flags_bytes[1] == 0xFE) + || (linker_flags_bytes[0] == 0xFE && linker_flags_bytes[1] == 0xFF)) + { + // Determine endianness from BOM + let is_le = linker_flags_bytes[0] == 0xFF; + + // Skip the BOM and convert the rest + let u16_bytes = &linker_flags_bytes[2..]; + let u16_vec: Vec = u16_bytes + .chunks_exact(2) + .map(|chunk| { + if is_le { + u16::from_le_bytes([chunk[0], chunk[1]]) + } else { + u16::from_be_bytes([chunk[0], chunk[1]]) + } + }) + .collect(); + + String::from_utf16_lossy(&u16_vec).to_string() + } else { + String::from_utf8_lossy(&linker_flags_bytes).to_string() + } +} diff --git a/c/sedona-s2geography/src/s2geography.rs b/c/sedona-s2geography/src/s2geography.rs index 731c6fd1b..f02b6353f 100644 --- a/c/sedona-s2geography/src/s2geography.rs +++ b/c/sedona-s2geography/src/s2geography.rs @@ -297,16 +297,14 @@ mod test { fn scalar_udf_errors() { let mut udf = S2ScalarUDF::Length(); let err = udf.init(Fields::empty(), None).unwrap_err(); - assert_eq!( - err.to_string(), - "Invalid argument: Expected one argument in unary s2geography UDF" - ); + assert!(err + .to_string() + .contains("Expected one argument in unary s2geography UDF")); let err = udf.execute(&[]).unwrap_err(); - assert_eq!( - 
err.to_string(), - "Invalid argument: Expected one argument/one argument type in in unary s2geography UDF" - ); + assert!(err + .to_string() + .contains("Expected one argument/one argument type in in unary s2geography UDF")); } #[test] diff --git a/c/sedona-s2geography/src/scalar_kernel.rs b/c/sedona-s2geography/src/scalar_kernel.rs index c5ab78c8c..361bdb010 100644 --- a/c/sedona-s2geography/src/scalar_kernel.rs +++ b/c/sedona-s2geography/src/scalar_kernel.rs @@ -14,7 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -use std::sync::Arc; +use std::{iter::zip, sync::Arc}; use arrow_schema::DataType; use datafusion_common::{Result, ScalarValue}; @@ -218,8 +218,13 @@ impl SedonaScalarKernel for S2ScalarKernel { ) -> Result { let mut inner = (self.inner_factory)(); + let arg_types_if_null = self.matcher.types_if_null(arg_types)?; + let args_casted_null = zip(args, &arg_types_if_null) + .map(|(arg, type_if_null)| arg.cast_to(type_if_null.storage_type(), None)) + .collect::>>()?; + // S2's scalar UDFs operate on fields with extension metadata - let arg_fields = arg_types + let arg_fields = arg_types_if_null .iter() .map(|arg_type| arg_type.to_storage_field("", true)) .collect::>>()?; @@ -228,7 +233,7 @@ impl SedonaScalarKernel for S2ScalarKernel { let out_ffi_schema = inner.init(arg_fields.into(), None)?; // Create arrays from each argument (scalars become arrays of size 1) - let arg_arrays = args + let arg_arrays = args_casted_null .iter() .map(|arg| match arg { ColumnarValue::Array(array) => Ok(array.clone()), @@ -299,6 +304,10 @@ mod test { .invoke_wkb_scalar(Some("LINESTRING (0 0, 0 1)")) .unwrap(); assert_eq!(result, ScalarValue::Float64(Some(111195.10117748393))); + + // Null scalar -> Null + let result = tester.invoke_scalar(ScalarValue::Null).unwrap(); + assert_eq!(result, ScalarValue::Float64(None)); } #[rstest] @@ -338,6 +347,12 @@ mod test { 
.invoke_scalar_scalar(polygon_scalar, point_scalar) .unwrap(); assert_eq!(result, ScalarValue::Boolean(Some(true))); + + // Null scalars -> Null + let result = tester + .invoke_scalar_scalar(ScalarValue::Null, ScalarValue::Null) + .unwrap(); + assert_eq!(result, ScalarValue::Boolean(None)); } #[test] diff --git a/c/sedona-tg/src/tg/tg.c b/c/sedona-tg/src/tg/tg.c index bdeb8d6af..b78fe56a4 100644 --- a/c/sedona-tg/src/tg/tg.c +++ b/c/sedona-tg/src/tg/tg.c @@ -20,6 +20,17 @@ #include #include +// See https://github.com/tidwall/tg/issues/15 for upstream resolution +#if defined(_MSC_VER) +#undef __BYTE_ORDER__ +#undef __ORDER_LITTLE_ENDIAN__ +#undef __ORDER_BIG_ENDIAN__ + +#define __BYTE_ORDER__ 1 +#define __ORDER_LITTLE_ENDIAN__ 1 +#define __ORDER_BIG_ENDIAN__ 0 +#endif + /****************************************************************************** Implementation Notes: diff --git a/ci/scripts/.gitignore b/ci/scripts/.gitignore new file mode 100644 index 000000000..400987c7d --- /dev/null +++ b/ci/scripts/.gitignore @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Ignore the nasm compiler downloaded when building Windows wheels +nasm-* diff --git a/ci/scripts/custom-triplets/x64-windows-dynamic-release.cmake b/ci/scripts/custom-triplets/x64-windows-dynamic-release.cmake index 7157a03b1..eda7718a7 100644 --- a/ci/scripts/custom-triplets/x64-windows-dynamic-release.cmake +++ b/ci/scripts/custom-triplets/x64-windows-dynamic-release.cmake @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + set(VCPKG_TARGET_ARCHITECTURE x64) set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE dynamic) diff --git a/ci/scripts/wheels-bootstrap-vcpkg.sh b/ci/scripts/wheels-bootstrap-vcpkg.sh index c70a0a25a..b564d3fa5 100755 --- a/ci/scripts/wheels-bootstrap-vcpkg.sh +++ b/ci/scripts/wheels-bootstrap-vcpkg.sh @@ -32,8 +32,14 @@ else export PATH="${VCPKG_ROOT}/installed/${VCPKG_DEFAULT_TRIPLET}/tools/geos/bin:${PATH}" pushd ${VCPKG_ROOT} + + # If we have an explicitly requested reference, ensure it is checked out + if [ ! 
-z "${VCPKG_REF}" ]; then + git checkout ${VCPKG_REF} + fi + ./bootstrap-vcpkg.sh - ./vcpkg install --overlay-triplets="${SEDONADB_DIR}/ci/scripts/custom-triplets" geos + ./vcpkg install --overlay-triplets="${SEDONADB_DIR}/ci/scripts/custom-triplets" geos abseil openssl popd export CMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" diff --git a/ci/scripts/wheels-build-linux.sh b/ci/scripts/wheels-build-linux.sh index 3259b6adb..00014956d 100755 --- a/ci/scripts/wheels-build-linux.sh +++ b/ci/scripts/wheels-build-linux.sh @@ -40,7 +40,7 @@ fi # manylinux is AlmaLinux/Fedora-based, musllinux is Alpine-based # If we want musllinux support there will be some workshopping required (vcpkg # needs some newer components than are provided by the default musllinux image) -BEFORE_ALL_MANYLINUX="yum install -y curl zip unzip tar clang" +BEFORE_ALL_MANYLINUX="yum install -y curl zip unzip tar clang perl" # This approach downloads and builds native dependencies with vcpkg once for every image. # Compared to the Rust build time, the native dependency build time is not too bad. We could @@ -48,9 +48,8 @@ BEFORE_ALL_MANYLINUX="yum install -y curl zip unzip tar clang" # add quite a bit of complexity but could save time if we build wheels for linux frequently. # The native and Rust builds are cached on each image such that compile work is effectively # cached between Python versions (just not between invocations of this script). 
-export CIBW_ENVIRONMENT_LINUX="VCPKG_ROOT=/vcpkg VCPKG_DEFAULT_TRIPLET=$VCPKG_DEFAULT_TRIPLET CMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake PKG_CONFIG_PATH=/vcpkg/installed/$VCPKG_DEFAULT_TRIPLET/lib/pkgconfig LD_LIBRARY_PATH=/vcpkg/installed/$VCPKG_DEFAULT_TRIPLET/lib" +export CIBW_ENVIRONMENT_LINUX="VCPKG_ROOT=/vcpkg VCPKG_REF=$VCPKG_REF VCPKG_DEFAULT_TRIPLET=$VCPKG_DEFAULT_TRIPLET CMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake PKG_CONFIG_PATH=/vcpkg/installed/$VCPKG_DEFAULT_TRIPLET/lib/pkgconfig LD_LIBRARY_PATH=/vcpkg/installed/$VCPKG_DEFAULT_TRIPLET/lib MATURIN_PEP517_ARGS='--features s2geography'" export CIBW_BEFORE_ALL="$BEFORE_ALL_MANYLINUX && git clone https://github.com/microsoft/vcpkg.git /vcpkg && bash {package}/../../ci/scripts/wheels-bootstrap-vcpkg.sh" pushd "${SEDONADB_DIR}" python -m cibuildwheel --platform linux --archs ${ARCH} --output-dir python/$2/dist python/$2 -popd diff --git a/ci/scripts/wheels-build-macos.sh b/ci/scripts/wheels-build-macos.sh index 4ce86c42f..54315cec1 100755 --- a/ci/scripts/wheels-build-macos.sh +++ b/ci/scripts/wheels-build-macos.sh @@ -43,8 +43,7 @@ source ./wheels-bootstrap-vcpkg.sh export CIBW_REPAIR_WHEEL_COMMAND_MACOS="DYLD_LIBRARY_PATH=$VCPKG_INSTALL_NAME_DIR delocate-listdeps {wheel} && DYLD_LIBRARY_PATH=$VCPKG_INSTALL_NAME_DIR delocate-wheel --require-archs {delocate_archs} -w {dest_dir} {wheel}" # Pass on environment variables specifically for the build -export CIBW_ENVIRONMENT_MACOS="$CIBW_ENVIRONMENT_MACOS MACOSX_DEPLOYMENT_TARGET=12.0 CMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}" +export CIBW_ENVIRONMENT_MACOS="$CIBW_ENVIRONMENT_MACOS _PYTHON_HOST_PLATFORM=macosx-12.0-arm64 MACOSX_DEPLOYMENT_TARGET=12.0 CMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} MATURIN_PEP517_ARGS='--features s2geography'" pushd "${SEDONADB_DIR}" python -m cibuildwheel --output-dir python/$1/dist python/$1 -popd diff --git a/ci/scripts/wheels-build-windows.ps1 b/ci/scripts/wheels-build-windows.ps1 index 
7e0664ab4..001664a9b 100644 --- a/ci/scripts/wheels-build-windows.ps1 +++ b/ci/scripts/wheels-build-windows.ps1 @@ -16,13 +16,54 @@ # under the License. # If running locally: -# $env:VCPKG_ROOT="C:\Users\dewey\Documents\rscratch\vcpkg" +# $env:VCPKG_ROOT="C:\Users\dewey\Documents\gh\vcpkg" # $env:VCPKG_DEFAULT_TRIPLET="x64-windows-dynamic-release" # $env:CIBW_BUILD="cp311-win_amd64" +$originalDirectory = Get-Location $scriptDirectory = Split-Path -Parent $MyInvocation.MyCommand.Path -$vcpkgBinDirectory = "$env:VCPKG_ROOT\installed\$env:VCPKG_DEFAULT_TRIPLET\bin" -$vcpkgLibDirectory = "$env:VCPKG_ROOT\installed\$env:VCPKG_DEFAULT_TRIPLET\lib" +$vcpkgInstalledDirectory = "$env:VCPKG_ROOT\installed\$env:VCPKG_DEFAULT_TRIPLET" +$vcpkgBinDirectory = "$vcpkgInstalledDirectory\bin" +$vcpkgLibDirectory = "$vcpkgInstalledDirectory\lib" + +# Ensure vcpkg +try { + Push-Location "$env:VCPKG_ROOT" + .\bootstrap-vcpkg + .\vcpkg --overlay-triplets="${scriptDirectory}/custom-triplets" install geos abseil openssl + Pop-Location +} +finally { + # Restore the original working directory + Set-Location -Path $originalDirectory +} + +# Download and extract NASM if it doesn't exist +# On Windows, NASM is required for AWS Rust dependencies +$NASM_URL = "https://www.nasm.us/pub/nasm/releasebuilds/2.16.03/win64/nasm-2.16.03-win64.zip" +$NASM_DIR = "$scriptDirectory\nasm-2.16.03" +$NASM_ZIP = "$scriptDirectory\nasm.zip" + +if (-not (Test-Path $NASM_DIR)) { + Write-Host "Downloading NASM to $NASM_DIR..." 
+ New-Item -Path $NASM_DIR -ItemType Directory -Force | Out-Null + + # Download the NASM zip file + Invoke-WebRequest -Uri $NASM_URL -OutFile $NASM_ZIP + + # Extract the zip file + Expand-Archive -Path $NASM_ZIP -DestinationPath $scriptDirectory -Force + + # Clean up the zip file + Remove-Item -Path $NASM_ZIP -Force + + Write-Host "NASM downloaded and extracted to $NASM_DIR" +} else { + Write-Host "NASM directory already exists at $NASM_DIR" +} + +# Add NASM to PATH +$env:PATH += ";$NASM_DIR" # Put here/windows on PATH for our fake pkg-config and geos-config executables $env:PATH += ";$scriptDirectory\windows" @@ -31,10 +72,15 @@ $env:PATH += ";$scriptDirectory\windows" # (well, specifically our dummy geos-config) the information it needs to build bindings $env:GEOS_LIB_DIR = "$vcpkgLibDirectory" $env:GEOS_VERSION = "3.13.0" -$originalDirectory = Get-Location + +# Some CMake configurations need this separately from the toolchain file +$env:CMAKE_PREFIX_PATH="$vcpkgInstalledDirectory" +$env:OPENSSL_ROOT_DIR="$vcpkgInstalledDirectory" # Use delvewheel to copy any required dependencies from vcpkg into the wheel -$env:CIBW_REPAIR_WHEEL_COMMAND_WINDOWS="delvewheel repair -v --add-path=$vcpkgBinDirectory --wheel-dir={dest_dir} {wheel}" +# combase.dll seems to be required; however, it causes errors when copied into the wheel +# This likely means that the wheel won't work on Windows 7.
+$env:CIBW_REPAIR_WHEEL_COMMAND_WINDOWS="delvewheel repair -v --exclude=combase.dll --add-path=$vcpkgBinDirectory --wheel-dir={dest_dir} {wheel}" # Quality of life: don't change the working directory of the calling script even when it fails $parentDirectory = Split-Path -Parent (Split-Path -Parent $scriptDirectory) @@ -50,6 +96,7 @@ try { Push-Location "$parentDirectory" python -m cibuildwheel --output-dir python\sedonadb\dist python\sedonadb + Pop-Location } finally { # Restore the original working directory diff --git a/python/sedonadb/Cargo.toml b/python/sedonadb/Cargo.toml index bfb868b0a..9fa82a563 100644 --- a/python/sedonadb/Cargo.toml +++ b/python/sedonadb/Cargo.toml @@ -28,6 +28,7 @@ crate-type = ["cdylib"] [features] default = ["mimalloc"] mimalloc = ["dep:mimalloc", "dep:libmimalloc-sys"] +s2geography = ["sedona/s2geography"] [dependencies] adbc_core = { workspace = true } @@ -38,7 +39,7 @@ datafusion = { workspace = true } datafusion-common = { workspace = true } datafusion-ffi = { workspace = true } futures = { workspace = true } -pyo3 = "0.25.1" +pyo3 = { version = "0.25.1", features = ["extension-module"] } sedona = { path = "../../rust/sedona" } sedona-adbc = { path = "../../rust/sedona-adbc" } sedona-schema = { path = "../../rust/sedona-schema" } diff --git a/rust/sedona-expr/src/scalar_udf.rs b/rust/sedona-expr/src/scalar_udf.rs index ab3491ad0..0820cf7d0 100644 --- a/rust/sedona-expr/src/scalar_udf.rs +++ b/rust/sedona-expr/src/scalar_udf.rs @@ -14,7 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-use std::{any::Any, fmt::Debug, sync::Arc}; +use std::{any::Any, fmt::Debug, iter::zip, sync::Arc}; use arrow_schema::{DataType, FieldRef}; use datafusion_common::{not_impl_err, plan_err, Result, ScalarValue}; @@ -23,7 +23,7 @@ use datafusion_expr::{ Volatility, }; use sedona_common::sedona_internal_err; -use sedona_schema::datatypes::{Edges, SedonaType}; +use sedona_schema::datatypes::{Edges, SedonaType, WKB_GEOGRAPHY, WKB_GEOMETRY}; pub type ScalarKernelRef = Arc; @@ -180,6 +180,31 @@ impl ArgMatcher { arg_iter.next().is_none() } + /// Calls each [TypeMatcher]'s `type_if_null()` + /// + /// This method errors if one or more matchers does not have an + /// unambiguous castable-from-null storage type. It is provided + /// as a utility for generic kernel implementations that rely on + /// the matcher to sanitize input that may contain literal nulls. + pub fn types_if_null(&self, args: &[SedonaType]) -> Result> { + let mut out = Vec::new(); + for (arg, matcher) in zip(args, &self.matchers) { + if let SedonaType::Arrow(DataType::Null) = arg { + if let Some(type_if_null) = matcher.type_if_null() { + out.push(type_if_null); + } else { + return sedona_internal_err!( + "Matcher {matcher:?} does not provide type_if_null()" + ); + } + } else { + out.push(arg.clone()); + } + } + + Ok(out) + } + /// Matches any argument pub fn is_any() -> Arc { Arc::new(IsAny {}) @@ -240,11 +265,28 @@ impl ArgMatcher { } } +/// A TypeMatcher is a predicate on a [SedonaType] +/// +/// TypeMatchers are the building blocks of an [ArgMatcher] that +/// represent a single argument. This is a generalization of the +/// DataFusion [Signature] which does not currently consider +/// extension types and/or how extension arrays might be casted +/// to conform to a function with a given signature. 
pub trait TypeMatcher: Debug { + /// Returns true if this matcher matches a type fn match_type(&self, arg: &SedonaType) -> bool; + + /// If this argument is optional, return true fn is_optional(&self) -> bool { false } + + /// Return the type to which an argument should be casted, + /// if applicable. This can be used to generalize null handling + /// or casting. + fn type_if_null(&self) -> Option { + None + } } #[derive(Debug)] @@ -265,6 +307,10 @@ impl TypeMatcher for IsExact { fn match_type(&self, arg: &SedonaType) -> bool { self.exact_type.match_signature(arg) } + + fn type_if_null(&self) -> Option { + Some(self.exact_type.clone()) + } } #[derive(Debug)] @@ -280,6 +326,10 @@ impl TypeMatcher for OptionalMatcher { fn is_optional(&self) -> bool { true } + + fn type_if_null(&self) -> Option { + self.inner.type_if_null() + } } #[derive(Debug)] @@ -303,6 +353,10 @@ impl TypeMatcher for IsGeometry { _ => false, } } + + fn type_if_null(&self) -> Option { + Some(WKB_GEOMETRY) + } } #[derive(Debug)] @@ -317,6 +371,10 @@ impl TypeMatcher for IsGeography { _ => false, } } + + fn type_if_null(&self) -> Option { + Some(WKB_GEOGRAPHY) + } } #[derive(Debug)] @@ -329,6 +387,10 @@ impl TypeMatcher for IsNumeric { _ => false, } } + + fn type_if_null(&self) -> Option { + Some(SedonaType::Arrow(DataType::Float64)) + } } #[derive(Debug)] @@ -346,6 +408,10 @@ impl TypeMatcher for IsString { _ => false, } } + + fn type_if_null(&self) -> Option { + Some(SedonaType::Arrow(DataType::Utf8)) + } } #[derive(Debug)] @@ -360,6 +426,10 @@ impl TypeMatcher for IsBinary { _ => false, } } + + fn type_if_null(&self) -> Option { + Some(SedonaType::Arrow(DataType::Binary)) + } } #[derive(Debug)] @@ -374,6 +444,10 @@ impl TypeMatcher for IsBoolean { _ => false, } } + + fn type_if_null(&self) -> Option { + Some(SedonaType::Arrow(DataType::Boolean)) + } } #[derive(Debug)] @@ -594,30 +668,52 @@ mod tests { assert!(ArgMatcher::is_geometry_or_geography().match_type(&WKB_GEOGRAPHY)); 
assert!(!ArgMatcher::is_geometry_or_geography() .match_type(&SedonaType::Arrow(DataType::Binary))); + assert_eq!(ArgMatcher::is_geometry_or_geography().type_if_null(), None); assert!(ArgMatcher::is_geometry().match_type(&WKB_GEOMETRY)); assert!(!ArgMatcher::is_geometry().match_type(&WKB_GEOGRAPHY)); + assert_eq!(ArgMatcher::is_geometry().type_if_null(), Some(WKB_GEOMETRY)); assert!(ArgMatcher::is_geography().match_type(&WKB_GEOGRAPHY)); assert!(!ArgMatcher::is_geography().match_type(&WKB_GEOMETRY)); + assert_eq!( + ArgMatcher::is_geography().type_if_null(), + Some(WKB_GEOGRAPHY) + ); assert!(ArgMatcher::is_numeric().match_type(&SedonaType::Arrow(DataType::Int32))); assert!(ArgMatcher::is_numeric().match_type(&SedonaType::Arrow(DataType::Float64))); + assert_eq!( + ArgMatcher::is_numeric().type_if_null(), + Some(SedonaType::Arrow(DataType::Float64)) + ); assert!(ArgMatcher::is_string().match_type(&SedonaType::Arrow(DataType::Utf8))); assert!(ArgMatcher::is_string().match_type(&SedonaType::Arrow(DataType::Utf8View))); assert!(ArgMatcher::is_string().match_type(&SedonaType::Arrow(DataType::LargeUtf8))); assert!(!ArgMatcher::is_string().match_type(&SedonaType::Arrow(DataType::Binary))); + assert_eq!( + ArgMatcher::is_string().type_if_null(), + Some(SedonaType::Arrow(DataType::Utf8)) + ); assert!(ArgMatcher::is_binary().match_type(&SedonaType::Arrow(DataType::Binary))); assert!(ArgMatcher::is_binary().match_type(&SedonaType::Arrow(DataType::BinaryView))); assert!(!ArgMatcher::is_binary().match_type(&SedonaType::Arrow(DataType::Utf8))); + assert_eq!( + ArgMatcher::is_binary().type_if_null(), + Some(SedonaType::Arrow(DataType::Binary)) + ); assert!(ArgMatcher::is_boolean().match_type(&SedonaType::Arrow(DataType::Boolean))); assert!(!ArgMatcher::is_boolean().match_type(&SedonaType::Arrow(DataType::Int32))); assert!(ArgMatcher::is_null().match_type(&SedonaType::Arrow(DataType::Null))); assert!(!ArgMatcher::is_null().match_type(&SedonaType::Arrow(DataType::Int32))); + 
assert_eq!( + ArgMatcher::is_boolean().type_if_null(), + Some(SedonaType::Arrow(DataType::Boolean)) + ); } #[test]