diff --git a/Cargo.lock b/Cargo.lock index 94302bc2e62..7eb0ff08a22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5238,14 +5238,18 @@ dependencies = [ "base64 0.22.1", "bytes", "chrono", + "datafusion-functions", "futures", "google-cloud-auth", "lance", + "lance-arrow", "lance-core", "lance-index", "lance-io", "lance-namespace", + "lance-namespace-reqwest-client", "log", + "murmur3", "object_store", "rand 0.9.2", "reqwest", @@ -5266,8 +5270,7 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" +source = "git+https://github.com/wojiaodoubao/lance-namespace?branch=rest-table-properties#a122997e5ed74122686c3a4bd228b839d0d025eb" dependencies = [ "reqwest", "serde", @@ -5952,6 +5955,12 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "murmurhash32" version = "0.3.1" diff --git a/Cargo.toml b/Cargo.toml index 172b1238290..8162e75271a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,7 @@ lance-linalg = { version = "=3.0.0-beta.1", path = "./rust/lance-linalg" } lance-namespace = { version = "=3.0.0-beta.1", path = "./rust/lance-namespace" } lance-namespace-impls = { version = "=3.0.0-beta.1", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=3.0.0-beta.1", path = "./rust/lance-namespace-datafusion" } -lance-namespace-reqwest-client = { version = "=0.4.5" } +lance-namespace-reqwest-client = { git = "https://github.com/wojiaodoubao/lance-namespace", branch = "rest-table-properties" } lance-table = { version = "=3.0.0-beta.1", path = "./rust/lance-table" } lance-test-macros = { version = "=3.0.0-beta.1", path = "./rust/lance-test-macros" } lance-testing = { version = "=3.0.0-beta.1", path = "./rust/lance-testing" } diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index a1da8179320..b37e93fc3da 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3728,6 +3728,7 @@ dependencies = [ "lance-linalg", "lance-namespace", "lance-namespace-impls", + "lance-namespace-reqwest-client", "log", "object_store", "prost", @@ -3779,13 +3780,17 @@ dependencies = [ "axum", "bytes", "chrono", + "datafusion-functions", "futures", "lance", + "lance-arrow", "lance-core", "lance-index", "lance-io", "lance-namespace", + "lance-namespace-reqwest-client", "log", + "murmur3", "object_store", "rand 0.9.2", "reqwest", @@ -3801,8 +3806,7 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" +source = "git+https://github.com/wojiaodoubao/lance-namespace?branch=rest-table-properties#a122997e5ed74122686c3a4bd228b839d0d025eb" dependencies = [ "reqwest", "serde", @@ -4174,6 +4178,12 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "murmurhash32" version = "0.3.1" diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index bac5c4a8da7..e4269220912 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -24,6 +24,7 @@ lance-index = { path = "../../rust/lance-index" } lance-io = { path = "../../rust/lance-io" } lance-namespace = { path = "../../rust/lance-namespace" } lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter"] } +lance-namespace-reqwest-client = { git = "https://github.com/wojiaodoubao/lance-namespace", branch = "rest-table-properties" } lance-core = { path = "../../rust/lance-core" } lance-file = { path = "../../rust/lance-file" } arrow = { version = "57.1", features = ["ffi"] } @@ -60,4 +61,4 @@ incremental = false debug = false debug-assertions = false strip = "debuginfo" -incremental = false \ No newline at end of file +incremental = false diff --git a/java/lance-jni/src/namespace.rs b/java/lance-jni/src/namespace.rs index b9db171c064..1b5a5b05fc1 100644 --- a/java/lance-jni/src/namespace.rs +++ b/java/lance-jni/src/namespace.rs @@ -4,19 +4,27 @@ use std::collections::HashMap; use std::sync::Arc; +use arrow::datatypes::Schema as ArrowSchema; use bytes::Bytes; use jni::objects::{GlobalRef, JByteArray, JMap, JObject, JString, JValue}; use jni::sys::{jbyteArray, jlong, jstring}; use jni::JNIEnv; use lance_namespace::models::*; use lance_namespace::LanceNamespace as LanceNamespaceTrait; +use lance_namespace_impls::partition::{ + PartitionField, PartitionTable, PartitionedNamespace, PartitionedNamespaceBuilder, +}; use lance_namespace_impls::{ ConnectBuilder, DirectoryNamespace, DirectoryNamespaceBuilder, DynamicContextProvider, OperationInfo, RestAdapter, RestAdapterConfig, RestNamespace, RestNamespaceBuilder, }; +use lance_namespace_reqwest_client::models::{ + PartitionField as JsonPartitionField, PartitionSpec as JsonPartitionSpec, +}; use serde::{Deserialize, Serialize}; use crate::error::{Error, Result}; +use crate::traits::IntoJava; use crate::utils::to_rust_map; use crate::RT; @@ -126,6 +134,31 @@ pub struct BlockingRestNamespace { pub(crate) inner: RestNamespace, } +/// Blocking wrapper for PartitionedNamespace +pub struct BlockingPartitionedNamespace { + pub(crate) inner: PartitionedNamespace, +} + +#[derive(Debug, Serialize)] +struct JavaPartitionTable { + id: Vec<String>, + #[serde(skip_serializing_if = "Option::is_none")] + read_version: Option<u64>, +} + +#[derive(Debug, Serialize)] +struct JavaPlanScanItem { + table: JavaPartitionTable, + refine_expr: String, +} + +fn to_java_partition_table(t: &PartitionTable) -> JavaPartitionTable { + JavaPartitionTable { + id: t.id.clone(), + read_version: t.read_version, + } +} + // ============================================================================ // DirectoryNamespace JNI Functions // ============================================================================ @@ -1290,6 +1323,121 @@ fn call_namespace_query_method<'local>( Ok(byte_array) } +/// Helper function to call namespace methods that return a response object (PartitionedNamespace) +fn call_partitioned_namespace_method<'local, Req, Resp, F>( + env: &mut JNIEnv<'local>, + handle: jlong, + request_json: JString, + f: F, +) -> Result<JString<'local>> +where + Req: for<'de> Deserialize<'de>, + Resp: Serialize, + F: FnOnce(&BlockingPartitionedNamespace, Req) -> lance_core::Result<Resp>, +{ + let namespace = unsafe { &*(handle as *const
BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: Req = serde_json::from_str(&request_str) + .map_err(|e| Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + let response = f(namespace, request) + .map_err(|e| Error::runtime_error(format!("Namespace operation failed: {}", e)))?; + + let response_json = serde_json::to_string(&response) + .map_err(|e| Error::runtime_error(format!("Failed to serialize response: {}", e)))?; + + env.new_string(response_json).map_err(Into::into) +} + +/// Helper function for void methods (PartitionedNamespace) +fn call_partitioned_namespace_void_method<Req, F>( + env: &mut JNIEnv, + handle: jlong, + request_json: JString, + f: F, +) -> Result<()> +where + Req: for<'de> Deserialize<'de>, + F: FnOnce(&BlockingPartitionedNamespace, Req) -> lance_core::Result<()>, +{ + let namespace = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: Req = serde_json::from_str(&request_str) + .map_err(|e| Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + f(namespace, request) + .map_err(|e| Error::runtime_error(format!("Namespace operation failed: {}", e)))?; + + Ok(()) +} + +/// Helper function for count methods (PartitionedNamespace) +fn call_partitioned_namespace_count_method( + env: &mut JNIEnv, + handle: jlong, + request_json: JString, +) -> Result<jlong> { + let namespace = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: CountTableRowsRequest = serde_json::from_str(&request_str) + .map_err(|e| Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + let count = RT + .block_on(namespace.inner.count_table_rows(request)) + .map_err(|e| Error::runtime_error(format!("Count table rows failed: {}", e)))?; + + Ok(count) +} + +/// Helper function for methods with data parameter (PartitionedNamespace) +fn call_partitioned_namespace_with_data_method<'local, Req, Resp, F>( + env: &mut JNIEnv<'local>, + handle: jlong, + request_json: JString, + request_data: JByteArray, + f: F, +) -> Result<JString<'local>> +where + Req: for<'de> Deserialize<'de>, + Resp: Serialize, + F: FnOnce(&BlockingPartitionedNamespace, Req, Bytes) -> lance_core::Result<Resp>, +{ + let namespace = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: Req = serde_json::from_str(&request_str) + .map_err(|e| Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + let data_vec = env.convert_byte_array(request_data)?; + let data = bytes::Bytes::from(data_vec); + + let response = f(namespace, request, data) + .map_err(|e| Error::runtime_error(format!("Namespace operation failed: {}", e)))?; + + let response_json = serde_json::to_string(&response) + .map_err(|e| Error::runtime_error(format!("Failed to serialize response: {}", e)))?; + + env.new_string(response_json).map_err(Into::into) +} + +/// Helper function for query methods that return byte arrays (PartitionedNamespace) +fn call_partitioned_namespace_query_method<'local>( + env: &mut JNIEnv<'local>, + handle: jlong, + request_json: JString, +) -> Result<JByteArray<'local>> { + let namespace = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: QueryTableRequest = serde_json::from_str(&request_str) + .map_err(|e|
Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + let result_bytes = RT + .block_on(namespace.inner.query_table(request)) + .map_err(|e| Error::runtime_error(format!("Query table failed: {}", e)))?; + + let byte_array = env.byte_array_from_slice(&result_bytes)?; + Ok(byte_array) +} + /// Helper function to call namespace methods that return a response object (RestNamespace) fn call_rest_namespace_method<'local, Req, Resp, F>( env: &mut JNIEnv<'local>, @@ -1541,3 +1689,739 @@ pub extern "system" fn Java_org_lance_namespace_RestAdapter_releaseNative( } } } + +// ============================================================================ +// PartitionedNamespace JNI Functions +// ============================================================================ + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createNative( + mut env: JNIEnv, + _obj: JObject, + properties_map: JObject, +) -> jlong { + ok_or_throw_with_return!( + env, + create_partitioned_namespace_internal(&mut env, properties_map, None), + 0 + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createNativeWithProvider( + mut env: JNIEnv, + _obj: JObject, + properties_map: JObject, + context_provider: JObject, +) -> jlong { + ok_or_throw_with_return!( + env, + create_partitioned_namespace_internal(&mut env, properties_map, Some(context_provider),), + 0 + ) +} + +fn create_partitioned_namespace_internal( + env: &mut JNIEnv, + properties_map: JObject, + context_provider: Option<JObject>, +) -> Result<jlong> { + // Convert Java HashMap to Rust HashMap + let jmap = JMap::from_env(env, &properties_map)?; + let mut properties = to_rust_map(env, &jmap)?; + + // Use the same key as DirectoryNamespace to locate root. + if !properties.contains_key("root") { + if let Some(location) = properties.get("location").cloned() { + properties.insert("root".to_string(), location); + } + } + let location = properties.get("root").cloned().ok_or_else(|| { + Error::input_error("Missing 'root' (or 'location') in configProperties".to_string()) + })?; + + // Build DirectoryNamespace using properties so we can reuse storage options, credential vending, + // and the Java dynamic context provider. + let mut dir_builder = DirectoryNamespaceBuilder::from_properties(properties, None) + .map_err(|e| { + Error::runtime_error(format!("Failed to create DirectoryNamespaceBuilder: {}", e)) + })?
+ .manifest_enabled(true) + .dir_listing_enabled(false) + .inline_optimization_enabled(true); + + if let Some(provider_obj) = context_provider { + if !provider_obj.is_null() { + let java_provider = JavaDynamicContextProvider::new(env, &provider_obj)?; + dir_builder = dir_builder.context_provider(Arc::new(java_provider)); + } + } + + let directory = RT + .block_on(dir_builder.build()) + .map_err(|e| Error::runtime_error(format!("Failed to build DirectoryNamespace: {}", e)))?; + + let builder = PartitionedNamespaceBuilder::new(location).directory(directory); + let ns = RT + .block_on(builder.load()) + .map_err(|e| Error::runtime_error(format!("Failed to load PartitionedNamespace: {}", e)))?; + + Ok(Box::into_raw(Box::new(BlockingPartitionedNamespace { inner: ns })) as jlong) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_releaseNative( + _env: JNIEnv, + _obj: JObject, + handle: jlong, +) { + if handle != 0 { + unsafe { + let _ = Box::from_raw(handle as *mut BlockingPartitionedNamespace); + } + } +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_namespaceIdNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, +) -> jstring { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let namespace_id = ns.inner.namespace_id(); + ok_or_throw_with_return!( + env, + env.new_string(namespace_id).map_err(Error::from), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_listNamespacesNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.list_namespaces(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_describeNamespaceNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.describe_namespace(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createNamespaceNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.create_namespace(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_dropNamespaceNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.drop_namespace(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_namespaceExistsNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) { + ok_or_throw_without_return!( + env, + call_partitioned_namespace_void_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.namespace_exists(req)) + }) + ) +} + +#[no_mangle] +pub extern "system" fn 
Java_org_lance_namespace_PartitionedNamespace_listTablesNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.list_tables(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_describeTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.describe_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_registerTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.register_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_tableExistsNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) { + ok_or_throw_without_return!( + env, + call_partitioned_namespace_void_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.table_exists(req)) + }) + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_dropTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.drop_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_deregisterTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.deregister_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_countTableRowsNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jlong { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_count_method(&mut env, handle, request_json), + 0 + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, + request_data: JByteArray, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_with_data_method( + &mut env, + handle, + request_json, + request_data, + |ns, req, data| { RT.block_on(ns.inner.create_table(req, data)) } + ), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +#[allow(deprecated)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createEmptyTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.create_empty_table(req)) 
+ }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_declareTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.declare_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_insertIntoTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, + request_data: JByteArray, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_with_data_method( + &mut env, + handle, + request_json, + request_data, + |ns, req, data| { RT.block_on(ns.inner.insert_into_table(req, data)) } + ), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_mergeInsertIntoTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, + request_data: JByteArray, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_with_data_method( + &mut env, + handle, + request_json, + request_data, + |ns, req, data| { RT.block_on(ns.inner.merge_insert_into_table(req, data)) } + ), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_updateTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.update_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_deleteFromTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.delete_from_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_queryTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jbyteArray { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_query_method(&mut env, handle, request_json), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createTableIndexNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.create_table_index(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_listTableIndicesNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.list_table_indices(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn 
Java_org_lance_namespace_PartitionedNamespace_describeTableIndexStatsNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.describe_table_index_stats(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_describeTransactionNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.describe_transaction(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_alterTransactionNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.alter_transaction(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_schemaNative<'local>( + mut env: JNIEnv<'local>, + _obj: JObject, + handle: jlong, +) -> JObject<'local> { + ok_or_throw!(env, { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + ns.inner.schema().into_java(&mut env) + }) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_planScanNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + filter: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + (|| { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let filter: String = env.get_string(&filter)?.into(); + // Use the partitioned namespace schema to resolve column references. 
+ let arrow_schema: ArrowSchema = (&ns.inner.schema()).into(); + let expr = RT + .block_on( + lance_namespace_impls::partition::parse_filter_expr_from_sql( + &filter, + &arrow_schema, + ), + ) + .map_err(|e| Error::runtime_error(format!("Failed to parse filter SQL: {}", e)))?; + let planned = RT + .block_on(ns.inner.plan_scan(&expr)) + .map_err(|e| Error::runtime_error(format!("plan_scan failed: {}", e)))?; + + let items: Vec<JavaPlanScanItem> = planned + .into_iter() + .map(|(t, refine)| JavaPlanScanItem { + table: to_java_partition_table(&t), + refine_expr: refine.to_string(), + }) + .collect(); + + let json = serde_json::to_string(&items) + .map_err(|e| Error::runtime_error(format!("Failed to serialize plan: {}", e)))?; + Ok::<jstring, Error>(env.new_string(json)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_partitioningNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, +) -> jstring { + ok_or_throw_with_return!( + env, + (|| { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let partitioning = RT + .block_on(ns.inner.partitioning()) + .map_err(|e| Error::runtime_error(format!("partitioning failed: {}", e)))?; + let specs: Vec<JsonPartitionSpec> = + partitioning.all().iter().map(|s| s.to_json()).collect(); + let json = serde_json::to_string(&specs).map_err(|e| { + Error::runtime_error(format!("Failed to serialize partitioning: {}", e)) + })?; + Ok::<jstring, Error>(env.new_string(json)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_updatePartitionSpecNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + partition_fields_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + (|| { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let json: String = env.get_string(&partition_fields_json)?.into(); + let json_fields: Vec<JsonPartitionField> = serde_json::from_str(&json) + .map_err(|e| Error::input_error(format!("Invalid partition fields JSON: {}", e)))?; + let mut fields = Vec::with_capacity(json_fields.len()); + for jf in &json_fields { + fields.push( + PartitionField::from_json(jf).map_err(|e| { + Error::input_error(format!("Invalid partition field: {}", e)) + })?, + ); + } + let spec = RT + .block_on(ns.inner.update_partition_spec(fields)) + .map_err(|e| { + Error::runtime_error(format!("update_partition_spec failed: {}", e)) + })?; + let json = serde_json::to_string(&spec.to_json()).map_err(|e| { + Error::runtime_error(format!("Failed to serialize partition spec: {}", e)) + })?; + Ok::<jstring, Error>(env.new_string(json)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_resolveOrCreatePartitionTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + arrow_array_stream_addr: jlong, +) -> jstring { + ok_or_throw_with_return!( + env, + (|| { + use arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; + let mut reader = + unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }.map_err(|e| { + Error::runtime_error(format!("Failed to import ArrowArrayStream: {}", e)) + })?; + + let batch = reader + .next() + .transpose() + .map_err(|e| Error::runtime_error(format!("Failed to read record batch: {}", e)))?
.ok_or_else(|| Error::input_error("Empty ArrowArrayStream".to_string()))?; + + if batch.num_rows() != 1 { + return Err(Error::input_error(format!( + "resolve_or_create_partition_table expects exactly 1 row, got {}", + batch.num_rows() + ))); + } + + let table = RT + .block_on(ns.inner.resolve_or_create_partition_table(&batch)) + .map_err(|e| { + Error::runtime_error(format!("resolve_or_create_partition_table failed: {}", e)) + })?; + let json = serde_json::to_string(&to_java_partition_table(&table)) + .map_err(|e| Error::runtime_error(format!("Failed to serialize table: {}", e)))?; + Ok::<jstring, Error>(env.new_string(json)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_tablesNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, +) -> jstring { + ok_or_throw_with_return!( + env, + (|| { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let tables = RT.block_on(ns.inner.tables()).map_err(|e| { + Error::runtime_error(format!("PartitionedNamespace.tables failed: {}", e)) + })?; + + let java_tables: Vec<JavaPartitionTable> = + tables.iter().map(to_java_partition_table).collect(); + let json = serde_json::to_string(&java_tables) + .map_err(|e| Error::runtime_error(format!("Failed to serialize tables: {}", e)))?; + + Ok::<jstring, Error>(env.new_string(json)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_commitNative( + mut env: JNIEnv, + _obj: JObject, + _handle: jlong, + _read_version_json: JObject, + _new_version_json: JObject, +) -> jstring { + let err = Error::runtime_error("PartitionedNamespace.commit is not implemented".to_string()); + err.throw(&mut env); + std::ptr::null_mut() +} diff --git a/java/src/main/java/org/lance/namespace/PartitionedNamespace.java b/java/src/main/java/org/lance/namespace/PartitionedNamespace.java new file mode 100644 index 00000000000..194b454fafa --- /dev/null +++ b/java/src/main/java/org/lance/namespace/PartitionedNamespace.java @@ -0,0 +1,463 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.lance.namespace; + +import org.lance.JniLoader; +import org.lance.namespace.model.*; +import org.lance.schema.LanceSchema; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Preconditions; +import org.apache.arrow.c.ArrowArrayStream; +import org.apache.arrow.memory.BufferAllocator; + +import java.io.Closeable; +import java.util.List; +import java.util.Map; + +/** Java wrapper for the native Rust PartitionedNamespace implementation.
*/ +public final class PartitionedNamespace implements LanceNamespace, Closeable { + static { + JniLoader.ensureLoaded(); + } + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private long nativeHandle; + + public PartitionedNamespace() {} + + PartitionedNamespace(long nativeHandle) { + Preconditions.checkArgument(nativeHandle != 0, "nativeHandle is 0"); + this.nativeHandle = nativeHandle; + } + + @Override + public void initialize(Map<String, String> configProperties, BufferAllocator allocator) { + initialize(configProperties, allocator, null); + } + + /** Initialize with a dynamic context provider. */ + public void initialize( + Map<String, String> configProperties, + BufferAllocator allocator, + DynamicContextProvider contextProvider) { + Preconditions.checkNotNull(configProperties, "configProperties is null"); + Preconditions.checkNotNull(allocator, "allocator is null"); + Preconditions.checkArgument(nativeHandle == 0, "PartitionedNamespace already initialized"); + if (contextProvider != null) { + this.nativeHandle = createNativeWithProvider(configProperties, contextProvider); + } else { + this.nativeHandle = createNative(configProperties); + } + } + + @Override + public String namespaceId() { + ensureOpen(); + return namespaceIdNative(nativeHandle); + } + + @Override + public ListNamespacesResponse listNamespaces(ListNamespacesRequest request) { + ensureOpen(); + String json = listNamespacesNative(nativeHandle, toJson(request)); + return fromJson(json, ListNamespacesResponse.class); + } + + @Override + public DescribeNamespaceResponse describeNamespace(DescribeNamespaceRequest request) { + ensureOpen(); + String json = describeNamespaceNative(nativeHandle, toJson(request)); + return fromJson(json, DescribeNamespaceResponse.class); + } + + @Override + public CreateNamespaceResponse createNamespace(CreateNamespaceRequest request) { + ensureOpen(); + String json = createNamespaceNative(nativeHandle, toJson(request)); + return fromJson(json, CreateNamespaceResponse.class); + } + + @Override + public DropNamespaceResponse dropNamespace(DropNamespaceRequest request) { + ensureOpen(); + String json = dropNamespaceNative(nativeHandle, toJson(request)); + return fromJson(json, DropNamespaceResponse.class); + } + + @Override + public void namespaceExists(NamespaceExistsRequest request) { + ensureOpen(); + namespaceExistsNative(nativeHandle, toJson(request)); + } + + @Override + public ListTablesResponse listTables(ListTablesRequest request) { + ensureOpen(); + String json = listTablesNative(nativeHandle, toJson(request)); + return fromJson(json, ListTablesResponse.class); + } + + @Override + public DescribeTableResponse describeTable(DescribeTableRequest request) { + ensureOpen(); + String json = describeTableNative(nativeHandle, toJson(request)); + return fromJson(json, DescribeTableResponse.class); + } + + @Override + public RegisterTableResponse registerTable(RegisterTableRequest request) { + ensureOpen(); + String json = registerTableNative(nativeHandle, toJson(request)); + return fromJson(json, RegisterTableResponse.class); + } + + @Override + public void tableExists(TableExistsRequest request) { + ensureOpen(); + tableExistsNative(nativeHandle, toJson(request)); + } + + @Override + public DropTableResponse dropTable(DropTableRequest request) { + ensureOpen(); + String json = dropTableNative(nativeHandle, toJson(request)); + return fromJson(json, DropTableResponse.class); + } + + @Override + public DeregisterTableResponse deregisterTable(DeregisterTableRequest request) { + ensureOpen(); +
String json = deregisterTableNative(nativeHandle, toJson(request)); + return fromJson(json, DeregisterTableResponse.class); + } + + @Override + public Long countTableRows(CountTableRowsRequest request) { + ensureOpen(); + return countTableRowsNative(nativeHandle, toJson(request)); + } + + @Override + public CreateTableResponse createTable(CreateTableRequest request, byte[] requestData) { + ensureOpen(); + Preconditions.checkNotNull(requestData, "requestData is null"); + String json = createTableNative(nativeHandle, toJson(request), requestData); + return fromJson(json, CreateTableResponse.class); + } + + @Override + public CreateEmptyTableResponse createEmptyTable(CreateEmptyTableRequest request) { + ensureOpen(); + String json = createEmptyTableNative(nativeHandle, toJson(request)); + return fromJson(json, CreateEmptyTableResponse.class); + } + + @Override + public DeclareTableResponse declareTable(DeclareTableRequest request) { + ensureOpen(); + String json = declareTableNative(nativeHandle, toJson(request)); + return fromJson(json, DeclareTableResponse.class); + } + + @Override + public InsertIntoTableResponse insertIntoTable( + InsertIntoTableRequest request, byte[] requestData) { + ensureOpen(); + Preconditions.checkNotNull(requestData, "requestData is null"); + String json = insertIntoTableNative(nativeHandle, toJson(request), requestData); + return fromJson(json, InsertIntoTableResponse.class); + } + + @Override + public MergeInsertIntoTableResponse mergeInsertIntoTable( + MergeInsertIntoTableRequest request, byte[] requestData) { + ensureOpen(); + Preconditions.checkNotNull(requestData, "requestData is null"); + String json = mergeInsertIntoTableNative(nativeHandle, toJson(request), requestData); + return fromJson(json, MergeInsertIntoTableResponse.class); + } + + @Override + public UpdateTableResponse updateTable(UpdateTableRequest request) { + ensureOpen(); + String json = updateTableNative(nativeHandle, toJson(request)); + return fromJson(json, UpdateTableResponse.class); + } + + @Override + public DeleteFromTableResponse deleteFromTable(DeleteFromTableRequest request) { + ensureOpen(); + String json = deleteFromTableNative(nativeHandle, toJson(request)); + return fromJson(json, DeleteFromTableResponse.class); + } + + @Override + public byte[] queryTable(QueryTableRequest request) { + ensureOpen(); + return queryTableNative(nativeHandle, toJson(request)); + } + + @Override + public CreateTableIndexResponse createTableIndex(CreateTableIndexRequest request) { + ensureOpen(); + String json = createTableIndexNative(nativeHandle, toJson(request)); + return fromJson(json, CreateTableIndexResponse.class); + } + + @Override + public ListTableIndicesResponse listTableIndices(ListTableIndicesRequest request) { + ensureOpen(); + String json = listTableIndicesNative(nativeHandle, toJson(request)); + return fromJson(json, ListTableIndicesResponse.class); + } + + @Override + public DescribeTableIndexStatsResponse describeTableIndexStats( + DescribeTableIndexStatsRequest request, String indexName) { + ensureOpen(); + String json = describeTableIndexStatsNative(nativeHandle, toJson(request)); + return fromJson(json, DescribeTableIndexStatsResponse.class); + } + + @Override + public DescribeTransactionResponse describeTransaction(DescribeTransactionRequest request) { + ensureOpen(); + String json = describeTransactionNative(nativeHandle, toJson(request)); + return fromJson(json, DescribeTransactionResponse.class); + } + + @Override + public AlterTransactionResponse 
alterTransaction(AlterTransactionRequest request) { + ensureOpen(); + String json = alterTransactionNative(nativeHandle, toJson(request)); + return fromJson(json, AlterTransactionResponse.class); + } + + /** Shared logical schema enforced across all partition tables. */ + public LanceSchema schema() { + ensureOpen(); + return schemaNative(nativeHandle); + } + + /** + * Partition pruning for the given filter expression. + * + * @param filter SQL expression used in a WHERE clause (empty means TRUE) + */ + public List<PlanScanItem> planScan(String filter) { + ensureOpen(); + Preconditions.checkNotNull(filter, "filter is null"); + String json = planScanNative(nativeHandle, filter); + return fromJson(json, new TypeReference<List<PlanScanItem>>() {}); + } + + /** Get all partition specs as JSON objects. */ + public List<Map<String, Object>> partitioning() { + ensureOpen(); + String json = partitioningNative(nativeHandle); + return fromJson(json, new TypeReference<List<Map<String, Object>>>() {}); + } + + /** + * Update the current partition spec. + * + * @param partitionFieldsJson JSON array of partition field definitions + * @return the new partition spec as a JSON object + */ + public Map<String, Object> updatePartitionSpec(String partitionFieldsJson) { + ensureOpen(); + Preconditions.checkNotNull(partitionFieldsJson, "partitionFieldsJson is null"); + String json = updatePartitionSpecNative(nativeHandle, partitionFieldsJson); + return fromJson(json, new TypeReference<Map<String, Object>>() {}); + } + + /** + * Resolve the target partition table for the input row. Create it (as an empty table) if it does not exist. + * + * <p>The stream must contain exactly one record batch with exactly one row. + */ + public PartitionTable resolveOrCreatePartitionTable(ArrowArrayStream recordStream) { + ensureOpen(); + Preconditions.checkNotNull(recordStream, "recordStream is null"); + String json = resolveOrCreatePartitionTableNative(nativeHandle, recordStream.memoryAddress()); + return fromJson(json, new TypeReference<PartitionTable>() {}); + } + + /** List all partition tables in this partitioned namespace. */ + public List<PartitionTable> tables() { + ensureOpen(); + String json = tablesNative(nativeHandle); + return fromJson(json, new TypeReference<List<PartitionTable>>() {}); + } + + /** Commit (currently not implemented on the Rust side). */ + public String commit(Object readVersionJson, Object newVersionJson) { + ensureOpen(); + return commitNative(nativeHandle, readVersionJson, newVersionJson); + } + + @Override + public void close() { + if (nativeHandle != 0) { + releaseNative(nativeHandle); + nativeHandle = 0; + } + } + + private void ensureOpen() { + Preconditions.checkArgument(nativeHandle != 0, "PartitionedNamespace is closed"); + } + + private static <T> T fromJson(String json, TypeReference<T> typeRef) { + try { + return OBJECT_MAPPER.readValue(json, typeRef); + } catch (JsonProcessingException e) { + throw new RuntimeException("Failed to deserialize JSON", e); + } + } + + private static String toJson(Object obj) { + try { + return OBJECT_MAPPER.writeValueAsString(obj); + } catch (JsonProcessingException e) { + throw new RuntimeException("Failed to serialize request to JSON", e); + } + } + + private static <T> T fromJson(String json, Class<T> clazz) { + try { + return OBJECT_MAPPER.readValue(json, clazz); + } catch (JsonProcessingException e) { + throw new RuntimeException("Failed to deserialize JSON", e); + } + } + + /** PlanScan result item. */ + public static final class PlanScanItem { + @JsonProperty("table") + private PartitionTable table; + + @JsonProperty("refine_expr") + private String refineExpr; + + public PlanScanItem() {} + + public PartitionTable table() { + return table; + } + + public String refineExpr() { + return refineExpr; + } + } + + /** Partition table identifier.
*/ + public static final class PartitionTable { + @JsonProperty("id") + private List<String> id; + + @JsonProperty("read_version") + private Long readVersion; + + public PartitionTable() {} + + public List<String> id() { + return id; + } + + public Long readVersion() { + return readVersion; + } + } + + // Native methods + private native long createNative(Map<String, String> configProperties); + + private native long createNativeWithProvider( + Map<String, String> configProperties, DynamicContextProvider contextProvider); + + private native void releaseNative(long handle); + + private native String namespaceIdNative(long handle); + + private native String listNamespacesNative(long handle, String requestJson); + + private native String describeNamespaceNative(long handle, String requestJson); + + private native String createNamespaceNative(long handle, String requestJson); + + private native String dropNamespaceNative(long handle, String requestJson); + + private native void namespaceExistsNative(long handle, String requestJson); + + private native String listTablesNative(long handle, String requestJson); + + private native String describeTableNative(long handle, String requestJson); + + private native String registerTableNative(long handle, String requestJson); + + private native void tableExistsNative(long handle, String requestJson); + + private native String dropTableNative(long handle, String requestJson); + + private native String deregisterTableNative(long handle, String requestJson); + + private native long countTableRowsNative(long handle, String requestJson); + + private native String createTableNative(long handle, String requestJson, byte[] requestData); + + private native String createEmptyTableNative(long handle, String requestJson); + + private native String declareTableNative(long handle, String requestJson); + + private native String insertIntoTableNative(long handle, String requestJson, byte[] requestData); + + private native String mergeInsertIntoTableNative( + long handle, String requestJson, byte[] requestData); + + private native String updateTableNative(long handle, String requestJson); + + private native String deleteFromTableNative(long handle, String requestJson); + + private native byte[] queryTableNative(long handle, String requestJson); + + private native String createTableIndexNative(long handle, String requestJson); + + private native String listTableIndicesNative(long handle, String requestJson); + + private native String describeTableIndexStatsNative(long handle, String requestJson); + + private native String describeTransactionNative(long handle, String requestJson); + + private native String alterTransactionNative(long handle, String requestJson); + + private native LanceSchema schemaNative(long handle); + + private native String planScanNative(long handle, String filter); + + private native String partitioningNative(long handle); + + private native String updatePartitionSpecNative(long handle, String partitionFieldsJson); + + private native String resolveOrCreatePartitionTableNative(long handle, long arrowArrayStreamAddr); + + private native String tablesNative(long handle); + + private native String commitNative(long handle, Object readVersionJson, Object newVersionJson); +} diff --git a/python/Cargo.lock b/python/Cargo.lock index 2748e2fdd87..c3523812524 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4303,13 +4303,17 @@ dependencies = [ "axum", "bytes", "chrono", + "datafusion-functions", "futures", "lance", + "lance-arrow", "lance-core", "lance-index", "lance-io", "lance-namespace", +
"lance-namespace-reqwest-client", "log", + "murmur3", "object_store", "rand 0.9.2", "reqwest", @@ -4325,8 +4329,7 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" +source = "git+https://github.com/wojiaodoubao/lance-namespace?branch=rest-table-properties#a122997e5ed74122686c3a4bd228b839d0d025eb" dependencies = [ "reqwest", "serde", @@ -4902,6 +4905,12 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "murmurhash32" version = "0.3.1" diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index b41e7f44e01..d1d6b39bda1 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -13,8 +13,8 @@ rust-version.workspace = true [features] default = ["dir-aws", "dir-azure", "dir-gcp", "dir-oss", "dir-huggingface"] -rest = ["dep:reqwest", "dep:serde"] -rest-adapter = ["dep:axum", "dep:tower", "dep:tower-http", "dep:serde"] +rest = ["dep:reqwest"] +rest-adapter = ["dep:axum", "dep:tower", "dep:tower-http"] # Cloud storage features for directory implementation - align with lance-io dir-gcp = ["lance-io/gcp", "lance/gcp"] dir-aws = ["lance-io/aws", "lance/aws"] @@ -23,12 +23,15 @@ dir-oss = ["lance-io/oss", "lance/oss"] dir-huggingface = ["lance-io/huggingface", "lance/huggingface"] # Credential vending features credential-vendor-aws = ["dep:aws-sdk-sts", "dep:aws-config", "dep:aws-credential-types", "dep:sha2", "dep:base64"] -credential-vendor-gcp = ["dep:google-cloud-auth", "dep:reqwest", "dep:serde", "dep:sha2", "dep:base64"] +credential-vendor-gcp = ["dep:google-cloud-auth", "dep:reqwest", "dep:sha2", "dep:base64"] credential-vendor-azure = ["dep:azure_core", "dep:azure_identity", "dep:azure_storage", "dep:azure_storage_blobs", "dep:time", "dep:sha2", "dep:base64", "dep:reqwest"] [dependencies] lance-namespace.workspace = true lance-core.workspace = true +lance-namespace-reqwest-client.workspace = true +murmur3 = "0.5" +datafusion-functions.workspace = true # REST implementation dependencies (optional, enabled by "rest" feature) reqwest = { version = "0.12", optional = true, default-features = false, features = [ @@ -53,7 +56,7 @@ arrow-schema = { workspace = true } axum = { workspace = true, optional = true } tower = { workspace = true, optional = true } tower-http = { workspace = true, optional = true, features = ["trace", "cors", "normalize-path"] } -serde = { workspace = true, optional = true } +serde = { workspace = true } # Common dependencies async-trait.workspace = true @@ -82,6 +85,7 @@ azure_identity = { version = "0.21", optional = true } azure_storage = { version = "0.21", optional = true } azure_storage_blobs = { version = "0.21", optional = true } time = { version = "0.3", optional = true } +lance-arrow = { workspace = true } [dev-dependencies] tokio = { workspace = true, features = ["full"] } diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 875df33e580..bcaff210e96 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ 
b/rust/lance-namespace-impls/src/dir.rs @@ -7,21 +7,17 @@ //! that stores tables as Lance datasets in a filesystem directory structure. pub mod manifest; +pub mod manifest_ext; +use crate::context::DynamicContextProvider; use arrow::record_batch::RecordBatchIterator; use arrow_ipc::reader::StreamReader; use async_trait::async_trait; use bytes::Bytes; use lance::dataset::{Dataset, WriteParams}; use lance::session::Session; +use lance_core::{box_error, Error, Result}; use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; -use object_store::path::Path; -use object_store::{Error as ObjectStoreError, ObjectStore as OSObjectStore, PutMode, PutOptions}; -use std::collections::HashMap; -use std::io::Cursor; -use std::sync::Arc; - -use crate::context::DynamicContextProvider; use lance_namespace::models::{ CreateEmptyTableRequest, CreateEmptyTableResponse, CreateNamespaceRequest, CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest, @@ -30,14 +26,19 @@ use lance_namespace::models::{ DropTableRequest, DropTableResponse, Identity, ListNamespacesRequest, ListNamespacesResponse, ListTablesRequest, ListTablesResponse, NamespaceExistsRequest, TableExistsRequest, }; - -use lance_core::{box_error, Error, Result}; use lance_namespace::schema::arrow_schema_to_json; use lance_namespace::LanceNamespace; +use object_store::path::Path; +use object_store::{Error as ObjectStoreError, ObjectStore as OSObjectStore, PutMode, PutOptions}; +use snafu::location; +use std::collections::HashMap; +use std::io::Cursor; +use std::sync::Arc; use crate::credentials::{ create_credential_vendor_for_location, has_credential_vendor_config, CredentialVendor, }; +use crate::ManifestNamespace; /// Result of checking table status atomically. /// @@ -552,6 +553,16 @@ impl std::fmt::Display for DirectoryNamespace { } impl DirectoryNamespace { + pub fn manifest_namespace(&self) -> Result<Arc<ManifestNamespace>> { + match self.manifest_ns { + Some(ref ns) => Ok(ns.clone()), + None => Err(Error::Namespace { + source: "Not a manifest namespace".into(), + location: location!(), + }), + } + } + + /// Apply pagination to a list of table names /// /// Sorts the list alphabetically and applies pagination using page_token (start_after) and limit. diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 49d19712e26..c3a358a52c0 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -6,16 +6,23 @@ //! This module provides a namespace implementation that uses a manifest table //! to track tables and nested namespaces.
-use arrow::array::{Array, RecordBatch, RecordBatchIterator, StringArray}; +use crate::dir::manifest_ext::ManifestNamespaceExt; +use arrow::array::{Array, ArrayRef, RecordBatch, RecordBatchIterator, StringArray}; use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use arrow_ipc::reader::StreamReader; +use arrow_schema::{FieldRef, Schema}; use async_trait::async_trait; use bytes::Bytes; use futures::stream::StreamExt; use lance::dataset::optimize::{compact_files, CompactionOptions}; -use lance::dataset::{builder::DatasetBuilder, WriteParams}; +use lance::dataset::transaction::UpdateMapEntry; +use lance::dataset::{builder::DatasetBuilder, NewColumnTransform, WriteParams}; +use lance::deps::datafusion::logical_expr::Expr; +use lance::deps::datafusion::prelude::{col, lit}; +use lance::deps::datafusion::scalar::ScalarValue; use lance::session::Session; use lance::{dataset::scanner::Scanner, Dataset}; +use lance_arrow::RecordBatchExt; use lance_core::{box_error, Error, Result}; use lance_index::optimize::OptimizeOptions; use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams}; @@ -36,9 +43,12 @@ use lance_namespace::schema::arrow_schema_to_json; use lance_namespace::LanceNamespace; use object_store::path::Path; use snafu::location; +use std::collections::HashSet; use std::io::Cursor; +use std::str::FromStr; use std::{ collections::HashMap, + f32, f64, hash::{DefaultHasher, Hash, Hasher}, ops::{Deref, DerefMut}, sync::Arc, @@ -46,7 +56,7 @@ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; const MANIFEST_TABLE_NAME: &str = "__manifest"; -const DELIMITER: &str = "$"; +pub(crate) const DELIMITER: &str = "$"; // Index names for the __manifest table /// BTREE index on the object_id column for fast lookups @@ -83,12 +93,18 @@ impl ObjectType { } } +pub enum ManifestObject { + Table(TableInfo), + Namespace(NamespaceInfo), +} + /// Information about a table stored in the manifest #[derive(Debug, Clone)] pub struct TableInfo { pub namespace: Vec<String>, pub name: String, pub location: String, + pub properties: Option<HashMap<String, String>>, } /// Information about a namespace stored in the manifest @@ -235,24 +251,37 @@ impl DerefMut for DatasetWriteGuard<'_> { } } +/// Extended properties are special properties that start with the `lance.manifest.extended.` prefix, +/// and are stored in the manifest table. +/// +/// For example, a namespace object contains metadata like: +/// ```json +/// { +/// "user_name": "Alice", +/// "lance.manifest.extended.user_id": "123456" +/// } +/// ``` +/// The first is stored in the column named "metadata"; the second is stored in a column named "user_id". +pub(crate) static EXTENDED_PREFIX: &str = "lance.manifest.extended."; + /// Manifest-based namespace implementation /// /// Uses a special `__manifest` Lance table to track tables and nested namespaces.
#[derive(Debug)] pub struct ManifestNamespace { - root: String, - storage_options: Option<HashMap<String, String>>, + pub(crate) root: String, + pub(crate) storage_options: Option<HashMap<String, String>>, #[allow(dead_code)] session: Option<Arc<Session>>, #[allow(dead_code)] - object_store: Arc<ObjectStore>, + pub(crate) object_store: Arc<ObjectStore>, #[allow(dead_code)] - base_path: Path, - manifest_dataset: DatasetConsistencyWrapper, + pub(crate) base_path: Path, + pub(crate) manifest_dataset: DatasetConsistencyWrapper, /// Whether directory listing is enabled in dual mode /// If true, root namespace tables use {table_name}.lance naming /// If false, they use namespace-prefixed names - dir_listing_enabled: bool, + pub(crate) dir_listing_enabled: bool, /// Whether to perform inline optimization (compaction and indexing) on the __manifest table /// after every write. Defaults to true. inline_optimization_enabled: bool, @@ -311,8 +340,86 @@ impl ManifestNamespace { } } + /// Add extended properties to the manifest table. + pub async fn add_extended_properties(&self, properties: &Vec<(&str, DataType)>) -> Result<()> { + let full_schema = self.full_manifest_schema().await?; + let fields: Vec<Field> = properties + .iter() + .map(|(name, data_type)| { + if !name.starts_with(EXTENDED_PREFIX) { + return Err(Error::io( + format!( + "Extended properties key {} must start with prefix: {}", + name, EXTENDED_PREFIX + ), + location!(), + )); + } + Ok(Field::new( + name.strip_prefix(EXTENDED_PREFIX).unwrap().to_string(), + data_type.clone(), + true, + )) + }) + .collect::<Result<Vec<_>>>()? + .into_iter() + .filter(|f| full_schema.column_with_name(f.name()).is_none()) + .collect(); + + let schema = Schema::new(fields); + let transform = NewColumnTransform::AllNulls(Arc::new(schema)); + + let mut ds = self.manifest_dataset.get_mut().await?; + ds.add_columns(transform, None, None).await?; + + Ok(()) + } + + /// Get all extended properties keys + pub async fn get_extended_properties_keys(&self) -> Result<Vec<String>> { + let basic_cols: HashSet<String> = Self::basic_manifest_schema() + .fields + .iter() + .map(|f| f.name().to_string()) + .collect(); + let mut extended_props_keys = vec![]; + for f in self.full_manifest_schema().await?.fields.iter() { + if !basic_cols.contains(f.name().as_str()) { + extended_props_keys.push(f.name().to_string()); + } + } + Ok(extended_props_keys) + } + + /// Remove extended properties from the manifest table. + pub async fn remove_extended_properties(&mut self, properties: &Vec<&str>) -> Result<()> { + let full_schema = self.full_manifest_schema().await?; + let to_remove: Vec<String> = properties + .iter() + .map(|name| { + if !name.starts_with(EXTENDED_PREFIX) { + return Err(Error::io( + format!( + "Extended properties key {} must start with prefix: {}", + name, EXTENDED_PREFIX + ), + location!(), + )); + } + Ok(name.strip_prefix(EXTENDED_PREFIX).unwrap().to_string()) + }) + .collect::<Result<Vec<_>>>()? + .into_iter() + .filter(|s| full_schema.column_with_name(s.as_str()).is_some()) + .collect(); + let remove: Vec<&str> = to_remove.iter().map(|s| s.as_str()).collect(); + + let mut ds = self.manifest_dataset.get_mut().await?; + ds.drop_columns(&remove).await + } + /// Split an object ID (table_id as vec of strings) into namespace and table name - fn split_object_id(table_id: &[String]) -> (Vec<String>, String) { + pub(crate) fn split_object_id(table_id: &[String]) -> (Vec<String>, String) { if table_id.len() == 1 { (vec![], table_id[0].clone()) } else { @@ -334,7 +441,7 @@ /// failed table creation, delete and create new table of the same name, etc.
/// The object_id is added after the hash to ensure /// dir name uniqueness and make debugging easier. - fn generate_dir_name(object_id: &str) -> String { + pub(crate) fn generate_dir_name(object_id: &str) -> String { // Generate a random number for uniqueness let random_num: u64 = rand::random(); @@ -385,7 +492,7 @@ impl ManifestNamespace { /// 3. Optimizes existing indices /// /// This is called automatically after writes when inline_optimization_enabled is true. - async fn run_inline_optimization(&self) -> Result<()> { + pub(crate) async fn run_inline_optimization(&self) -> Result<()> { if !self.inline_optimization_enabled { return Ok(()); } @@ -516,8 +623,8 @@ impl ManifestNamespace { Ok(()) } - /// Get the manifest schema - fn manifest_schema() -> Arc<ArrowSchema> { + /// Get the manifest schema of basic fields: object_id, object_type, location, metadata, base_objects + fn basic_manifest_schema() -> Arc<ArrowSchema> { Arc::new(ArrowSchema::new(vec![ Field::new("object_id", DataType::Utf8, false), Field::new("object_type", DataType::Utf8, false), @@ -531,6 +638,13 @@ ])) } + /// Get the full manifest schema, including basic fields and extended fields. + pub(crate) async fn full_manifest_schema(&self) -> Result<ArrowSchema> { + let dataset_guard = self.manifest_dataset.get().await?; + let schema = ArrowSchema::from(dataset_guard.schema()); + Ok(schema) + } + /// Get a scanner for the manifest dataset async fn manifest_scanner(&self) -> Result<Scanner> { let dataset_guard = self.manifest_dataset.get().await?; @@ -579,15 +693,10 @@ /// Check if the manifest contains an object with the given ID async fn manifest_contains_object(&self, object_id: &str) -> Result<bool> { - let filter = format!("object_id = '{}'", object_id); - let dataset_guard = self.manifest_dataset.get().await?; let mut scanner = dataset_guard.scan(); - scanner.filter(&filter).map_err(|e| Error::IO { - source: box_error(std::io::Error::other(format!("Failed to filter: {}", e))), - location: location!(), - })?; + scanner.filter_expr(col("object_id").eq(lit(object_id.to_string()))); // Project no columns and enable row IDs for count_rows to work scanner.project::<&str>(&[]).map_err(|e| Error::IO { @@ -609,52 +718,34 @@ } /// Query the manifest for a table with the given object ID - async fn query_manifest_for_table(&self, object_id: &str) -> Result<Option<TableInfo>> { - let filter = format!("object_id = '{}' AND object_type = 'table'", object_id); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| Error::IO { - source: box_error(std::io::Error::other(format!("Failed to filter: {}", e))), - location: location!(), - })?; - scanner - .project(&["object_id", "location"]) - .map_err(|e| Error::IO { - source: box_error(std::io::Error::other(format!("Failed to project: {}", e))), - location: location!(), - })?; - let batches = Self::execute_scanner(scanner).await?; - - let mut found_result: Option<TableInfo> = None; - let mut total_rows = 0; - - for batch in batches { - if batch.num_rows() == 0 { + pub(crate) async fn query_manifest_for_table( + &self, + object_id: &str, + ) -> Result<Option<TableInfo>> { + let objects = self + .query_manifest( + col("object_id") + .eq(lit(object_id.to_string())) + .and(col("object_type").eq(lit("table"))), + ) + .await?; + let mut found: Option<TableInfo> = None; + for obj in objects { + let ManifestObject::Table(t) = obj else { continue; - } - - total_rows += batch.num_rows(); - if total_rows > 1 { + }; + if found.is_some() { return Err(Error::io( format!( - "Expected exactly 1 table with id '{}', found {}", - object_id, total_rows + "Expected exactly 1 table with id '{}', found more than 1", + object_id ), location!(), )); } - - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let location_array = Self::get_string_column(&batch, "location")?; - let location = location_array.value(0).to_string(); - let (namespace, name) = Self::parse_object_id(object_id_array.value(0)); - found_result = Some(TableInfo { - namespace, - name, - location, - }); + found = Some(t); } - - Ok(found_result) + Ok(found) } /// List all table locations in the manifest (for root namespace only) @@ -694,22 +785,23 @@ object_type: ObjectType, location: Option<String>, ) -> Result<()> { - self.insert_into_manifest_with_metadata(object_id, object_type, location, None, None) + self.insert_into_manifest_with_metadata(object_id, object_type, location, None, None, None) .await } /// Insert an entry into the manifest table with metadata and base_objects - async fn insert_into_manifest_with_metadata( + pub(crate) async fn insert_into_manifest_with_metadata( &self, object_id: String, object_type: ObjectType, location: Option<String>, metadata: Option<String>, base_objects: Option<Vec<String>>, + extended_batch: Option<RecordBatch>, ) -> Result<()> { use arrow::array::builder::{ListBuilder, StringBuilder}; - let schema = Self::manifest_schema(); + let basic_schema = Self::basic_manifest_schema(); // Create base_objects array from the provided list let string_builder = StringBuilder::new(); @@ -745,7 +837,7 @@ }; let batch = RecordBatch::try_new( - schema.clone(), + basic_schema.clone(), vec![ Arc::new(StringArray::from(vec![object_id.as_str()])), Arc::new(StringArray::from(vec![object_type.as_str()])), @@ -761,7 +853,15 @@ ) })?; - let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + // Merge extended_batch with basic batch if provided + let batch = if let Some(extended_batch) = extended_batch { + batch.merge(&extended_batch)?
+ } else { + batch + }; + + let schema = batch.schema(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); // Use MergeInsert to ensure uniqueness on object_id let dataset_guard = self.manifest_dataset.get().await?; @@ -869,8 +969,26 @@ .await } + /// Get the metadata of the __manifest table + pub async fn get_metadata(&self) -> Result<HashMap<String, String>> { + let ds = self.manifest_dataset.get().await?; + Ok(ds.metadata().clone()) + } + + /// Update the metadata of the __manifest table + pub async fn update_metadata( + &self, + values: impl IntoIterator<Item = impl Into<UpdateMapEntry>>, + ) -> Result<HashMap<String, String>> { + let mut ds = self.manifest_dataset.get_mut().await?; + ds.update_metadata(values).await + } + /// Validate that all levels of a namespace path exist - async fn validate_namespace_levels_exist(&self, namespace_path: &[String]) -> Result<()> { + pub(crate) async fn validate_namespace_levels_exist( + &self, + namespace_path: &[String], + ) -> Result<()> { for i in 1..=namespace_path.len() { let partial_path = &namespace_path[..i]; let object_id = partial_path.join(DELIMITER); @@ -886,70 +1004,51 @@ /// Query the manifest for a namespace with the given object ID async fn query_manifest_for_namespace(&self, object_id: &str) -> Result<Option<NamespaceInfo>> { - let filter = format!("object_id = '{}' AND object_type = 'namespace'", object_id); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| Error::IO { - source: box_error(std::io::Error::other(format!("Failed to filter: {}", e))), - location: location!(), - })?; - scanner - .project(&["object_id", "metadata"]) - .map_err(|e| Error::IO { - source: box_error(std::io::Error::other(format!("Failed to project: {}", e))), - location: location!(), - })?; - let batches = Self::execute_scanner(scanner).await?; - - let mut found_result: Option<NamespaceInfo> = None; - let mut total_rows = 0; - - for batch in batches { - if batch.num_rows() == 0 { + let objects = self + .query_manifest( + col("object_id") + .eq(lit(object_id.to_string())) + .and(col("object_type").eq(lit("namespace"))), + ) + .await?; + let mut found: Option<NamespaceInfo> = None; + for obj in objects { + let ManifestObject::Namespace(ns) = obj else { continue; - } - - total_rows += batch.num_rows(); - if total_rows > 1 { + }; + if found.is_some() { return Err(Error::io( format!( - "Expected exactly 1 namespace with id '{}', found {}", - object_id, total_rows + "Expected exactly 1 namespace with id '{}', found more than 1", + object_id ), location!(), )); } + found = Some(ns); + } + Ok(found) + } - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let metadata_array = Self::get_string_column(&batch, "metadata")?; - - let object_id_str = object_id_array.value(0); - let metadata = if !metadata_array.is_null(0) { - let metadata_str = metadata_array.value(0); - match serde_json::from_str::<HashMap<String, String>>(metadata_str) { - Ok(map) => Some(map), - Err(e) => { - return Err(Error::io( - format!( - "Failed to deserialize metadata for namespace '{}': {}", - object_id, e - ), - location!(), - )); - } - } - } else { - None - }; + pub(crate) async fn query_manifest(&self, filter: Expr) -> Result<Vec<ManifestObject>> { + let mut scanner = self.manifest_scanner().await?; + scanner.filter_expr(filter); + let batches = Self::execute_scanner(scanner).await?; - let (namespace, name) = Self::parse_object_id(object_id_str); - found_result = Some(NamespaceInfo { - namespace, - name, - metadata, - }); + let mut objects: Vec<ManifestObject> = vec![]; + + for batch in batches.iter() { + for row_idx in 0..batch.num_rows() { + let sliced_columns: Vec<ArrayRef> = batch + .columns() + .iter() + .map(|col| col.slice(row_idx, 1)) + .collect(); + let row = RecordBatch::try_new(batch.schema(), sliced_columns)?; + objects.push(parse_manifest_object(&row)?); + } } - - Ok(found_result) + Ok(objects) } /// Create or get the manifest dataset @@ -975,7 +1074,7 @@ Ok(DatasetConsistencyWrapper::new(dataset)) } else { log::info!("Creating new manifest table at {}", manifest_path); - let schema = Self::manifest_schema(); + let schema = Self::basic_manifest_schema(); let empty_batch = RecordBatch::new_empty(schema.clone()); let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone()); @@ -1011,6 +1110,86 @@ Ok(DatasetConsistencyWrapper::new(dataset)) } } + + pub(crate) fn build_metadata_json( + properties: &Option<HashMap<String, String>>, + ) -> Option<String> { + properties.as_ref().and_then(|props| { + if props.is_empty() { + None + } else { + let meta_props = props + .iter() + .filter(|(key, _)| !key.starts_with(EXTENDED_PREFIX)) + .collect::<HashMap<_, _>>(); + Some(serde_json::to_string(&meta_props).ok()?) + } + }) + } +} + +/// Parse one row of the __manifest table into a manifest object. +fn parse_manifest_object(batch: &RecordBatch) -> Result<ManifestObject> { + if batch.num_rows() == 0 { + return Err(Error::InvalidInput { + source: "batch must have at least one row".into(), + location: location!(), + }); + } + + // Parse properties + let mut merged = batch_to_extended_props(batch); + let metadata_array = ManifestNamespace::get_string_column(batch, "metadata")?; + + if !metadata_array.is_null(0) { + let metadata_str = metadata_array.value(0); + match serde_json::from_str::<HashMap<String, String>>(metadata_str) { + Ok(map) => merged.extend(map), + Err(e) => { + return Err(Error::io( + format!("Failed to deserialize metadata: {}", e), + location!(), + )); + } + } + } + + let properties = if merged.is_empty() { + None + } else { + Some(merged) + }; + + // Parse manifest object + let object_type = ManifestNamespace::get_string_column(batch, "object_type")?; + let object_type = object_type.value(0).to_string(); + match object_type.as_str() { + "namespace" => { + let object_id_array = ManifestNamespace::get_string_column(batch, "object_id")?; + let (namespace, name) = ManifestNamespace::parse_object_id(object_id_array.value(0)); + Ok(ManifestObject::Namespace(NamespaceInfo { + namespace, + name, + metadata: properties, + })) + } + "table" => { + let object_id_array = ManifestNamespace::get_string_column(batch, "object_id")?; + let location_array = ManifestNamespace::get_string_column(batch, "location")?; + let location = location_array.value(0).to_string(); + let (namespace, name) = ManifestNamespace::parse_object_id(object_id_array.value(0)); + Ok(ManifestObject::Table(TableInfo { + namespace, + name, + location, + properties, + })) + } + t => Err(Error::Internal { + message: format!("Unknown object type {}", t), + location: location!(), + }), + } } #[async_trait] @@ -1114,6 +1293,7 @@ impl LanceNamespace for ManifestNamespace { location: Some(table_uri.clone()), table_uri: Some(table_uri), storage_options, + properties: info.properties, ..Default::default() }); } @@ -1139,6 +1319,7 @@ table_uri: Some(table_uri), schema: Some(Box::new(json_schema)), storage_options, + properties: info.properties, ..Default::default() }) } @@ -1150,6 +1331,7 @@ location: Some(table_uri.clone()), table_uri: Some(table_uri), storage_options, + properties: info.properties, ..Default::default() }) } @@ -1208,6 +1390,15
@@ impl LanceNamespace for ManifestNamespace { let (namespace, table_name) = Self::split_object_id(table_id); let object_id = Self::build_object_id(&namespace, &table_name); + // Serialize properties and compute extended batch if provided + let metadata = Self::build_metadata_json(&request.properties); + + let extended_batch = if let Some(props) = &request.properties { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? + } else { + None + }; + // Check if table already exists in manifest if self.manifest_contains_object(&object_id).await? { return Err(Error::io( @@ -1270,13 +1461,21 @@ impl LanceNamespace for ManifestNamespace { })?; // Register in manifest (store dir_name, not full URI) - self.insert_into_manifest(object_id, ObjectType::Table, Some(dir_name)) - .await?; + self.insert_into_manifest_with_metadata( + object_id, + ObjectType::Table, + Some(dir_name), + metadata, + None, + extended_batch, + ) + .await?; Ok(CreateTableResponse { version: Some(1), location: Some(table_uri), storage_options: self.storage_options.clone(), + properties: request.properties.clone(), ..Default::default() }) } @@ -1448,14 +1647,14 @@ impl LanceNamespace for ManifestNamespace { }); } - // Serialize properties if provided - let metadata = request.properties.as_ref().and_then(|props| { - if props.is_empty() { - None - } else { - Some(serde_json::to_string(props).ok()?) - } - }); + // Serialize properties and compute extended batch if provided + let metadata = Self::build_metadata_json(&request.properties); + + let extended_batch = if let Some(props) = &request.properties { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? + } else { + None + }; self.insert_into_manifest_with_metadata( object_id, @@ -1463,6 +1662,7 @@ impl LanceNamespace for ManifestNamespace { None, metadata, None, + extended_batch, ) .await?; @@ -1574,6 +1774,15 @@ impl LanceNamespace for ManifestNamespace { let (namespace, table_name) = Self::split_object_id(table_id); let object_id = Self::build_object_id(&namespace, &table_name); + // Serialize properties and compute extended batch if provided + let metadata = Self::build_metadata_json(&request.properties); + + let extended_batch = if let Some(props) = &request.properties { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? + } else { + None + }; + // Check if table already exists in manifest let existing = self.query_manifest_for_table(&object_id).await?; if existing.is_some() { @@ -1637,8 +1846,15 @@ impl LanceNamespace for ManifestNamespace { })?; // Add entry to manifest marking this as an empty table (store dir_name, not full path) - self.insert_into_manifest(object_id, ObjectType::Table, Some(dir_name)) - .await?; + self.insert_into_manifest_with_metadata( + object_id, + ObjectType::Table, + Some(dir_name), + metadata, + None, + extended_batch, + ) + .await?; log::info!( "Created empty table '{}' in manifest at {}", @@ -1657,11 +1873,21 @@ impl LanceNamespace for ManifestNamespace { Ok(CreateEmptyTableResponse { location: Some(table_uri), storage_options, + properties: request.properties, ..Default::default() }) } async fn declare_table(&self, request: DeclareTableRequest) -> Result { + let extended_batch = if let Some(props) = &request.properties { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? 
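+ // Fails if any `lance.manifest.extended.*` key lacks a matching manifest column; see batch_from_extended_props.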
+ } else { + None + }; + self.declare_table_extended(request, extended_batch).await + } + + async fn register_table(&self, request: RegisterTableRequest) -> Result { let table_id = request.id.as_ref().ok_or_else(|| Error::InvalidInput { source: "Table ID is required".into(), location: location!(), @@ -1674,166 +1900,80 @@ impl LanceNamespace for ManifestNamespace { }); } + let location = request.location.clone(); + + // Validate that location is a relative path within the root directory + // We don't allow absolute URIs or paths that escape the root + if location.contains("://") { + return Err(Error::InvalidInput { + source: format!( + "Absolute URIs are not allowed for register_table. Location must be a relative path within the root directory: {}", + location + ).into(), + location: location!(), + }); + } + + if location.starts_with('/') { + return Err(Error::InvalidInput { + source: format!( + "Absolute paths are not allowed for register_table. Location must be a relative path within the root directory: {}", + location + ).into(), + location: location!(), + }); + } + + // Check for path traversal attempts + if location.contains("..") { + return Err(Error::InvalidInput { + source: format!( + "Path traversal is not allowed. Location must be a relative path within the root directory: {}", + location + ).into(), + location: location!(), + }); + } + let (namespace, table_name) = Self::split_object_id(table_id); let object_id = Self::build_object_id(&namespace, &table_name); - // Check if table already exists in manifest - let existing = self.query_manifest_for_table(&object_id).await?; - if existing.is_some() { + // Validate that parent namespaces exist (if not root) + if !namespace.is_empty() { + self.validate_namespace_levels_exist(&namespace).await?; + } + + // Check if table already exists + if self.manifest_contains_object(&object_id).await? { return Err(Error::Namespace { - source: format!("Table '{}' already exists", table_name).into(), + source: format!("Table '{}' already exists", object_id).into(), location: location!(), }); } - // Create table location path with hash-based naming - // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance - // Otherwise, use hash-based naming: {hash}_{object_id} - let dir_name = if namespace.is_empty() && self.dir_listing_enabled { - // Root table with directory listing enabled: use {table_name}.lance - format!("{}.lance", table_name) + // Serialize properties and compute extended batch if provided + let metadata = Self::build_metadata_json(&request.properties); + + let extended_batch = if let Some(props) = &request.properties { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? 
} else { - // Child namespace table or dir listing disabled: use hash-based naming - Self::generate_dir_name(&object_id) + None }; - let table_path = self.base_path.child(dir_name.as_str()); - let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; - - // Validate location if provided - if let Some(req_location) = &request.location { - let req_location = req_location.trim_end_matches('/'); - if req_location != table_uri { - return Err(Error::Namespace { - source: format!( - "Cannot declare table {} at location {}, must be at location {}", - table_name, req_location, table_uri - ) - .into(), - location: location!(), - }); - } - } - - // Create the .lance-reserved file to mark the table as existing - let reserved_file_path = table_path.child(".lance-reserved"); - - self.object_store - .create(&reserved_file_path) - .await - .map_err(|e| Error::Namespace { - source: format!( - "Failed to create .lance-reserved file for table {}: {}", - table_name, e - ) - .into(), - location: location!(), - })? - .shutdown() - .await - .map_err(|e| Error::Namespace { - source: format!( - "Failed to finalize .lance-reserved file for table {}: {}", - table_name, e - ) - .into(), - location: location!(), - })?; - - // Add entry to manifest marking this as a declared table (store dir_name, not full path) - self.insert_into_manifest(object_id, ObjectType::Table, Some(dir_name)) - .await?; - - log::info!( - "Declared table '{}' in manifest at {}", - table_name, - table_uri - ); - - // For backwards compatibility, only skip vending credentials when explicitly set to false - let vend_credentials = request.vend_credentials.unwrap_or(true); - let storage_options = if vend_credentials { - self.storage_options.clone() - } else { - None - }; - - Ok(DeclareTableResponse { - location: Some(table_uri), - storage_options, - ..Default::default() - }) - } - - async fn register_table(&self, request: RegisterTableRequest) -> Result { - let table_id = request.id.as_ref().ok_or_else(|| Error::InvalidInput { - source: "Table ID is required".into(), - location: location!(), - })?; - - if table_id.is_empty() { - return Err(Error::InvalidInput { - source: "Table ID cannot be empty".into(), - location: location!(), - }); - } - - let location = request.location.clone(); - - // Validate that location is a relative path within the root directory - // We don't allow absolute URIs or paths that escape the root - if location.contains("://") { - return Err(Error::InvalidInput { - source: format!( - "Absolute URIs are not allowed for register_table. Location must be a relative path within the root directory: {}", - location - ).into(), - location: location!(), - }); - } - - if location.starts_with('/') { - return Err(Error::InvalidInput { - source: format!( - "Absolute paths are not allowed for register_table. Location must be a relative path within the root directory: {}", - location - ).into(), - location: location!(), - }); - } - - // Check for path traversal attempts - if location.contains("..") { - return Err(Error::InvalidInput { - source: format!( - "Path traversal is not allowed. 
Location must be a relative path within the root directory: {}", - location - ).into(), - location: location!(), - }); - } - - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); - - // Validate that parent namespaces exist (if not root) - if !namespace.is_empty() { - self.validate_namespace_levels_exist(&namespace).await?; - } - - // Check if table already exists - if self.manifest_contains_object(&object_id).await? { - return Err(Error::Namespace { - source: format!("Table '{}' already exists", object_id).into(), - location: location!(), - }); - } // Register the table with its location in the manifest - self.insert_into_manifest(object_id, ObjectType::Table, Some(location.clone())) - .await?; + self.insert_into_manifest_with_metadata( + object_id, + ObjectType::Table, + Some(location.clone()), + metadata, + None, + extended_batch, + ) + .await?; Ok(RegisterTableResponse { location: Some(location), + properties: request.properties.clone(), ..Default::default() }) } @@ -1882,17 +2022,274 @@ impl LanceNamespace for ManifestNamespace { } } +/// Parse the first row of a RecordBatch into a HashMap of extended properties, excluding the basic manifest columns. +fn batch_to_extended_props(batch: &RecordBatch) -> HashMap<String, String> { + // Collect basic columns to exclude + let basic_schema = ManifestNamespace::basic_manifest_schema(); + let mut excluded: Vec<&str> = vec![]; + for field in basic_schema.fields.iter() { + excluded.push(field.name()); + } + + // Transform batch to properties + let mut result = HashMap::new(); + + if batch.num_rows() == 0 { + return result; + } + + for (i, field) in batch.schema().fields().iter().enumerate() { + let col_name = field.name().to_string(); + if excluded.contains(&col_name.as_str()) { + continue; + } + + let array = batch.column(i); + + if array.is_null(0) { + // skip null properties. + continue; + } + + let Ok(scalar) = ScalarValue::try_from_array(array.as_ref(), 0) else { + continue; + }; + + let Ok(value_str) = scalar_to_str(&scalar) else { + continue; + }; + + if let Some(value) = value_str { + if !value.is_empty() { + result.insert(format!("{}{}", EXTENDED_PREFIX, col_name), value); + } + } + } + + result +} + +/// Convert a HashMap of extended properties into a RecordBatch, excluding the basic manifest columns. +fn batch_from_extended_props( + map: &HashMap<String, String>, + schema: &Schema, +) -> Result<Option<RecordBatch>> { + // Collect basic columns to exclude + let basic_schema = ManifestNamespace::basic_manifest_schema(); + let mut excluded: Vec<&str> = vec![]; + for field in basic_schema.fields.iter() { + excluded.push(field.name()); + } + + fn is_nullish_extended_value(v: &str) -> bool { + v.is_empty() || v.eq_ignore_ascii_case("null") + } + + // All non-null extended properties must be covered by the schema.
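+ // For example, a request carrying "lance.manifest.extended.user_id" is rejected unless the "user_id" column was first created via add_extended_properties().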
+ for (k, v) in map.iter() { + if is_nullish_extended_value(v) { + continue; + } + if let Some(col_name) = k.strip_prefix(EXTENDED_PREFIX) { + if !excluded.contains(&col_name) && schema.column_with_name(col_name).is_none() { + return Err(Error::InvalidInput { + source: format!("Column {} does not exist in extended properties", col_name) + .into(), + location: location!(), + }); + } + } + } + + // Construct record batch + let mut array: Vec<ArrayRef> = vec![]; + let mut fields: Vec<FieldRef> = vec![]; + for field in schema + .fields() + .iter() + .filter(|field| !excluded.contains(&field.name().as_str())) + { + let field_name = field.name().as_str(); + + match map.get(&format!("{}{}", EXTENDED_PREFIX, field_name)) { + Some(value) if !is_nullish_extended_value(value) => { + let scalar = scalar_from_str(field.data_type(), value)?; + let v = scalar.to_array().map_err(|e| Error::IO { + source: box_error(std::io::Error::other(format!( + "Failed to convert scalar for column '{}' to array: {}", + field_name, e + ))), + location: location!(), + })?; + array.push(v); + fields.push(field.clone()); + } + _ => {} + } + } + + if fields.is_empty() { + return Ok(None); + } + + let schema = Schema::new(fields); + Ok(Some(RecordBatch::try_new(Arc::new(schema), array)?)) +} + +pub(crate) fn scalar_to_str(scalar: &ScalarValue) -> Result<Option<String>> { + if scalar.is_null() { + return Ok(None); + } + + match scalar { + ScalarValue::Utf8(Some(v)) + | ScalarValue::Utf8View(Some(v)) + | ScalarValue::LargeUtf8(Some(v)) => Ok(Some(v.clone())), + ScalarValue::Boolean(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Int32(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Int64(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::UInt32(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::UInt64(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Float32(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Float64(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Date32(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Date64(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Binary(Some(v)) + | ScalarValue::LargeBinary(Some(v)) + | ScalarValue::BinaryView(Some(v)) + | ScalarValue::FixedSizeBinary(_, Some(v)) => Ok(Some(bytes_to_hex(v))), + _ => Err(Error::InvalidInput { + source: format!("Unsupported extended scalar: {:?}", scalar).into(), + location: location!(), + }), + } +} + +pub(crate) fn scalar_from_str(dt: &DataType, value: &str) -> Result<ScalarValue> { + match dt { + DataType::Utf8 => Ok(ScalarValue::Utf8(Some(value.to_string()))), + DataType::LargeUtf8 => Ok(ScalarValue::LargeUtf8(Some(value.to_string()))), + DataType::Boolean => Ok(ScalarValue::Boolean(Some(bool::from_str(value).map_err( + |e| Error::InvalidInput { + source: format!("Invalid boolean '{}': {}", value, e).into(), + location: location!(), + }, + )?))), + DataType::Int32 => Ok(ScalarValue::Int32(Some(i32::from_str(value).map_err( + |e| Error::InvalidInput { + source: format!("Invalid int32 '{}': {}", value, e).into(), + location: location!(), + }, + )?))), + DataType::Int64 => Ok(ScalarValue::Int64(Some(i64::from_str(value).map_err( + |e| Error::InvalidInput { + source: format!("Invalid int64 '{}': {}", value, e).into(), + location: location!(), + }, + )?))), + DataType::UInt32 => Ok(ScalarValue::UInt32(Some(u32::from_str(value).map_err( + |e| Error::InvalidInput { + source: format!("Invalid uint32 '{}': {}", value, e).into(), + location: location!(), + }, + )?))), + DataType::UInt64 => Ok(ScalarValue::UInt64(Some(u64::from_str(value).map_err( + |e| Error::InvalidInput { + source:
format!("Invalid uint64 '{}': {}", value, e).into(), + location: location!(), + }, + )?))), + DataType::Float32 => Ok(ScalarValue::Float32(Some(f32::from_str(value).map_err( + |e| Error::InvalidInput { + source: format!("Invalid float32 '{}': {}", value, e).into(), + location: location!(), + }, + )?))), + DataType::Float64 => Ok(ScalarValue::Float64(Some(f64::from_str(value).map_err( + |e| Error::InvalidInput { + source: format!("Invalid float64 '{}': {}", value, e).into(), + location: location!(), + }, + )?))), + DataType::Date32 => Ok(ScalarValue::Date32(Some(i32::from_str(value).map_err( + |e| Error::InvalidInput { + source: format!("Invalid date32 '{}': {}", value, e).into(), + location: location!(), + }, + )?))), + DataType::Date64 => Ok(ScalarValue::Date64(Some(i64::from_str(value).map_err( + |e| Error::InvalidInput { + source: format!("Invalid date64 '{}': {}", value, e).into(), + location: location!(), + }, + )?))), + DataType::Binary => Ok(ScalarValue::Binary(Some(hex_to_bytes(value)?))), + DataType::LargeBinary => Ok(ScalarValue::LargeBinary(Some(hex_to_bytes(value)?))), + _ => Err(Error::InvalidInput { + source: format!("Unsupported extended column type: {:?}", dt).into(), + location: location!(), + }), + } +} + +fn bytes_to_hex(bytes: &[u8]) -> String { + let mut out = String::with_capacity(bytes.len() * 2); + for b in bytes { + use std::fmt::Write; + let _ = write!(&mut out, "{:02x}", b); + } + out +} + +fn hex_to_bytes(s: &str) -> Result> { + let s = s.strip_prefix("0x").unwrap_or(s); + if !s.len().is_multiple_of(2) { + return Err(Error::InvalidInput { + source: format!("Invalid hex string length {}", s.len()).into(), + location: location!(), + }); + } + + let mut out = Vec::with_capacity(s.len() / 2); + let bytes = s.as_bytes(); + for i in (0..bytes.len()).step_by(2) { + let hex = std::str::from_utf8(&bytes[i..i + 2]).map_err(|e| Error::InvalidInput { + source: format!("Invalid hex string encoding: {}", e).into(), + location: location!(), + })?; + let v = u8::from_str_radix(hex, 16).map_err(|e| Error::InvalidInput { + source: format!("Invalid hex byte '{}': {}", hex, e).into(), + location: location!(), + })?; + out.push(v); + } + Ok(out) +} + #[cfg(test)] mod tests { + use crate::dir::manifest::batch_to_extended_props; + use crate::dir::manifest_ext::{CreateMultiNamespacesRequestBuilder, ManifestNamespaceExt}; use crate::{DirectoryNamespaceBuilder, ManifestNamespace}; + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{Field, Schema}; + use arrow::record_batch::RecordBatch; + use arrow_schema::DataType; use bytes::Bytes; use lance_core::utils::tempfile::TempStdDir; + use lance_io::object_store::ObjectStore; use lance_namespace::models::{ - CreateTableRequest, DescribeTableRequest, DropTableRequest, ListTablesRequest, + CreateEmptyTableRequest, CreateNamespaceRequest, CreateTableRequest, + DescribeNamespaceRequest, DescribeTableRequest, DropNamespaceRequest, DropTableRequest, + ListNamespacesRequest, ListTablesRequest, NamespaceExistsRequest, RegisterTableRequest, TableExistsRequest, }; use lance_namespace::LanceNamespace; + use lance_namespace_reqwest_client::models::CreateNamespaceRequest as ClientCreateNamespaceRequest; + use lance_namespace_reqwest_client::models::DeclareTableRequest; use rstest::rstest; + use std::collections::HashMap; + use std::sync::Arc; fn create_test_ipc_data() -> Vec { use arrow::array::{Int32Array, StringArray}; @@ -2279,10 +2676,6 @@ mod tests { #[case::without_optimization(false)] #[tokio::test] async fn 
test_create_child_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{ - CreateNamespaceRequest, ListNamespacesRequest, NamespaceExistsRequest, - }; - let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2329,10 +2722,6 @@ mod tests { #[case::without_optimization(false)] #[tokio::test] async fn test_create_nested_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{ - CreateNamespaceRequest, ListNamespacesRequest, NamespaceExistsRequest, - }; - let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2384,8 +2773,6 @@ mod tests { #[case::without_optimization(false)] #[tokio::test] async fn test_create_namespace_without_parent_fails(#[case] inline_optimization: bool) { - use lance_namespace::models::CreateNamespaceRequest; - let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2407,10 +2794,6 @@ mod tests { #[case::without_optimization(false)] #[tokio::test] async fn test_drop_child_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{ - CreateNamespaceRequest, DropNamespaceRequest, NamespaceExistsRequest, - }; - let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2449,8 +2832,6 @@ mod tests { #[case::without_optimization(false)] #[tokio::test] async fn test_drop_namespace_with_children_fails(#[case] inline_optimization: bool) { - use lance_namespace::models::{CreateNamespaceRequest, DropNamespaceRequest}; - let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2481,10 +2862,6 @@ mod tests { #[case::without_optimization(false)] #[tokio::test] async fn test_create_table_in_child_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{ - CreateNamespaceRequest, CreateTableRequest, ListTablesRequest, - }; - let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2531,8 +2908,6 @@ mod tests { #[case::without_optimization(false)] #[tokio::test] async fn test_describe_child_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{CreateNamespaceRequest, DescribeNamespaceRequest}; - let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2570,6 +2945,870 @@ mod tests { ); } + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_add_extended_properties_creates_columns_and_idempotent( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + let schema = manifest_ns.full_manifest_schema().await.unwrap(); + assert_eq!( + ManifestNamespace::basic_manifest_schema().fields().len(), + schema.fields().len() + ); + + // Adding extended properties should create new columns + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) + .await + .unwrap(); + + let schema = manifest_ns.full_manifest_schema().await.unwrap(); + let user_field = schema.field_with_name("user_id").unwrap(); + assert_eq!(user_field.data_type(), &DataType::Utf8); + let score_field = schema.field_with_name("score").unwrap(); + assert_eq!(score_field.data_type(), &DataType::Int32); + let initial_field_count = schema.fields().len(); + + // Adding the same properties again 
should be a no-op + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) + .await + .unwrap(); + let schema_after = manifest_ns.full_manifest_schema().await.unwrap(); + assert_eq!(schema_after.fields().len(), initial_field_count); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_add_extended_properties_rejects_missing_prefix( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + let result = manifest_ns + .add_extended_properties(&vec![("invalid_key", DataType::Utf8)]) + .await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("must start with prefix")); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_remove_extended_properties_drops_specified_columns( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let mut manifest_ns = + create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.group", DataType::Utf8), + ]) + .await + .unwrap(); + let schema = manifest_ns.full_manifest_schema().await.unwrap(); + assert!(schema.field_with_name("user_id").is_ok()); + assert!(schema.field_with_name("group").is_ok()); + + manifest_ns + .remove_extended_properties(&vec!["lance.manifest.extended.user_id"]) + .await + .unwrap(); + let schema_after = manifest_ns.full_manifest_schema().await.unwrap(); + assert!(schema_after.field_with_name("user_id").is_err()); + assert!(schema_after.field_with_name("group").is_ok()); + + // Remove non-existent property should be a no-op + manifest_ns + .remove_extended_properties(&vec!["lance.manifest.extended.user_id"]) + .await + .unwrap(); + let schema_after = manifest_ns.full_manifest_schema().await.unwrap(); + assert!(schema_after.field_with_name("user_id").is_err()); + assert!(schema_after.field_with_name("group").is_ok()); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_remove_extended_properties_rejects_missing_prefix( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let mut manifest_ns = + create_manifest_namespace_for_test(temp_path, inline_optimization).await; + let result = manifest_ns + .remove_extended_properties(&vec!["user_id"]) + .await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("must start with prefix")); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_namespace_with_extended_properties_without_columns_fails( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + let mut properties = std::collections::HashMap::new(); + properties.insert( + 
"lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); + + let mut create_req = CreateNamespaceRequest::new(); + create_req.id = Some(vec!["ns1".to_string()]); + create_req.properties = Some(properties); + + let result = namespace.create_namespace(create_req).await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Column user_id does not exist in extended properties")); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_namespace_with_extended_properties_succeeds_and_describe_unified( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![("lance.manifest.extended.user_id", DataType::Utf8)]) + .await + .unwrap(); + + let mut properties = std::collections::HashMap::new(); + properties.insert("owner".to_string(), "alice".to_string()); + properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); + let mut create_req = CreateNamespaceRequest::new(); + create_req.id = Some(vec!["ns1".to_string()]); + create_req.properties = Some(properties); + manifest_ns.create_namespace(create_req).await.unwrap(); + + let describe_req = DescribeNamespaceRequest { + id: Some(vec!["ns1".to_string()]), + ..Default::default() + }; + let response = manifest_ns.describe_namespace(describe_req).await.unwrap(); + let props = response.properties.expect("properties should be present"); + assert_eq!(props.get("owner"), Some(&"alice".to_string())); + assert_eq!( + props.get("lance.manifest.extended.user_id"), + Some(&"123".to_string()) + ); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_extended_properties_null_and_empty_values_omitted( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.null_prop", DataType::Utf8), + ("lance.manifest.extended.empty_prop", DataType::Utf8), + ("lance.manifest.extended.non_existed", DataType::Utf8), + ("lance.manifest.extended.valid_prop", DataType::Utf8), + ]) + .await + .unwrap(); + + let mut properties = std::collections::HashMap::new(); + properties.insert("owner".to_string(), "alice".to_string()); + properties.insert( + "lance.manifest.extended.null_prop".to_string(), + "null".to_string(), + ); + properties.insert( + "lance.manifest.extended.empty_prop".to_string(), + "".to_string(), + ); + properties.insert( + "lance.manifest.extended.valid_prop".to_string(), + "42".to_string(), + ); + let mut create_req = CreateNamespaceRequest::new(); + create_req.id = Some(vec!["ns1".to_string()]); + create_req.properties = Some(properties); + manifest_ns.create_namespace(create_req).await.unwrap(); + + let describe_req = DescribeNamespaceRequest { + id: Some(vec!["ns1".to_string()]), + ..Default::default() + }; + let response = manifest_ns.describe_namespace(describe_req).await.unwrap(); + let props = response.properties.expect("properties should be present"); + + assert_eq!(props.get("owner"), Some(&"alice".to_string())); + assert_eq!( + props.get("lance.manifest.extended.valid_prop"), + 
Some(&"42".to_string()) + ); + assert!(!props.contains_key("lance.manifest.extended.null_prop")); + assert!(!props.contains_key("lance.manifest.extended.empty_prop")); + assert!(!props.contains_key("lance.manifest.extended.non_existed")); + } + + async fn create_manifest_namespace_for_test( + root: &str, + inline_optimization: bool, + ) -> ManifestNamespace { + let (object_store, base_path) = ObjectStore::from_uri(root).await.unwrap(); + ManifestNamespace::from_directory( + root.to_string(), + None, + None, + object_store, + base_path, + true, + inline_optimization, + ) + .await + .unwrap() + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_multi_namespaces_extended_creates_nested_namespaces( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + + let mut props_a = HashMap::new(); + props_a.insert("owner".to_string(), "alice".to_string()); + let mut req_a = ClientCreateNamespaceRequest::new(); + req_a.id = Some(vec!["a".to_string()]); + req_a.properties = Some(props_a); + + let mut props_ab = HashMap::new(); + props_ab.insert("owner".to_string(), "bob".to_string()); + let mut req_ab = ClientCreateNamespaceRequest::new(); + req_ab.id = Some(vec!["a".to_string(), "b".to_string()]); + req_ab.properties = Some(props_ab); + + let request = CreateMultiNamespacesRequestBuilder::new() + .namespaces(vec![req_a, req_ab]) + .build(); + manifest_ns + .create_multi_namespaces_extended(request, vec![]) + .await + .unwrap(); + + // Parent namespace "a" should exist even though it was created in the same batch. + for (object_id, expected_owner) in [("a".to_string(), "alice"), ("a$b".to_string(), "bob")] + { + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner + .filter(&format!( + "object_type = 'namespace' AND object_id = '{}'", + object_id + )) + .unwrap(); + scanner.project(&["metadata"]).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 1); + let batch = batches + .into_iter() + .find(|b| b.num_rows() > 0) + .expect("expected a non-empty batch"); + let metadata_array = ManifestNamespace::get_string_column(&batch, "metadata").unwrap(); + let metadata_str = metadata_array.value(0); + let metadata_map: HashMap = serde_json::from_str(metadata_str).unwrap(); + assert_eq!(metadata_map.get("owner"), Some(&expected_owner.to_string())); + } + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_multi_namespaces_extended_extended_record_overrides_properties( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![("lance.manifest.extended.user_id", DataType::Utf8)]) + .await + .unwrap(); + + let mut props_1 = HashMap::new(); + props_1.insert("owner".to_string(), "alice".to_string()); + props_1.insert( + "lance.manifest.extended.user_id".to_string(), + "111".to_string(), + ); + let mut req_1 = ClientCreateNamespaceRequest::new(); + req_1.id = Some(vec!["ns1".to_string()]); + req_1.properties = Some(props_1); + + let mut props_2 
= HashMap::new(); + props_2.insert("owner".to_string(), "bob".to_string()); + props_2.insert( + "lance.manifest.extended.user_id".to_string(), + "222".to_string(), + ); + let mut req_2 = ClientCreateNamespaceRequest::new(); + req_2.id = Some(vec!["ns2".to_string()]); + req_2.properties = Some(props_2); + + let ext_schema = Arc::new(Schema::new(vec![Field::new( + "user_id", + DataType::Utf8, + true, + )])); + let ext_batch_1 = RecordBatch::try_new( + ext_schema, + vec![Arc::new(StringArray::from(vec![Some("999")]))], + ) + .unwrap(); + + let request = CreateMultiNamespacesRequestBuilder::new() + .namespaces(vec![req_1, req_2]) + .build(); + manifest_ns + .create_multi_namespaces_extended(request, vec![Some(ext_batch_1), None]) + .await + .unwrap(); + + for (object_id, expected_user_id) in + [("ns1".to_string(), "999"), ("ns2".to_string(), "222")] + { + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner + .filter(&format!( + "object_type = 'namespace' AND object_id = '{}'", + object_id + )) + .unwrap(); + scanner.project(&["user_id"]).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 1); + let batch = batches + .into_iter() + .find(|b| b.num_rows() > 0) + .expect("expected a non-empty batch"); + let user_id_array = ManifestNamespace::get_string_column(&batch, "user_id").unwrap(); + assert_eq!(user_id_array.value(0), expected_user_id); + } + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_declare_table_extended_writes_extended_record(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) + .await + .unwrap(); + + let ext_schema = Arc::new(Schema::new(vec![ + Field::new("user_id", DataType::Utf8, true), + Field::new("score", DataType::Int32, true), + ])); + let ext_batch = RecordBatch::try_new( + ext_schema, + vec![ + Arc::new(StringArray::from(vec![Some("u1")])), + Arc::new(Int32Array::from(vec![Some(7)])), + ], + ) + .unwrap(); + + let mut props = HashMap::new(); + props.insert("owner".to_string(), "alice".to_string()); + + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + declare_req.properties = Some(props); + + let resp = manifest_ns + .declare_table_extended(declare_req, Some(ext_batch)) + .await + .unwrap(); + let resp_loc = resp.location.expect("response location should be present"); + assert!(resp_loc.ends_with("test_table.lance")); + + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner + .filter("object_type = 'table' AND object_id = 'test_table'") + .unwrap(); + scanner + .project(&["metadata", "user_id", "score", "location"]) + .unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 1); + let batch = batches + .into_iter() + .find(|b| b.num_rows() > 0) + .expect("expected a non-empty batch"); + + let metadata_array = ManifestNamespace::get_string_column(&batch, "metadata").unwrap(); + let metadata_str = metadata_array.value(0); + let 
metadata_map: HashMap = serde_json::from_str(metadata_str).unwrap(); + assert_eq!(metadata_map.get("owner"), Some(&"alice".to_string())); + + let user_id_array = ManifestNamespace::get_string_column(&batch, "user_id").unwrap(); + assert_eq!(user_id_array.value(0), "u1"); + let score_array = batch + .column_by_name("score") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(score_array.value(0), 7); + + let location_array = ManifestNamespace::get_string_column(&batch, "location").unwrap(); + assert!(location_array.value(0).ends_with("test_table.lance")); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_table_with_properties_persisted(#[case] inline_optimization: bool) { + let (_temp_dir, manifest_ns, properties) = + create_manifest_and_persist_properties(inline_optimization).await; + + let buffer = create_test_ipc_data(); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + create_req.properties = Some(properties); + + manifest_ns + .create_table(create_req, Bytes::from(buffer)) + .await + .unwrap(); + verify_persist_properties(&manifest_ns, "test_table").await; + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_declare_table_with_properties_persisted(#[case] inline_optimization: bool) { + let (_temp_dir, manifest_ns, properties) = + create_manifest_and_persist_properties(inline_optimization).await; + + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + declare_req.properties = Some(properties); + + manifest_ns.declare_table(declare_req).await.unwrap(); + verify_persist_properties(&manifest_ns, "test_table").await; + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_register_table_with_properties_persisted(#[case] inline_optimization: bool) { + let (_temp_dir, manifest_ns, properties) = + create_manifest_and_persist_properties(inline_optimization).await; + + let mut register_req = RegisterTableRequest::new("registered_table.lance".to_string()); + register_req.id = Some(vec!["registered_table".to_string()]); + register_req.properties = Some(properties); + + LanceNamespace::register_table(&manifest_ns, register_req) + .await + .unwrap(); + verify_persist_properties(&manifest_ns, "registered_table").await; + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_empty_table_persist_properties(#[case] inline_optimization: bool) { + let (_temp_dir, manifest_ns, properties) = + create_manifest_and_persist_properties(inline_optimization).await; + + let mut request = CreateEmptyTableRequest::new(); + request.id = Some(vec!["empty_table".to_string()]); + request.properties = Some(properties); + + #[allow(deprecated)] + manifest_ns.create_empty_table(request).await.unwrap(); + verify_persist_properties(&manifest_ns, "empty_table").await; + } + + async fn create_manifest_and_persist_properties( + inline_optimization: bool, + ) -> (TempStdDir, ManifestNamespace, HashMap) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + 
("lance.manifest.extended.score", DataType::Int32), + ]) + .await + .unwrap(); + + let properties = std::collections::HashMap::from([ + ("owner".to_string(), "alice".to_string()), + ( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ), + ( + "lance.manifest.extended.score".to_string(), + "42".to_string(), + ), + ]); + + (temp_dir, manifest_ns, properties) + } + + async fn verify_persist_properties(manifest_ns: &ManifestNamespace, table: &str) { + let object_id = ManifestNamespace::build_object_id(&[], table); + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + let filter = format!("object_id = '{}'", object_id); + scanner.filter(&filter).unwrap(); + scanner.project(&["metadata", "user_id", "score"]).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + assert_eq!(batch.num_rows(), 1); + + let metadata_array = ManifestNamespace::get_string_column(batch, "metadata").unwrap(); + let metadata_str = metadata_array.value(0); + let metadata_map: std::collections::HashMap = + serde_json::from_str(metadata_str).unwrap(); + assert_eq!(metadata_map.get("owner"), Some(&"alice".to_string())); + + let user_id_array = ManifestNamespace::get_string_column(batch, "user_id").unwrap(); + assert_eq!(user_id_array.value(0), "123"); + + let score_array = batch + .column_by_name("score") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(score_array.value(0), 42); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_table_with_extended_properties_without_columns_fails( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + + let mut properties = std::collections::HashMap::new(); + properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); + + let buffer = create_test_ipc_data(); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + create_req.properties = Some(properties); + + let result = manifest_ns + .create_table(create_req, Bytes::from(buffer)) + .await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Column user_id does not exist in extended properties")); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_declare_table_with_extended_properties_without_columns_fails( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + + let mut properties = std::collections::HashMap::new(); + properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); + + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + declare_req.properties = Some(properties); + + let result = manifest_ns.declare_table(declare_req).await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Column user_id does not exist in extended properties")); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + 
#[tokio::test] + async fn test_register_table_with_extended_properties_without_columns_fails( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + + let mut properties = std::collections::HashMap::new(); + properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); + + let mut register_req = RegisterTableRequest::new("registered_table.lance".to_string()); + register_req.id = Some(vec!["registered_table".to_string()]); + register_req.properties = Some(properties); + + let result = LanceNamespace::register_table(&manifest_ns, register_req).await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Column user_id does not exist in extended properties")); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_table_extended_properties_null_and_empty_values_omitted( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.null_prop", DataType::Utf8), + ("lance.manifest.extended.empty_prop", DataType::Utf8), + ("lance.manifest.extended.valid_prop", DataType::Utf8), + ("lance.manifest.extended.non_existed_prop", DataType::Utf8), + ]) + .await + .unwrap(); + + let mut properties = std::collections::HashMap::new(); + properties.insert("owner".to_string(), "alice".to_string()); + properties.insert( + "lance.manifest.extended.null_prop".to_string(), + "null".to_string(), + ); + properties.insert( + "lance.manifest.extended.empty_prop".to_string(), + "".to_string(), + ); + properties.insert( + "lance.manifest.extended.valid_prop".to_string(), + "42".to_string(), + ); + + let buffer = create_test_ipc_data(); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + create_req.properties = Some(properties); + manifest_ns + .create_table(create_req, Bytes::from(buffer)) + .await + .unwrap(); + + let object_id = ManifestNamespace::build_object_id(&[], "test_table"); + + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + let filter = format!("object_id = '{}'", object_id); + scanner.filter(&filter).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + assert_eq!(batch.num_rows(), 1); + + let extended_props = batch_to_extended_props(batch); + + assert_eq!( + extended_props.get("lance.manifest.extended.valid_prop"), + Some(&"42".to_string()) + ); + assert!(!extended_props.contains_key("lance.manifest.extended.null_prop")); + assert!(!extended_props.contains_key("lance.manifest.extended.empty_prop")); + assert!(!extended_props.contains_key("lance.manifest.extended.non_existed_prop")); + } + + #[tokio::test] + async fn test_describe_table_unifies_properties() { + let (_temp_dir, manifest_ns, base_properties) = prepare_properties_env().await; + + // create_table scenario + let buffer = create_test_ipc_data(); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["created_table".to_string()]); + create_req.properties = Some(base_properties.clone()); + manifest_ns + 
+            .create_table(create_req, Bytes::from(buffer))
+            .await
+            .unwrap();
+
+        verify_describe_table_props(&manifest_ns, "created_table", Some(true), &base_properties)
+            .await;
+        verify_describe_table_props(&manifest_ns, "created_table", Some(false), &base_properties)
+            .await;
+
+        // declare_table scenario
+        let mut declare_req = DeclareTableRequest::new();
+        declare_req.id = Some(vec!["declared_table".to_string()]);
+        declare_req.properties = Some(base_properties.clone());
+        manifest_ns.declare_table(declare_req).await.unwrap();
+
+        verify_describe_table_props(&manifest_ns, "declared_table", Some(true), &base_properties)
+            .await;
+        verify_describe_table_props(
+            &manifest_ns,
+            "declared_table",
+            Some(false),
+            &base_properties,
+        )
+        .await;
+
+        // register_table scenario
+        let mut register_req = RegisterTableRequest::new("registered_table.lance".to_string());
+        register_req.id = Some(vec!["registered_table".to_string()]);
+        register_req.properties = Some(base_properties.clone());
+        LanceNamespace::register_table(&manifest_ns, register_req)
+            .await
+            .unwrap();
+
+        verify_describe_table_props(
+            &manifest_ns,
+            "registered_table",
+            Some(true),
+            &base_properties,
+        )
+        .await;
+        verify_describe_table_props(
+            &manifest_ns,
+            "registered_table",
+            Some(false),
+            &base_properties,
+        )
+        .await;
+    }
+
+    async fn prepare_properties_env() -> (
+        TempStdDir,
+        ManifestNamespace,
+        std::collections::HashMap<String, String>,
+    ) {
+        let temp_dir = TempStdDir::default();
+        let temp_path = temp_dir.to_str().unwrap();
+        let manifest_ns = create_manifest_namespace_for_test(temp_path, true).await;
+        manifest_ns
+            .add_extended_properties(&vec![
+                ("lance.manifest.extended.user_id", DataType::Utf8),
+                ("lance.manifest.extended.score", DataType::Int32),
+            ])
+            .await
+            .unwrap();
+
+        // prepare base properties
+        let mut base_properties = std::collections::HashMap::new();
+        base_properties.insert("owner".to_string(), "alice".to_string());
+        base_properties.insert(
+            "lance.manifest.extended.user_id".to_string(),
+            "123".to_string(),
+        );
+        base_properties.insert(
+            "lance.manifest.extended.score".to_string(),
+            "42".to_string(),
+        );
+
+        (temp_dir, manifest_ns, base_properties)
+    }
+
+    async fn verify_describe_table_props(
+        manifest_ns: &ManifestNamespace,
+        table_name: &str,
+        load_detailed_metadata: Option<bool>,
+        base_properties: &std::collections::HashMap<String, String>,
+    ) {
+        let req = DescribeTableRequest {
+            id: Some(vec![table_name.to_string()]),
+            load_detailed_metadata,
+            ..Default::default()
+        };
+        let response = manifest_ns.describe_table(req).await.unwrap();
+        let props = response.properties.expect("properties should be present");
+        for (k, v) in base_properties.iter() {
+            assert_eq!(props.get(k), Some(v));
+        }
+    }
+
     #[test]
     fn test_construct_full_uri_with_cloud_urls() {
         // Test S3-style URL with nested path (no trailing slash)
diff --git a/rust/lance-namespace-impls/src/dir/manifest_ext.rs b/rust/lance-namespace-impls/src/dir/manifest_ext.rs
new file mode 100644
index 00000000000..cfb1d8456bc
--- /dev/null
+++ b/rust/lance-namespace-impls/src/dir/manifest_ext.rs
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use crate::dir::manifest::ObjectType;
+use crate::ManifestNamespace;
+use arrow::datatypes::Schema as ArrowSchema;
+use async_trait::async_trait;
+use lance::deps::arrow_array::{
+    new_null_array, ArrayRef, RecordBatch, RecordBatchIterator, StringArray,
+};
+use lance::deps::datafusion::common::ScalarValue;
+use lance_core::{box_error, Error};
+use lance_namespace_reqwest_client::models::{
+    CreateNamespaceRequest, DeclareTableRequest, DeclareTableResponse,
+};
+use snafu::location;
+use std::collections::HashMap;
+use std::collections::HashSet;
+use std::sync::Arc;
+
+/// Request for creating multiple namespaces with a single merge insert.
+#[derive(Debug, Clone)]
+pub struct CreateMultiNamespacesRequest {
+    pub namespaces: Vec<CreateNamespaceRequest>,
+    /// Columns used for merge insert deduplication.
+    pub on: Vec<String>,
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct CreateMultiNamespacesRequestBuilder {
+    namespaces: Vec<CreateNamespaceRequest>,
+    on: Option<Vec<String>>,
+}
+
+impl CreateMultiNamespacesRequestBuilder {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn namespaces(mut self, namespaces: Vec<CreateNamespaceRequest>) -> Self {
+        self.namespaces = namespaces;
+        self
+    }
+
+    pub fn push_namespace(mut self, namespace: CreateNamespaceRequest) -> Self {
+        self.namespaces.push(namespace);
+        self
+    }
+
+    pub fn on(mut self, on: Vec<String>) -> Self {
+        self.on = Some(on);
+        self
+    }
+
+    pub fn build(self) -> CreateMultiNamespacesRequest {
+        let on = self.on.unwrap_or_else(|| vec!["object_id".to_string()]);
+        CreateMultiNamespacesRequest {
+            namespaces: self.namespaces,
+            on,
+        }
+    }
+}
+
+#[async_trait]
+pub trait ManifestNamespaceExt {
+    /// Create multiple namespaces atomically with extended properties.
+    async fn create_multi_namespaces_extended(
+        &self,
+        request: CreateMultiNamespacesRequest,
+        extended_records: Vec<Option<RecordBatch>>,
+    ) -> lance_core::Result<()>;
+
+    /// Declare a table with extended properties (metadata only operation).
+    async fn declare_table_extended(
+        &self,
+        request: DeclareTableRequest,
+        extended_record: Option<RecordBatch>,
+    ) -> lance_core::Result<DeclareTableResponse>;
+}
+
+#[async_trait]
+impl ManifestNamespaceExt for ManifestNamespace {
+    async fn create_multi_namespaces_extended(
+        &self,
+        request: CreateMultiNamespacesRequest,
+        extended_records: Vec<Option<RecordBatch>>,
+    ) -> lance_core::Result<()> {
+        if request.namespaces.is_empty() {
+            return Ok(());
+        }
+
+        if !extended_records.is_empty() && extended_records.len() != request.namespaces.len() {
+            return Err(Error::InvalidInput {
+                source: format!(
+                    "extended_records length {} must match namespaces length {}",
+                    extended_records.len(),
+                    request.namespaces.len()
+                )
+                .into(),
+                location: location!(),
+            });
+        }
+
+        let mut object_id_vec: Vec<String> = Vec::with_capacity(request.namespaces.len());
+        let mut metadata_vec: Vec<Option<String>> = Vec::with_capacity(request.namespaces.len());
+        let mut props_vec: Vec<Option<HashMap<String, String>>> =
+            Vec::with_capacity(request.namespaces.len());
+
+        // Allow creating nested namespaces in the same batch: parent namespaces that are
+        // also part of this request are treated as existing for validation.
+        let mut to_create: HashSet<String> = HashSet::with_capacity(request.namespaces.len());
+        for ns_req in request.namespaces.iter() {
+            if let Some(id) = ns_req.id.as_ref() {
+                if !id.is_empty() {
+                    to_create.insert(id.join(crate::dir::manifest::DELIMITER));
+                }
+            }
+        }
+
+        for ns_req in request.namespaces.iter() {
+            let namespace_id = ns_req.id.as_ref().ok_or_else(|| Error::InvalidInput {
+                source: "Namespace ID is required".into(),
+                location: location!(),
+            })?;
+
+            if namespace_id.is_empty() {
+                return Err(Error::Namespace {
+                    source: "Root namespace already exists and cannot be created".into(),
+                    location: location!(),
+                });
+            }
+
+            if namespace_id.len() > 1 {
+                // Validate parent namespaces that are NOT included in this batch.
+                // If a parent is included in this batch, it will be inserted together.
+                for i in 1..namespace_id.len() {
+                    let parent = &namespace_id[..i];
+                    let parent_object_id = parent.join(crate::dir::manifest::DELIMITER);
+                    if !to_create.contains(&parent_object_id) {
+                        self.validate_namespace_levels_exist(parent).await?;
+                    }
+                }
+            }
+
+            object_id_vec.push(namespace_id.join(crate::dir::manifest::DELIMITER));
+            metadata_vec.push(Self::build_metadata_json(&ns_req.properties));
+            props_vec.push(ns_req.properties.clone());
+        }
+
+        let dataset_guard = self.manifest_dataset.get().await?;
+        let full_schema = Arc::new(ArrowSchema::from(dataset_guard.schema()));
+        drop(dataset_guard);
+
+        let n = object_id_vec.len();
+        let mut columns: Vec<ArrayRef> = Vec::with_capacity(full_schema.fields().len());
+
+        // Ensure each extended record has exactly 1 row if provided.
+        if !extended_records.is_empty() {
+            for (idx, r) in extended_records.iter().enumerate() {
+                if let Some(r) = r {
+                    if r.num_rows() != 1 {
+                        return Err(Error::InvalidInput {
+                            source: format!(
+                                "extended_records[{}] must have exactly 1 row, got {}",
+                                idx,
+                                r.num_rows()
+                            )
+                            .into(),
+                            location: location!(),
+                        });
+                    }
+                }
+            }
+        }
+
+        for f in full_schema.fields() {
+            let name = f.name().as_str();
+            match name {
+                "object_id" => columns.push(Arc::new(StringArray::from(object_id_vec.clone()))),
+                "object_type" => columns.push(Arc::new(StringArray::from(vec!["namespace"; n]))),
+                "location" => {
+                    columns.push(Arc::new(StringArray::from(vec![None::<String>; n])))
+                }
+                "metadata" => columns.push(Arc::new(StringArray::from(metadata_vec.clone()))),
+                "base_objects" => columns.push(new_null_array(f.data_type(), n)),
+                _ => {
+                    let key = format!("{}{}", crate::dir::manifest::EXTENDED_PREFIX, name);
+                    let null_scalar =
+                        ScalarValue::try_from(f.data_type()).map_err(|e| Error::IO {
+                            source: box_error(std::io::Error::other(format!(
+                                "Failed to create null scalar for column {}: {}",
+                                name, e
+                            ))),
+                            location: location!(),
+                        })?;
+
+                    let mut scalars: Vec<ScalarValue> = Vec::with_capacity(n);
+                    for i in 0..n {
+                        // Prefer the value from extended_records if present.
+                        let mut from_extended: Option<ScalarValue> = None;
+                        if !extended_records.is_empty() {
+                            if let Some(record) = extended_records[i].as_ref() {
+                                if let Some((col_idx, field)) =
+                                    record.schema().column_with_name(name)
+                                {
+                                    if field.data_type() != f.data_type() {
+                                        return Err(Error::InvalidInput {
+                                            source: format!(
+                                                "extended_records[{}] column '{}' has type {:?}, expected {:?}",
+                                                i,
+                                                name,
+                                                field.data_type(),
+                                                f.data_type()
+                                            )
+                                            .into(),
+                                            location: location!(),
+                                        });
+                                    }
+                                    let col = record.column(col_idx);
+                                    let scalar = ScalarValue::try_from_array(col.as_ref(), 0)
+                                        .map_err(|e| Error::Internal {
+                                            message: format!(
+                                                "Failed to convert extended_records[{}] column '{}' to scalar: {}",
+                                                i, name, e
+                                            ),
+                                            location: location!(),
+                                        })?;
+                                    from_extended = Some(scalar);
+                                }
+                            }
+                        }
+
+                        if let Some(s) = from_extended {
+                            scalars.push(s);
+                            continue;
+                        }
+
+                        // Fall back to properties-based extended values.
+                        let v = props_vec[i].as_ref().and_then(|m| m.get(&key));
+                        match v {
+                            Some(s) if s != "null" && !s.is_empty() => {
+                                scalars
+                                    .push(crate::dir::manifest::scalar_from_str(f.data_type(), s)?);
+                            }
+                            _ => scalars.push(null_scalar.clone()),
+                        }
+                    }
+
+                    let array =
+                        ScalarValue::iter_to_array(scalars.into_iter()).map_err(|e| Error::IO {
+                            source: box_error(std::io::Error::other(format!(
+                                "Failed to build array for column {}: {}",
+                                name, e
+                            ))),
+                            location: location!(),
+                        })?;
+                    columns.push(array);
+                }
+            }
+        }
+
+        let batch = RecordBatch::try_new(full_schema.clone(), columns).map_err(|e| {
+            Error::io(
+                format!("Failed to create manifest batch: {}", e),
+                location!(),
+            )
+        })?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], full_schema);
+
+        let dataset_guard = self.manifest_dataset.get().await?;
+        let dataset_arc = Arc::new(dataset_guard.clone());
+        drop(dataset_guard);
+
+        let mut merge_builder =
+            lance::dataset::MergeInsertBuilder::try_new(dataset_arc, request.on).map_err(|e| {
+                Error::IO {
+                    source: box_error(std::io::Error::other(format!(
+                        "Failed to create merge builder: {}",
+                        e
+                    ))),
+                    location: location!(),
+                }
+            })?;
+        merge_builder.when_matched(lance::dataset::WhenMatched::DoNothing);
+        merge_builder.when_not_matched(lance::dataset::WhenNotMatched::InsertAll);
+
+        let (new_dataset_arc, _merge_stats) = merge_builder
+            .try_build()
+            .map_err(|e| Error::IO {
+                source: box_error(std::io::Error::other(format!(
+                    "Failed to build merge: {}",
+                    e
+                ))),
+                location: location!(),
+            })?
+            .execute_reader(Box::new(reader))
+            .await
+            .map_err(|e| Error::IO {
+                source: box_error(std::io::Error::other(format!(
+                    "Failed to execute merge: {}",
+                    e
+                ))),
+                location: location!(),
+            })?;
+
+        let new_dataset = Arc::try_unwrap(new_dataset_arc).unwrap_or_else(|arc| (*arc).clone());
+        self.manifest_dataset.set_latest(new_dataset).await;
+        if let Err(e) = self.run_inline_optimization().await {
+            log::warn!(
+                "Unexpected failure when running inline optimization: {:?}",
+                e
+            );
+        }
+
+        Ok(())
+    }
+
+    async fn declare_table_extended(
+        &self,
+        request: DeclareTableRequest,
+        extended_record: Option<RecordBatch>,
+    ) -> lance_core::Result<DeclareTableResponse> {
+        let table_id = request.id.as_ref().ok_or_else(|| Error::InvalidInput {
+            source: "Table ID is required".into(),
+            location: location!(),
+        })?;
+
+        if table_id.is_empty() {
+            return Err(Error::InvalidInput {
+                source: "Table ID cannot be empty".into(),
+                location: location!(),
+            });
+        }
+
+        let (namespace, table_name) = Self::split_object_id(table_id);
+        let object_id = Self::build_object_id(&namespace, &table_name);
+
+        // Check if table already exists in manifest
+        let existing = self.query_manifest_for_table(&object_id).await?;
+        if existing.is_some() {
+            return Err(Error::Namespace {
+                source: format!("Table '{}' already exists", table_name).into(),
+                location: location!(),
+            });
+        }
+
+        // Serialize properties and compute extended batch if provided
+        let metadata = Self::build_metadata_json(&request.properties);
+
+        // Create table location path with hash-based naming.
+        // When dir_listing_enabled is true and it's a root table, use directory-style
+        // naming: {table_name}.lance. Otherwise, use hash-based naming: {hash}_{object_id}
+        let dir_name = if namespace.is_empty() && self.dir_listing_enabled {
+            // Root table with directory listing enabled: use {table_name}.lance
+            format!("{}.lance", table_name)
+        } else {
+            // Child namespace table or dir listing disabled: use hash-based naming
+            Self::generate_dir_name(&object_id)
+        };
+        let table_path =
self.base_path.child(dir_name.as_str()); + let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; + + // Validate location if provided + if let Some(req_location) = &request.location { + let req_location = req_location.trim_end_matches('/'); + if req_location != table_uri { + return Err(Error::Namespace { + source: format!( + "Cannot declare table {} at location {}, must be at location {}", + table_name, req_location, table_uri + ) + .into(), + location: location!(), + }); + } + } + + // Create the .lance-reserved file to mark the table as existing + let reserved_file_path = table_path.child(".lance-reserved"); + + self.object_store + .create(&reserved_file_path) + .await + .map_err(|e| Error::Namespace { + source: format!( + "Failed to create .lance-reserved file for table {}: {}", + table_name, e + ) + .into(), + location: location!(), + })? + .shutdown() + .await + .map_err(|e| Error::Namespace { + source: format!( + "Failed to finalize .lance-reserved file for table {}: {}", + table_name, e + ) + .into(), + location: location!(), + })?; + + // Add entry to manifest marking this as a declared table (store dir_name, not full path) + self.insert_into_manifest_with_metadata( + object_id, + ObjectType::Table, + Some(dir_name), + metadata, + None, + extended_record, + ) + .await?; + + log::info!( + "Declared table '{}' in manifest at {}", + table_name, + table_uri + ); + + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + + Ok(DeclareTableResponse { + location: Some(table_uri), + storage_options, + properties: request.properties.clone(), + ..Default::default() + }) + } +} diff --git a/rust/lance-namespace-impls/src/lib.rs b/rust/lance-namespace-impls/src/lib.rs index 83fb93ddc0e..d9233d36dcf 100644 --- a/rust/lance-namespace-impls/src/lib.rs +++ b/rust/lance-namespace-impls/src/lib.rs @@ -72,10 +72,12 @@ pub mod connect; pub mod context; pub mod credentials; pub mod dir; +pub mod udf; #[cfg(feature = "rest")] pub mod rest; +pub mod partition; #[cfg(feature = "rest-adapter")] pub mod rest_adapter; diff --git a/rust/lance-namespace-impls/src/partition.rs b/rust/lance-namespace-impls/src/partition.rs new file mode 100644 index 00000000000..1e4825d3524 --- /dev/null +++ b/rust/lance-namespace-impls/src/partition.rs @@ -0,0 +1,3158 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors +// NOTE: Keep this module warning-clean; avoid `#![allow(unused)]`. 
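+//!
+//! A rough, uncompiled usage sketch (the location, schema and field ids here are
+//! hypothetical; the builder and types are defined later in this module):
+//!
+//! ```ignore
+//! let spec = PartitionSpec {
+//!     id: 1,
+//!     fields: vec![PartitionField {
+//!         field_id: "pf1".to_string(),
+//!         source_ids: vec![0],
+//!         transform: Some(PartitionTransform::Bucket { num_buckets: 16 }),
+//!         expression: None,
+//!         result_type: DataType::Int32,
+//!     }],
+//! };
+//! let ns = PartitionedNamespaceBuilder::new("memory://partitioned")
+//!     .schema(schema)
+//!     .partition_spec(spec)
+//!     .build()
+//!     .await?;
+//! ```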
+ +use crate::dir::manifest::{ManifestObject, EXTENDED_PREFIX}; +use crate::dir::manifest_ext::{ + CreateMultiNamespacesRequestBuilder, ManifestNamespaceExt as ManifestNamespaceCreateExt, +}; +use crate::udf::MURMUR3_MULTI_UDF; +use crate::{context::DynamicContextProvider, DirectoryNamespace, ManifestNamespace}; +use arrow::array::{ + new_null_array, Array, ArrayRef, BinaryArray, Date32Array, Date64Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, + LargeStringArray, RecordBatch, StringArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, +}; +use arrow::datatypes::{Field as ArrowField, Schema as ArrowSchema}; +use arrow::util::display::array_value_to_string; +use arrow_schema::{DataType, SchemaRef}; +use async_trait::async_trait; +use bytes::Bytes; +use lance::deps::datafusion::logical_expr::{Expr, Operator}; +use lance::deps::datafusion::prelude::{col, lit, SessionContext}; +use lance::deps::datafusion::scalar::ScalarValue; +use lance::io::exec::Planner; +use lance_core::datatypes::{Field, Schema}; +use lance_core::{Error, Result}; +use lance_namespace::models::{ + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, + AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, + CountTableRowsRequest, CreateEmptyTableRequest, CreateEmptyTableResponse, + CreateNamespaceRequest, CreateNamespaceResponse, CreateTableIndexRequest, + CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, + DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, + DescribeTransactionRequest, DescribeTransactionResponse, DropNamespaceRequest, + DropNamespaceResponse, DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, + DropTableResponse, ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, + GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, + InsertIntoTableResponse, JsonArrowSchema, ListNamespacesRequest, ListNamespacesResponse, + ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, + MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, + QueryTableRequest, RegisterTableRequest, RegisterTableResponse, RenameTableRequest, + RenameTableResponse, RestoreTableRequest, RestoreTableResponse, TableExistsRequest, + UpdateTableRequest, UpdateTableResponse, UpdateTableSchemaMetadataRequest, + UpdateTableSchemaMetadataResponse, UpdateTableTagRequest, UpdateTableTagResponse, +}; +use lance_namespace::schema::{arrow_schema_to_json, convert_json_arrow_schema}; +use lance_namespace::LanceNamespace; +use lance_namespace_reqwest_client::models::PartitionField as JsonPartitionField; +use lance_namespace_reqwest_client::models::PartitionSpec as JsonPartitionSpec; +use 
lance_namespace_reqwest_client::models::PartitionTransform as JsonPartitionTransform;
+use snafu::location;
+use std::collections::HashMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::Arc;
+
+/// A PartitionedNamespace is a directory namespace containing a collection of tables that share a
+/// common schema. These tables are physically separated and independent, but logically related
+/// through their partition field definitions.
+pub struct PartitionedNamespace {
+    /// Underlying directory namespace used for physical storage.
+    directory: DirectoryNamespace,
+    /// Underlying manifest namespace used for metadata and table discovery.
+    ///
+    /// This is derived from `directory.manifest_ns`.
+    manifest: Arc<ManifestNamespace>,
+    /// Root location URI of this partitioned namespace.
+    location: String,
+    /// Shared logical schema enforced across all partition tables.
+    schema: Schema,
+}
+
+impl Debug for PartitionedNamespace {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "PartitionedNamespace({})", self.namespace_id())
+    }
+}
+
+impl PartitionedNamespace {
+    /// Partition pruning for the given filter expression.
+    ///
+    /// # Arguments
+    ///
+    /// * `filter` - The filter expression to be applied.
+    ///
+    /// Returns the list of (partition table, refine expression) pairs required for the scan.
+    pub async fn plan_scan(
+        &self,
+        filter: &Expr,
+    ) -> lance_core::Result<Vec<(PartitionTable, Expr)>> {
+        // 1) Convert to OR-of-ANDs (disjunctive normal form), then perform partition
+        //    pruning for each partition field.
+        let cdf = expr_to_cdf(filter);
+        let mut manifest_pred = lit(true);
+        for field in self.all_partition_fields().await? {
+            let expr = field.partition_prune(self.schema(), &cdf).await?;
+            manifest_pred = manifest_pred.and(expr);
+        }
+
+        // 2) Query the manifest to get candidate tables.
+        let table_filter = col("object_type").eq(lit("table"));
+        let objects = self
+            .manifest
+            .query_manifest(table_filter.and(manifest_pred))
+            .await?;
+
+        // 3) Build the scan plan. For now, the refine expr is always the original filter.
+        let tables = extract_tables(objects)?;
+        let mut table_expr_tuples = vec![];
+        for t in tables.into_iter() {
+            table_expr_tuples.push((t, filter.clone()));
+        }
+        Ok(table_expr_tuples)
+    }
+
+    /// Resolve the target partition table for the input row, creating it (as an empty table)
+    /// if it does not exist.
+    ///
+    /// # Arguments
+    ///
+    /// * `record` - The record batch to be resolved; it must contain exactly one row.
+    ///
+    /// Returns the partition table that the input row belongs to.
+    pub async fn resolve_or_create_partition_table(
+        &self,
+        record: &RecordBatch,
+    ) -> Result<PartitionTable> {
+        let spec = self.current_partition_spec().await?;
+        let partition_values = partition_values(&spec.fields, record).await?;
+
+        if let Some(table) = self
+            .resolve_partition_table(&spec, &partition_values)
+            .await?
+        {
+            Ok(table)
+        } else {
+            self.create_partition_table(&spec, &partition_values).await
+        }
+    }
+
+    async fn create_partition_table(
+        &self,
+        spec: &PartitionSpec,
+        partition_values: &[ScalarValue],
+    ) -> Result<PartitionTable> {
+        self.ensure_namespace_exists(vec![spec.spec_id_str()])
+            .await?;
+
+        // Create a partition namespace for each level.
+        // Format: <spec_id>$<random_id>$...$<random_id>
+        let mut namespace_path: Vec<String> = vec![spec.spec_id_str()];
+        for _ in 0..partition_values.len() {
+            namespace_path.push(random_partition_namespace_id());
+        }
+
+        // Create namespace rows
+        if !partition_values.is_empty() {
+            let mut ns_reqs: Vec<CreateNamespaceRequest> =
+                Vec::with_capacity(partition_values.len());
+            let mut ns_records: Vec<Option<RecordBatch>> =
+                Vec::with_capacity(partition_values.len());
+
+            for level in 0..partition_values.len() {
+                let id: Vec<String> = namespace_path[..(level + 2)].to_vec();
+                ns_reqs.push(CreateNamespaceRequest {
+                    id: Some(id),
+                    ..Default::default()
+                });
+
+                let batch =
+                    build_partition_extended_record(&spec.fields, partition_values, Some(level))?;
+                ns_records.push(Some(batch));
+            }
+
+            let on_columns: Vec<String> = spec
+                .fields
+                .iter()
+                .map(|f| format!("partition_field_{}", f.field_id))
+                .collect();
+            let create_req = CreateMultiNamespacesRequestBuilder::new()
+                .namespaces(ns_reqs)
+                .on(on_columns)
+                .build();
+            self.manifest
+                .create_multi_namespaces_extended(create_req, ns_records)
+                .await?;
+        }
+
+        // Declare the leaf table
+        let mut table_id: Vec<String> = vec![];
+        let partition_expr = partition_expressions(&spec.fields, partition_values)?;
+        let mut ns_expr = col("object_type").eq(lit("namespace"));
+        for expr in partition_expr {
+            ns_expr = ns_expr.and(expr);
+        }
+        let objects = self.manifest.query_manifest(ns_expr).await?;
+        for object in objects.into_iter() {
+            let ManifestObject::Namespace(ns) = object else {
+                continue;
+            };
+            if ns.namespace.is_empty() {
+                continue;
+            }
+            if ns.namespace.first() != Some(&spec.spec_id_str()) {
+                continue;
+            }
+            table_id.extend(ns.namespace);
+            table_id.push(ns.name);
+            break;
+        }
+        if table_id.is_empty() {
+            return Err(Error::Internal {
+                message: "Couldn't find the partition namespace for the table".into(),
+                location: location!(),
+            });
+        }
+        table_id.push("dataset".to_string());
+
+        let table_record = build_partition_extended_record(&spec.fields, partition_values, None)?;
+
+        let declare_req = DeclareTableRequest {
+            id: Some(table_id.clone()),
+            ..Default::default()
+        };
+
+        // Handle concurrent creation: if the table already exists, resolve and return it.
+        if self
+            .manifest
+            .declare_table_extended(declare_req, Some(table_record))
+            .await
+            .is_err()
+        {
+            if let Some(table) = self.resolve_partition_table(spec, partition_values).await? {
+                return Ok(table);
+            }
+            return Err(Error::Internal {
+                message: "Failed to declare partition table".to_string(),
+                location: location!(),
+            });
+        }
+
+        Ok(PartitionTable {
+            id: table_id,
+            read_version: None,
+        })
+    }
+
+    async fn resolve_partition_table(
+        &self,
+        spec: &PartitionSpec,
+        partition_values: &[ScalarValue],
+    ) -> Result<Option<PartitionTable>> {
+        let partition_expr = partition_expressions(&spec.fields, partition_values)?;
+        let mut table_expr = col("object_type").eq(lit("table"));
+        for expr in partition_expr {
+            table_expr = table_expr.and(expr);
+        }
+
+        let objects = self.manifest.query_manifest(table_expr).await?;
+        let tables = extract_tables(objects)?;
+        for table in tables.into_iter() {
+            if table.id.first() == Some(&spec.spec_id_str()) {
+                return Ok(Some(table));
+            }
+        }
+        Ok(None)
+    }
+
+    /// Commit the partition table changes.
+    ///
+    /// If ACID is disabled, commit does nothing. Otherwise, if the partitioned namespace has
+    /// changed since the read version, this method auto-detects the conflict.
+    ///
+    /// # Arguments
+    ///
+    /// * `read_version` - The partition tables that are read in the transaction.
+    /// * `new_version` - The partition tables that are written in the transaction.
+    ///
+    /// Returns the new version of the partitioned namespace.
+    pub fn commit(
+        &self,
+        read_version: Option<Vec<PartitionTable>>,
+        new_version: Option<Vec<PartitionTable>>,
+    ) -> lance_core::Result<Option<Vec<PartitionTable>>> {
+        let _ = (read_version, new_version);
+        Err(Error::Internal {
+            message: "PartitionedNamespace.commit is not implemented".to_string(),
+            location: location!(),
+        })
+    }
+
+    /// Schema of the partitioned namespace.
+    pub fn schema(&self) -> Schema {
+        self.schema.clone()
+    }
+
+    /// All partition tables of the partitioned namespace.
+    pub async fn tables(&self) -> Result<Vec<PartitionTable>> {
+        let objects = self
+            .manifest
+            .query_manifest(col("object_type").eq(lit("table")))
+            .await?;
+        extract_tables(objects)
+    }
+
+    /// Partitioning of the partitioned namespace.
+    pub async fn partitioning(&self) -> Result<Partitioning> {
+        let metadata = self.manifest.get_metadata().await?;
+        let mut partitioning: Vec<PartitionSpec> = vec![];
+        for (k, v) in metadata.iter() {
+            if k.starts_with("partition_spec_v") {
+                let json_partition_spec: JsonPartitionSpec = serde_json::from_str(v.as_str())
+                    .map_err(|e| Error::Internal {
+                        message: format!(
+                            "Failed to parse partition spec from __manifest metadata: {}",
+                            e
+                        ),
+                        location: location!(),
+                    })?;
+
+                let partition_spec = PartitionSpec::from_json(&json_partition_spec)?;
+                let expected_key = partition_spec_key(&partition_spec);
+                if k != &expected_key {
+                    return Err(Error::Internal {
+                        message: format!(
+                            "Inconsistent __manifest metadata key: expected '{}' but got '{}'",
+                            expected_key, k
+                        ),
+                        location: location!(),
+                    });
+                }
+                partitioning.push(partition_spec);
+            }
+        }
+        Ok(Partitioning::new(partitioning))
+    }
+
+    // Partition Evolution.
+
+    /// Update the partition spec.
+    ///
+    /// # Arguments
+    ///
+    /// * `partition_spec` - The fields of the new partition spec.
+    ///
+    /// Returns the new partition spec.
+    pub async fn update_partition_spec(
+        &self,
+        partition_spec: Vec<PartitionField>,
+    ) -> lance_core::Result<PartitionSpec> {
+        // Sanity check
+        let mut all_sigs = HashSet::new();
+        let mut all_field_ids = HashSet::new();
+        for f in &partition_spec {
+            if f.source_ids.is_empty() {
+                return Err(Error::InvalidInput {
+                    source: "partition field source_ids must not be empty".into(),
+                    location: location!(),
+                });
+            }
+            let has_transform = f.transform.is_some();
+            let has_expression = f
+                .expression
+                .as_ref()
+                .map(|e| !e.trim().is_empty())
+                .unwrap_or(false);
+            if has_transform == has_expression {
+                return Err(Error::InvalidInput {
+                    source: "Exactly one of transform or expression must be set".into(),
+                    location: location!(),
+                });
+            }
+            if !all_sigs.insert(f.signature()) || !all_field_ids.insert(f.field_id.clone()) {
+                return Err(Error::InvalidInput {
+                    source: "Partition field signatures and field_ids must be unique".into(),
+                    location: location!(),
+                });
+            }
+        }
+
+        // Build the new spec fields, reusing existing field_ids where possible.
+        let partitioning = self.partitioning().await?;
+        let new_spec_id =
+            partitioning
+                .current()
+                .map(|s| s.id + 1)
+                .ok_or_else(|| Error::Internal {
+                    message: "Partition spec doesn't exist".to_string(),
+                    location: location!(),
+                })?;
+
+        let mut new_fields: Vec<PartitionField> = Vec::with_capacity(partition_spec.len());
+        for mut f in partition_spec.into_iter() {
+            if let Some(existing_id) = partitioning.get_field_id(&f) {
+                // Reuse the field_id for the same signature.
+                f.field_id = existing_id.clone();
+            } else if let Some(existing_sig) = partitioning.get_signature(&f) {
+                // Field IDs must never be reused for a different meaning.
+                if existing_sig != &f.signature() {
+                    return Err(Error::InvalidInput {
+                        source: format!(
+                            "Partition field_id '{}' is already used by another field; cannot reuse it",
+                            f.field_id
+                        )
+                        .into(),
+                        location: location!(),
+                    });
+                }
+            }
+            new_fields.push(f);
+        }
+
+        let new_spec = PartitionSpec {
+            id: new_spec_id,
+            fields: new_fields,
+        };
+        self.force_sink_partition_spec(&new_spec).await?;
+
+        Ok(new_spec)
+    }
+
+    pub(crate) async fn force_sink_partition_spec(
+        &self,
+        new_spec: &PartitionSpec,
+    ) -> lance_core::Result<()> {
+        self.ensure_namespace_exists(vec![new_spec.spec_id_str()])
+            .await?;
+        self.ensure_partition_fields_exists(&new_spec.fields)
+            .await?;
+
+        // Persist the new spec in __manifest table metadata.
+        let json = serde_json::to_string(&new_spec.to_json()).map_err(|e| Error::Internal {
+            message: format!("Failed to serialize partition spec: {}", e),
+            location: location!(),
+        })?;
+        let key = partition_spec_key(new_spec);
+        self.manifest
+            .update_metadata([(key.as_str(), json.as_str())])
+            .await?;
+
+        Ok(())
+    }
+
+    // Schema Evolution.
+
+    /// Add a new column to the partitioned namespace.
+    ///
+    /// # Arguments
+    ///
+    /// * `column` - The column to be added.
+    ///
+    /// Returns the new schema.
+    pub fn add_column(&self, _column: &Field) -> lance_core::Result<Schema> {
+        Err(Error::Internal {
+            message: "PartitionedNamespace.add_column is not implemented".to_string(),
+            location: location!(),
+        })
+    }
+
+    /// Drop the given column from the partitioned namespace.
+    ///
+    /// # Arguments
+    ///
+    /// * `column` - The column to be dropped.
+    ///
+    /// Returns the new schema.
+    pub fn drop_column(&self, _column: &str) -> lance_core::Result<Schema> {
+        Err(Error::Internal {
+            message: "PartitionedNamespace.drop_column is not implemented".to_string(),
+            location: location!(),
+        })
+    }
+
+    /// Rename the given column in the partitioned namespace.
+    ///
+    /// # Arguments
+    ///
+    /// * `old_name` - The old name of the column.
+    /// * `new_name` - The new name of the column.
+    ///
+    /// Returns the new schema.
+    pub fn rename_column(&self, _old_name: &str, _new_name: &str) -> lance_core::Result<Schema> {
+        Err(Error::Internal {
+            message: "PartitionedNamespace.rename_column is not implemented".to_string(),
+            location: location!(),
+        })
+    }
+
+    /// Promote the type of the given column to the new type in the partitioned namespace.
+    ///
+    /// # Arguments
+    ///
+    /// * `column` - The column to be promoted.
+    /// * `new_type` - The new type of the column.
+    ///
+    /// Returns the new schema.
+    pub fn type_promotion(
+        &self,
+        _column: &str,
+        _new_type: &DataType,
+    ) -> lance_core::Result<Schema> {
+        Err(Error::Internal {
+            message: "PartitionedNamespace.type_promotion is not implemented".to_string(),
+            location: location!(),
+        })
+    }
+}
+
+/// Convert a boolean expression into a conservative OR-of-ANDs form, i.e. disjunctive
+/// normal form (referred to as "CDF" throughout this module).
+///
+/// The returned structure is `Vec<Vec<Expr>>`, where the outer `Vec` is the OR and each
+/// inner `Vec` is an AND of atomic predicates.
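+///
+/// A minimal sketch of the shape (hypothetical columns; not compiled as a doctest):
+///
+/// ```ignore
+/// // a = 1 AND (b = 2 OR c = 3)  =>  [[a = 1, b = 2], [a = 1, c = 3]]
+/// let filter = col("a").eq(lit(1)).and(col("b").eq(lit(2)).or(col("c").eq(lit(3))));
+/// let clauses = expr_to_cdf(&filter);
+/// assert_eq!(clauses.len(), 2); // two OR clauses, each an AND list of atoms
+/// ```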
+fn expr_to_cdf(expr: &Expr) -> Vec<Vec<Expr>> {
+    match expr {
+        Expr::BinaryExpr(binary) if binary.op == Operator::And => {
+            let left = expr_to_cdf(&binary.left);
+            let right = expr_to_cdf(&binary.right);
+            let mut out = Vec::new();
+            for l in left {
+                for r in &right {
+                    let mut clause = Vec::with_capacity(l.len() + r.len());
+                    clause.extend(l.iter().cloned());
+                    clause.extend(r.iter().cloned());
+                    out.push(clause);
+                }
+            }
+            out
+        }
+        Expr::BinaryExpr(binary) if binary.op == Operator::Or => {
+            let mut left = expr_to_cdf(&binary.left);
+            let mut right = expr_to_cdf(&binary.right);
+            left.append(&mut right);
+            left
+        }
+        _ => vec![vec![expr.clone()]],
+    }
+}
+
+fn extract_tables(objects: Vec<ManifestObject>) -> Result<Vec<PartitionTable>> {
+    let mut tables: Vec<PartitionTable> = Vec::new();
+    for obj in objects {
+        let ManifestObject::Table(t) = obj else {
+            continue;
+        };
+        // Only consider partitioned namespace leaf tables.
+        if t.name != "dataset" {
+            continue;
+        }
+        if t.namespace.is_empty() {
+            continue;
+        }
+        if !t.namespace[0].starts_with('v') {
+            continue;
+        }
+
+        let mut id = t.namespace;
+        id.push(t.name);
+        tables.push(PartitionTable {
+            id,
+            read_version: None,
+        });
+    }
+    Ok(tables)
+}
+
+/// Parse a SQL filter expression into a DataFusion [`Expr`].
+pub async fn parse_filter_expr_from_sql(filter: &str, arrow_schema: &ArrowSchema) -> Result<Expr> {
+    let filter = filter.trim();
+    if filter.is_empty() {
+        return Ok(lit(true));
+    }
+    let planner = Planner::new(Arc::new(arrow_schema.clone()));
+    planner.parse_filter(filter)
+}
+
+fn is_comparison_op(op: Operator) -> bool {
+    matches!(
+        op,
+        Operator::Eq
+            | Operator::NotEq
+            | Operator::Lt
+            | Operator::LtEq
+            | Operator::Gt
+            | Operator::GtEq
+    )
+}
+
+/// Transform the first (and only) row of `record` into partition values.
+async fn partition_values(
+    fields: &[PartitionField],
+    record: &RecordBatch,
+) -> Result<Vec<ScalarValue>> {
+    let mut values: Vec<ScalarValue> = Vec::with_capacity(fields.len());
+    for field in fields.iter() {
+        let partition_value = field.value(record).await?;
+        let scalar =
+            ScalarValue::try_from_array(&partition_value, 0).map_err(|e| Error::Internal {
+                message: format!(
+                    "Failed to convert partition value for field '{}' to scalar: {}",
+                    field.field_id, e
+                ),
+                location: location!(),
+            })?;
+        values.push(scalar);
+    }
+    Ok(values)
+}
+
+/// Transform partition values into manifest filter expressions.
+///
+/// Partition values are stored in `__manifest` as extended columns named
+/// `partition_field_{field_id}`.
+fn partition_expressions(fields: &[PartitionField], values: &[ScalarValue]) -> Result<Vec<Expr>> {
+    if fields.len() != values.len() {
+        return Err(Error::InvalidInput {
+            source: format!(
+                "fields len {} must be equal to values len {}",
+                fields.len(),
+                values.len()
+            )
+            .into(),
+            location: location!(),
+        });
+    }
+
+    let mut expr_vec: Vec<Expr> = Vec::with_capacity(fields.len());
+    for (field, value) in fields.iter().zip(values.iter()) {
+        let col_name = format!("partition_field_{}", field.field_id);
+        let expr = col(col_name).eq(lit(value.clone()));
+        expr_vec.push(expr);
+    }
+    Ok(expr_vec)
+}
+
+/// The key under which a partition spec is saved in metadata.
+fn partition_spec_key(spec: &PartitionSpec) -> String {
+    format!("partition_spec_v{}", spec.id)
+}
+
+/// Build the extended record for partition fields. If `partition_values_effective_idx` is
+/// `Some(i)`, partition values at indices greater than `i` are set to NULL.
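+///
+/// For instance (hypothetical values): with fields `[f1, f2, f3]`, values `[a, b, c]`, and
+/// `partition_values_effective_idx = Some(1)`, the resulting single-row batch holds
+/// `[a, b, NULL]` in the columns `partition_field_f1`, `partition_field_f2`,
+/// `partition_field_f3`.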
+fn build_partition_extended_record(
+    fields: &[PartitionField],
+    partition_values: &[ScalarValue],
+    partition_values_effective_idx: Option<usize>,
+) -> Result<RecordBatch> {
+    let mut arrow_fields: Vec<ArrowField> = Vec::with_capacity(fields.len());
+    let mut arrays: Vec<ArrayRef> = Vec::with_capacity(fields.len());
+
+    for (idx, f) in fields.iter().enumerate() {
+        let col_name = format!("partition_field_{}", f.field_id);
+        arrow_fields.push(ArrowField::new(&col_name, f.result_type.clone(), true));
+
+        let scalar = match partition_values_effective_idx {
+            Some(max) if idx > max => {
+                ScalarValue::try_from(&f.result_type).map_err(|e| Error::Internal {
+                    message: format!(
+                        "Failed to create null scalar for partition field '{}': {}",
+                        f.field_id, e
+                    ),
+                    location: location!(),
+                })?
+            }
+            _ => partition_values
+                .get(idx)
+                .cloned()
+                .ok_or_else(|| Error::InvalidInput {
+                    source: format!(
+                        "partition_values length {} is smaller than required index {}",
+                        partition_values.len(),
+                        idx
+                    )
+                    .into(),
+                    location: location!(),
+                })?,
+        };
+
+        let arr = scalar.to_array().map_err(|e| Error::Internal {
+            message: format!(
+                "Failed to convert scalar for manifest column '{}' to array: {}",
+                col_name, e
+            ),
+            location: location!(),
+        })?;
+        arrays.push(arr);
+    }
+
+    let schema = Arc::new(ArrowSchema::new(arrow_fields));
+    RecordBatch::try_new(schema, arrays).map_err(|e| Error::Internal {
+        message: format!("Failed to create extended record batch: {}", e),
+        location: location!(),
+    })
+}
+
+// Manifest dataset related methods
+impl PartitionedNamespace {
+    async fn current_partition_spec(&self) -> Result<PartitionSpec> {
+        let partitioning = self.partitioning().await?;
+        let spec = partitioning.current().ok_or_else(|| Error::Internal {
+            message: "PartitionSpec not found in manifest".to_string(),
+            location: location!(),
+        })?;
+        Ok(spec.clone())
+    }
+
+    /// Get all unique partition fields
+    async fn all_partition_fields(&self) -> Result<Vec<PartitionField>> {
+        let mut id_set = HashSet::new();
+        let mut partition_fields = vec![];
+        let partitioning = self.partitioning().await?;
+        for spec in partitioning.all() {
+            for pf in spec.fields.iter() {
+                if id_set.insert(pf.field_id.clone()) {
+                    partition_fields.push(pf.clone());
+                }
+            }
+        }
+        Ok(partition_fields)
+    }
+
+    /// Ensure __manifest has columns for all partition fields.
+    async fn ensure_partition_fields_exists(
+        &self,
+        partition_fields: &[PartitionField],
+    ) -> Result<()> {
+        let full_schema = self.manifest.full_manifest_schema().await?;
+        let existing_fields: HashMap<String, DataType> = full_schema
+            .fields()
+            .iter()
+            .map(|f| (f.name().to_string(), f.data_type().clone()))
+            .collect();
+
+        let mut to_add: Vec<(String, DataType)> = Vec::new();
+        for f in partition_fields.iter() {
+            let col_name = format!("partition_field_{}", f.field_id);
+            if let Some(existing_ty) = existing_fields.get(&col_name) {
+                if existing_ty != &f.result_type {
+                    return Err(Error::InvalidInput {
+                        source: format!(
+                            "Manifest column '{}' already exists with type {:?}, expected {:?}",
+                            col_name, existing_ty, f.result_type
+                        )
+                        .into(),
+                        location: location!(),
+                    });
+                }
+                continue;
+            }
+
+            // add_extended_properties requires keys to have EXTENDED_PREFIX and strips it
+            // to form the physical column name.
+            to_add.push((
+                format!("{}{}", EXTENDED_PREFIX, col_name),
+                f.result_type.clone(),
+            ));
+        }
+        if !to_add.is_empty() {
+            let to_add_param: Vec<(&str, DataType)> = to_add
+                .iter()
+                .map(|(k, t)| (k.as_str(), t.clone()))
+                .collect();
+            self.manifest.add_extended_properties(&to_add_param).await?;
+        }
+        Ok(())
+    }
+
+    /// Ensure the spec version namespace exists (vN).
+    async fn ensure_namespace_exists(&self, id: Vec<String>) -> Result<()> {
+        let exists_req = NamespaceExistsRequest {
+            id: Some(id.clone()),
+            ..Default::default()
+        };
+
+        if self.namespace_exists(exists_req.clone()).await.is_err() {
+            let create_req = CreateNamespaceRequest {
+                id: Some(id),
+                ..Default::default()
+            };
+            if self.create_namespace(create_req).await.is_err() {
+                // The namespace may have been created by a concurrent writer;
+                // retry the existence check.
+                return self.namespace_exists(exists_req).await;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl LanceNamespace for PartitionedNamespace {
+    async fn list_namespaces(
+        &self,
+        request: ListNamespacesRequest,
+    ) -> lance_core::Result<ListNamespacesResponse> {
+        self.directory.list_namespaces(request).await
+    }
+
+    async fn describe_namespace(
+        &self,
+        request: DescribeNamespaceRequest,
+    ) -> lance_core::Result<DescribeNamespaceResponse> {
+        self.directory.describe_namespace(request).await
+    }
+
+    async fn create_namespace(
+        &self,
+        request: CreateNamespaceRequest,
+    ) -> lance_core::Result<CreateNamespaceResponse> {
+        self.directory.create_namespace(request).await
+    }
+
+    async fn drop_namespace(
+        &self,
+        request: DropNamespaceRequest,
+    ) -> lance_core::Result<DropNamespaceResponse> {
+        self.directory.drop_namespace(request).await
+    }
+
+    async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> {
+        self.directory.namespace_exists(request).await
+    }
+
+    async fn list_tables(
+        &self,
+        request: ListTablesRequest,
+    ) -> lance_core::Result<ListTablesResponse> {
+        self.directory.list_tables(request).await
+    }
+
+    async fn describe_table(
+        &self,
+        request: DescribeTableRequest,
+    ) -> lance_core::Result<DescribeTableResponse> {
+        // Delegate to DirectoryNamespace to reuse credential vending and any future
+        // context behavior.
+        self.directory.describe_table(request).await
+    }
+
+    async fn register_table(
+        &self,
+        request: RegisterTableRequest,
+    ) -> lance_core::Result<RegisterTableResponse> {
+        self.directory.register_table(request).await
+    }
+
+    async fn table_exists(&self, request: TableExistsRequest) -> lance_core::Result<()> {
+        self.directory.table_exists(request).await
+    }
+
+    async fn drop_table(
+        &self,
+        request: DropTableRequest,
+    ) -> lance_core::Result<DropTableResponse> {
+        self.directory.drop_table(request).await
+    }
+
+    async fn deregister_table(
+        &self,
+        request: DeregisterTableRequest,
+    ) -> lance_core::Result<DeregisterTableResponse> {
+        self.directory.deregister_table(request).await
+    }
+
+    async fn count_table_rows(&self, request: CountTableRowsRequest) -> lance_core::Result<i64> {
+        self.directory.count_table_rows(request).await
+    }
+
+    async fn create_table(
+        &self,
+        request: CreateTableRequest,
+        request_data: Bytes,
+    ) -> lance_core::Result<CreateTableResponse> {
+        self.directory.create_table(request, request_data).await
+    }
+
+    async fn declare_table(
+        &self,
+        request: DeclareTableRequest,
+    ) -> lance_core::Result<DeclareTableResponse> {
+        self.directory.declare_table(request).await
+    }
+
+    async fn create_empty_table(
+        &self,
+        request: CreateEmptyTableRequest,
+    ) -> lance_core::Result<CreateEmptyTableResponse> {
+        #[allow(deprecated)]
+        self.directory.create_empty_table(request).await
+    }
+
+    async fn insert_into_table(
+        &self,
+        request: InsertIntoTableRequest,
+        request_data: Bytes,
+    ) -> lance_core::Result<InsertIntoTableResponse> {
+        self.directory
+            .insert_into_table(request, request_data)
+            .await
+    }
+
+    async fn merge_insert_into_table(
+        &self,
+        request: MergeInsertIntoTableRequest,
+        request_data: Bytes,
+    ) -> lance_core::Result<MergeInsertIntoTableResponse> {
+        self.directory
+            .merge_insert_into_table(request, request_data)
+            .await
+    }
+
+    async fn update_table(
+        &self,
+        request: UpdateTableRequest,
+    ) -> lance_core::Result<UpdateTableResponse> {
+        self.directory.update_table(request).await
+    }
+
+    async fn delete_from_table(
+        &self,
+        request: DeleteFromTableRequest,
+    ) -> lance_core::Result<DeleteFromTableResponse> {
+        self.directory.delete_from_table(request).await
+    }
+
+    async fn query_table(&self, request: QueryTableRequest) -> lance_core::Result<Bytes> {
+        self.directory.query_table(request).await
+    }
+
+    async fn create_table_index(
+        &self,
+        request: CreateTableIndexRequest,
+    ) -> lance_core::Result<CreateTableIndexResponse> {
+        self.directory.create_table_index(request).await
+    }
+
+    async fn list_table_indices(
+        &self,
+        request: ListTableIndicesRequest,
+    ) -> lance_core::Result<ListTableIndicesResponse> {
+        self.directory.list_table_indices(request).await
+    }
+
+    async fn describe_table_index_stats(
+        &self,
+        request: DescribeTableIndexStatsRequest,
+    ) -> lance_core::Result<DescribeTableIndexStatsResponse> {
+        self.directory.describe_table_index_stats(request).await
+    }
+
+    async fn describe_transaction(
+        &self,
+        request: DescribeTransactionRequest,
+    ) -> lance_core::Result<DescribeTransactionResponse> {
+        self.directory.describe_transaction(request).await
+    }
+
+    async fn alter_transaction(
+        &self,
+        request: AlterTransactionRequest,
+    ) -> lance_core::Result<AlterTransactionResponse> {
+        self.directory.alter_transaction(request).await
+    }
+
+    async fn create_table_scalar_index(
+        &self,
+        request: CreateTableIndexRequest,
+    ) -> lance_core::Result<CreateTableScalarIndexResponse> {
+        self.directory.create_table_scalar_index(request).await
+    }
+
+    async fn drop_table_index(
+        &self,
+        request: DropTableIndexRequest,
+    ) -> lance_core::Result<DropTableIndexResponse> {
+        self.directory.drop_table_index(request).await
+    }
+
+    async fn list_all_tables(
+        &self,
+        request: ListTablesRequest,
+    ) -> lance_core::Result<ListTablesResponse> {
+        self.directory.list_all_tables(request).await
+    }
+
+    async fn restore_table(
+        &self,
+        request: RestoreTableRequest,
+    ) -> lance_core::Result<RestoreTableResponse> {
+        self.directory.restore_table(request).await
+    }
+
+    async fn rename_table(
+        &self,
+        request: RenameTableRequest,
+    ) -> lance_core::Result<RenameTableResponse> {
+        self.directory.rename_table(request).await
+    }
+
+    async fn list_table_versions(
+        &self,
+        request: ListTableVersionsRequest,
+    ) -> lance_core::Result<ListTableVersionsResponse> {
+        self.directory.list_table_versions(request).await
+    }
+
+    async fn update_table_schema_metadata(
+        &self,
+        request: UpdateTableSchemaMetadataRequest,
+    ) -> lance_core::Result<UpdateTableSchemaMetadataResponse> {
+        self.directory.update_table_schema_metadata(request).await
+    }
+
+    async fn get_table_stats(
+        &self,
+        request: GetTableStatsRequest,
+    ) -> lance_core::Result<GetTableStatsResponse> {
+        self.directory.get_table_stats(request).await
+    }
+
+    async fn explain_table_query_plan(
+        &self,
+        request: ExplainTableQueryPlanRequest,
+    ) -> lance_core::Result<String> {
+        self.directory.explain_table_query_plan(request).await
+    }
+
+    async fn analyze_table_query_plan(
+        &self,
+        request: AnalyzeTableQueryPlanRequest,
+    ) -> lance_core::Result<String> {
+        self.directory.analyze_table_query_plan(request).await
+    }
+
+    async fn alter_table_add_columns(
+        &self,
+        request: AlterTableAddColumnsRequest,
+    ) -> lance_core::Result<AlterTableAddColumnsResponse> {
+        self.directory.alter_table_add_columns(request).await
+    }
+
+    async fn alter_table_alter_columns(
+        &self,
+        request: AlterTableAlterColumnsRequest,
+    ) -> lance_core::Result<AlterTableAlterColumnsResponse> {
+        self.directory.alter_table_alter_columns(request).await
+    }
+
+    async fn alter_table_drop_columns(
+        &self,
+        request: AlterTableDropColumnsRequest,
+    ) -> lance_core::Result<AlterTableDropColumnsResponse> {
+        self.directory.alter_table_drop_columns(request).await
+    }
+
+    async fn list_table_tags(
+        &self,
+        request: ListTableTagsRequest,
+    ) -> lance_core::Result<ListTableTagsResponse> {
+        self.directory.list_table_tags(request).await
+    }
+
+    async fn get_table_tag_version(
+        &self,
+        request: GetTableTagVersionRequest,
+    ) -> lance_core::Result<GetTableTagVersionResponse> {
+        self.directory.get_table_tag_version(request).await
+    }
+
+    async fn create_table_tag(
+        &self,
+        request: CreateTableTagRequest,
+    ) -> lance_core::Result<CreateTableTagResponse> {
+        self.directory.create_table_tag(request).await
+    }
+
+    async fn delete_table_tag(
+        &self,
+        request: DeleteTableTagRequest,
+    ) -> lance_core::Result<DeleteTableTagResponse> {
+        self.directory.delete_table_tag(request).await
+    }
+
+    async fn update_table_tag(
+        &self,
+        request: UpdateTableTagRequest,
+    ) -> lance_core::Result<UpdateTableTagResponse> {
+        self.directory.update_table_tag(request).await
+    }
+
+    fn namespace_id(&self) -> String {
+        format!("partitioned(root={})", self.location)
+    }
+}
+
+/// Builder for creating or loading a [`PartitionedNamespace`].
+///
+/// - If `__manifest` already contains `schema` and `partition_spec_v*` metadata, then
+///   [`build`](PartitionedNamespaceBuilder::build) loads the existing namespace.
+/// - Otherwise, it creates a new namespace using the provided schema and initial partition spec.
+#[derive(Debug, Default)]
+pub struct PartitionedNamespaceBuilder {
+    location: String,
+    schema: Option<Schema>,
+    partition_spec: Option<PartitionSpec>,
+    directory: Option<DirectoryNamespace>,
+    credential_vendor_properties: HashMap<String, String>,
+    context_provider: Option<Arc<dyn DynamicContextProvider>>,
+}
+
+impl PartitionedNamespaceBuilder {
+    pub fn new(location: impl Into<String>) -> Self {
+        Self {
+            location: location.into().trim_end_matches('/').to_string(),
+            schema: None,
+            partition_spec: None,
+            directory: None,
+            credential_vendor_properties: HashMap::new(),
+            context_provider: None,
+        }
+    }
+
+    /// Use an already constructed [`DirectoryNamespace`] when building or loading.
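+    ///
+    /// Note: per `open_directory` below, an explicitly provided directory cannot be
+    /// combined with credential vendor properties or a context provider; configure
+    /// those on the [`DirectoryNamespace`] itself in that case.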
+    pub fn directory(mut self, directory: DirectoryNamespace) -> Self {
+        self.directory = Some(directory);
+        self
+    }
+
+    /// Add a credential vendor property.
+    pub fn credential_vendor_property(
+        mut self,
+        key: impl Into<String>,
+        value: impl Into<String>,
+    ) -> Self {
+        self.credential_vendor_properties
+            .insert(key.into(), value.into());
+        self
+    }
+
+    /// Add multiple credential vendor properties.
+    pub fn credential_vendor_properties(
+        mut self,
+        properties: HashMap<String, String>,
+    ) -> Self {
+        self.credential_vendor_properties.extend(properties);
+        self
+    }
+
+    /// Set a dynamic context provider for per-request context.
+    pub fn context_provider(mut self, provider: Arc<dyn DynamicContextProvider>) -> Self {
+        self.context_provider = Some(provider);
+        self
+    }
+
+    pub fn schema(mut self, schema: Schema) -> Self {
+        self.schema = Some(schema);
+        self
+    }
+
+    pub fn partition_spec(mut self, partition_spec: PartitionSpec) -> Self {
+        self.partition_spec = Some(partition_spec);
+        self
+    }
+
+    /// Build with upsert semantics: load if initialized, otherwise create.
+    pub async fn build(self) -> Result<PartitionedNamespace> {
+        let (directory, manifest_ns) = Self::open_directory(
+            &self.location,
+            self.directory,
+            &self.credential_vendor_properties,
+            self.context_provider,
+        )
+        .await?;
+        let metadata = manifest_ns.get_metadata().await?;
+
+        let has_schema = metadata.contains_key("schema");
+        let has_spec = metadata.keys().any(|k| k.starts_with("partition_spec_v"));
+
+        match (has_schema, has_spec) {
+            (true, true) => {
+                let loaded =
+                    Self::load_from_manifest(&self.location, directory, manifest_ns).await?;
+                Ok(loaded)
+            }
+            (false, false) => {
+                let schema = self.schema.ok_or_else(|| Error::InvalidInput {
+                    source: "schema is required when creating a new partitioned namespace".into(),
+                    location: location!(),
+                })?;
+                let partition = self.partition_spec.ok_or_else(|| Error::InvalidInput {
+                    source: "partition_spec is required when creating a new partitioned namespace"
+                        .into(),
+                    location: location!(),
+                })?;
+
+                Self::create_new(&self.location, directory, manifest_ns, schema, partition).await
+            }
+            _ => Err(Error::Internal {
+                message: "Inconsistent __manifest metadata: schema and partition_spec_v* must either both exist or both be absent".to_string(),
+                location: location!(),
+            }),
+        }
+    }
+
+    /// Load an existing [`PartitionedNamespace`].
+    ///
+    /// Returns an error if the namespace has not been initialized yet.
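+    ///
+    /// A sketch (the location here is hypothetical):
+    ///
+    /// ```ignore
+    /// let ns = PartitionedNamespaceBuilder::new("s3://bucket/partitioned")
+    ///     .load()
+    ///     .await?;
+    /// ```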
+    pub async fn load(self) -> Result<PartitionedNamespace> {
+        let (directory, manifest_ns) = Self::open_directory(
+            &self.location,
+            self.directory,
+            &self.credential_vendor_properties,
+            self.context_provider,
+        )
+        .await?;
+        let metadata = manifest_ns.get_metadata().await?;
+
+        let has_schema = metadata.contains_key("schema");
+        let has_spec = metadata.keys().any(|k| k.starts_with("partition_spec_v"));
+
+        if !has_schema || !has_spec {
+            return Err(Error::InvalidInput {
+                source: "PartitionedNamespace is not initialized".into(),
+                location: location!(),
+            });
+        }
+
+        Self::load_from_manifest(&self.location, directory, manifest_ns).await
+    }
+
+    async fn open_directory(
+        location: &str,
+        directory: Option<DirectoryNamespace>,
+        credential_vendor_properties: &HashMap<String, String>,
+        context_provider: Option<Arc<dyn DynamicContextProvider>>,
+    ) -> Result<(DirectoryNamespace, Arc<ManifestNamespace>)> {
+        if directory.is_some()
+            && (!credential_vendor_properties.is_empty() || context_provider.is_some())
+        {
+            return Err(Error::InvalidInput {
+                source: "Cannot set credential_vendor/context_provider when directory is explicitly provided".into(),
+                location: location!(),
+            });
+        }
+
+        let directory = match directory {
+            Some(d) => d,
+            None => {
+                let mut builder = crate::DirectoryNamespaceBuilder::new(location)
+                    .manifest_enabled(true)
+                    .dir_listing_enabled(false)
+                    .inline_optimization_enabled(true);
+
+                for (k, v) in credential_vendor_properties.iter() {
+                    builder = builder.credential_vendor_property(k.clone(), v.clone());
+                }
+                if let Some(provider) = context_provider {
+                    builder = builder.context_provider(provider);
+                }
+
+                builder.build().await?
+            }
+        };
+        let manifest_ns = directory.manifest_namespace()?;
+        Ok((directory, manifest_ns))
+    }
+
+    async fn load_from_manifest(
+        location: &str,
+        directory: DirectoryNamespace,
+        manifest_ns: Arc<ManifestNamespace>,
+    ) -> Result<PartitionedNamespace> {
+        let metadata = manifest_ns.get_metadata().await?;
+        let json_schema = metadata.get("schema").ok_or_else(|| Error::Internal {
+            message: "Schema not found in __manifest metadata".to_string(),
+            location: location!(),
+        })?;
+
+        let json_schema: JsonArrowSchema =
+            serde_json::from_str(json_schema).map_err(|e| Error::Internal {
+                message: format!("Failed to parse schema from __manifest metadata: {}", e),
+                location: location!(),
+            })?;
+
+        let arrow_schema = convert_json_arrow_schema(&json_schema)?;
+        let schema = lance_core::datatypes::Schema::try_from(&arrow_schema)?;
+
+        Ok(PartitionedNamespace {
+            directory,
+            manifest: manifest_ns,
+            location: location.to_string(),
+            schema,
+        })
+    }
+
+    async fn create_new(
+        location: &str,
+        directory: DirectoryNamespace,
+        manifest_ns: Arc<ManifestNamespace>,
+        schema: Schema,
+        partition: PartitionSpec,
+    ) -> Result<PartitionedNamespace> {
+        if partition.id != 1 {
+            return Err(Error::InvalidInput {
+                source: "initial partition spec id must be 1".into(),
+                location: location!(),
+            });
+        }
+
+        // Persist schema metadata
+        let arrow_schema: ArrowSchema = (&schema).into();
+        let json_schema = arrow_schema_to_json(&arrow_schema)?;
+        let schema_json = serde_json::to_string(&json_schema).map_err(|e| Error::Internal {
+            message: format!("Failed to serialize schema: {}", e),
+            location: location!(),
+        })?;
+        manifest_ns
+            .update_metadata([("schema", schema_json.as_str())])
+            .await?;
+
+        // Persist initial partition spec
+        let spec_json =
+            serde_json::to_string(&partition.to_json()).map_err(|e| Error::Internal {
+                message: format!("Failed to serialize partition spec: {}", e),
+                location: location!(),
+            })?;
+        let spec_key = partition_spec_key(&partition);
+        manifest_ns
+            .update_metadata([(spec_key.as_str(), spec_json.as_str())])
+            .await?;
+
+        let ns = PartitionedNamespace {
+            directory,
+            manifest: manifest_ns,
+            location: location.to_string(),
+            schema,
+        };
+        ns.force_sink_partition_spec(&partition).await?;
+
+        Ok(ns)
+    }
+}
+
+/// Partition table of the partitioned namespace.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct PartitionTable {
+    /// Namespace id path for this partition table, e.g. ["v1", "abc123", "def456"]
+    pub id: Vec<String>,
+    /// Optional read version used in strong transaction mode
+    pub read_version: Option<u64>,
+}
+
+/// Partitioning contains all partition specs of the partitioned namespace.
+#[derive(Debug, Clone, Default)]
+pub struct Partitioning {
+    specs: Vec<PartitionSpec>,
+    sig_to_field_id: HashMap<String, String>,
+    field_id_to_sig: HashMap<String, String>,
+}
+
+impl Partitioning {
+    /// Create a new Partitioning from a list of specs.
+    pub fn new(specs: Vec<PartitionSpec>) -> Self {
+        let mut sig_to_field_id: HashMap<String, String> = HashMap::new();
+        let mut field_id_to_sig: HashMap<String, String> = HashMap::new();
+        for spec in specs.iter() {
+            for f in spec.fields.iter() {
+                let sig = f.signature();
+                sig_to_field_id
+                    .entry(sig.clone())
+                    .or_insert_with(|| f.field_id.clone());
+                field_id_to_sig
+                    .entry(f.field_id.clone())
+                    .or_insert_with(|| sig);
+            }
+        }
+
+        Self {
+            specs,
+            sig_to_field_id,
+            field_id_to_sig,
+        }
+    }
+
+    /// Return the current (highest id) partition spec if any.
+    pub fn current(&self) -> Option<&PartitionSpec> {
+        self.specs.iter().max_by_key(|s| s.id)
+    }
+
+    /// Get a partition spec by id.
+    pub fn by_id(&self, id: i32) -> Option<&PartitionSpec> {
+        self.specs.iter().find(|s| s.id == id)
+    }
+
+    /// All partition specs in this namespace.
+    pub fn all(&self) -> &[PartitionSpec] {
+        &self.specs
+    }
+
+    /// Mutable access to all specs (used for evolution tests).
+    pub fn all_mut(&mut self) -> &mut Vec<PartitionSpec> {
+        &mut self.specs
+    }
+
+    /// Get the field id of an existing field with the same signature, if any.
+    pub fn get_field_id(&self, field: &PartitionField) -> Option<&String> {
+        let signature = field.signature();
+        self.sig_to_field_id.get(&signature)
+    }
+
+    /// Get the signature of an existing field with the same field id, if any.
+    pub fn get_signature(&self, field: &PartitionField) -> Option<&String> {
+        self.field_id_to_sig.get(&field.field_id)
+    }
+}
+
+/// A partition specification defines how to derive partition values from a record in a
+/// partitioned namespace.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct PartitionSpec {
+    /// Spec version id, matching the N in `partition_spec_vN`.
+    pub id: i32,
+    /// Fields in this spec in evaluation order.
+    pub fields: Vec<PartitionField>,
+}
+
+impl PartitionSpec {
+    /// Convert from the JSON representation stored in __manifest metadata.
+    pub fn from_json(json: &JsonPartitionSpec) -> lance_core::Result<Self> {
+        let mut fields = Vec::with_capacity(json.fields.len());
+        for f in &json.fields {
+            fields.push(PartitionField::from_json(f)?);
+        }
+        Ok(Self {
+            id: json.id,
+            fields,
+        })
+    }
+
+    /// Convert to the JSON representation for storing in __manifest metadata.
+    pub fn to_json(&self) -> JsonPartitionSpec {
+        JsonPartitionSpec {
+            id: self.id,
+            fields: self.fields.iter().map(PartitionField::to_json).collect(),
+        }
+    }
+
+    pub fn spec_id_str(&self) -> String {
+        format!("v{}", self.id)
+    }
+}
+
+/// Supported well-known partition transforms.
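+///
+/// Informal semantics (a sketch; see the transform evaluation code for the authoritative
+/// behavior): `Bucket { num_buckets: 16 }` hashes the source value into one of 16 buckets,
+/// `Truncate { width: 4 }` keeps a width-4 prefix of the value, and
+/// `Year`/`Month`/`Day`/`Hour` derive the corresponding component of a date or timestamp.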
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum PartitionTransform {
+    Identity,
+    Year,
+    Month,
+    Day,
+    Hour,
+    Bucket { num_buckets: i32 },
+    MultiBucket { num_buckets: i32 },
+    Truncate { width: i32 },
+}
+
+fn first_id_name<'a>(schema: &'a SchemaRef, source_ids: &[i32]) -> Result<&'a str> {
+    let id = source_ids.first().ok_or_else(|| Error::InvalidInput {
+        source: "source_ids should have at least one element".into(),
+        location: location!(),
+    })?;
+    let id = *id as usize;
+
+    let field = schema.fields().get(id).ok_or_else(|| Error::InvalidInput {
+        source: format!(
+            "source_id {} is out of bounds for schema with {} fields",
+            id,
+            schema.fields().len()
+        )
+        .into(),
+        location: location!(),
+    })?;
+    Ok(field.name())
+}
+
+/// Partition field definition.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct PartitionField {
+    /// Unique identifier for this partition field
+    pub field_id: String,
+    /// Field IDs of the source columns in the schema
+    pub source_ids: Vec<i32>,
+    /// Well-known transform to apply. Exactly one of `transform` or `expression`
+    /// should be set.
+    pub transform: Option<PartitionTransform>,
+    /// Custom SQL expression used when `transform` is not set.
+    pub expression: Option<String>,
+    /// Result type of the partition value
+    pub result_type: DataType,
+}
+
+impl PartitionField {
+    /// Convert this field into its JSON representation.
+    pub fn to_json(&self) -> JsonPartitionField {
+        let transform = self.transform.as_ref().map(|t| match t {
+            PartitionTransform::Identity => Box::new(JsonPartitionTransform {
+                r#type: "identity".to_string(),
+                num_buckets: None,
+                width: None,
+            }),
+            PartitionTransform::Year => Box::new(JsonPartitionTransform {
+                r#type: "year".to_string(),
+                num_buckets: None,
+                width: None,
+            }),
+            PartitionTransform::Month => Box::new(JsonPartitionTransform {
+                r#type: "month".to_string(),
+                num_buckets: None,
+                width: None,
+            }),
+            PartitionTransform::Day => Box::new(JsonPartitionTransform {
+                r#type: "day".to_string(),
+                num_buckets: None,
+                width: None,
+            }),
+            PartitionTransform::Hour => Box::new(JsonPartitionTransform {
+                r#type: "hour".to_string(),
+                num_buckets: None,
+                width: None,
+            }),
+            PartitionTransform::Bucket { num_buckets } => Box::new(JsonPartitionTransform {
+                r#type: "bucket".to_string(),
+                num_buckets: Some(*num_buckets),
+                width: None,
+            }),
+            PartitionTransform::MultiBucket { num_buckets } => Box::new(JsonPartitionTransform {
+                r#type: "multi_bucket".to_string(),
+                num_buckets: Some(*num_buckets),
+                width: None,
+            }),
+            PartitionTransform::Truncate { width } => Box::new(JsonPartitionTransform {
+                r#type: "truncate".to_string(),
+                num_buckets: None,
+                width: Some(*width),
+            }),
+        });
+
+        JsonPartitionField {
+            field_id: self.field_id.clone(),
+            source_ids: self.source_ids.clone(),
+            transform,
+            expression: self.expression.clone(),
+            result_type: Box::new(datatype_to_json_type(&self.result_type)),
+        }
+    }
+
+    /// Construct a `PartitionField` from its JSON representation.
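+    ///
+    /// Fails unless exactly one of `transform` or `expression` is set in the
+    /// JSON. An illustrative round-trip sketch (the concrete `field` is
+    /// arbitrary; compare the unit tests in this module):
+    ///
+    /// ```ignore
+    /// let json = field.to_json();
+    /// let back = PartitionField::from_json(&json)?;
+    /// assert_eq!(back, field);
+    /// ```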
+ pub fn from_json(json: &JsonPartitionField) -> lance_core::Result { + let has_transform = json.transform.is_some(); + let has_expression = json + .expression + .as_ref() + .map(|e| !e.trim().is_empty()) + .unwrap_or(false); + + if has_transform == has_expression { + return Err(lance_core::Error::Namespace { + source: "Exactly one of transform or expression must be set".into(), + location: snafu::location!(), + }); + } + + let transform = json + .transform + .as_ref() + .map(|t| { + let result = match t.r#type.as_str() { + "identity" => PartitionTransform::Identity, + "year" => PartitionTransform::Year, + "month" => PartitionTransform::Month, + "day" => PartitionTransform::Day, + "hour" => PartitionTransform::Hour, + "bucket" => PartitionTransform::Bucket { + num_buckets: t.num_buckets.unwrap_or(0), + }, + "multi_bucket" => PartitionTransform::MultiBucket { + num_buckets: t.num_buckets.unwrap_or(0), + }, + "truncate" => PartitionTransform::Truncate { + width: t.width.unwrap_or(0), + }, + other => { + return Err(lance_core::Error::Namespace { + source: format!("Unsupported partition transform: {}", other).into(), + location: snafu::location!(), + }); + } + }; + Ok(result) + }) + .transpose()?; + + Ok(Self { + field_id: json.field_id.clone(), + source_ids: json.source_ids.clone(), + transform, + expression: json.expression.clone(), + result_type: json_type_to_datatype(&json.result_type)?, + }) + } + + /// Parse partition value from the record. The record should contain exactly one row. + pub async fn value(&self, record: &RecordBatch) -> Result> { + if record.num_rows() != 1 { + return Err(Error::InvalidInput { + source: "record must contain exactly one row".into(), + location: location!(), + }); + } + + let array = match (self.expression.as_ref(), self.transform.as_ref()) { + (Some(expr), None) => parse_partition_value_from_expr(record, expr).await?, + (None, Some(transform)) => { + parse_partition_value_from_transform(&self.source_ids, record, transform).await? + } + _ => { + return Err(Error::Internal { + message: "expression and transform can't both be set or unset".to_string(), + location: location!(), + }) + } + }; + + if array.is_empty() { + return Err(Error::Internal { + message: "partition expression returned empty array".to_string(), + location: location!(), + }); + } + Ok(array) + } + + /// Signature of this field + pub fn signature(&self) -> String { + let mut out = String::new(); + out.push_str("src="); + out.push_str(&format!("{:?}", self.source_ids)); + out.push(';'); + match (&self.transform, &self.expression) { + (Some(t), None) => { + let t = match t { + PartitionTransform::Identity => "identity".to_string(), + PartitionTransform::Year => "year".to_string(), + PartitionTransform::Month => "month".to_string(), + PartitionTransform::Day => "day".to_string(), + PartitionTransform::Hour => "hour".to_string(), + PartitionTransform::Bucket { num_buckets } => { + format!("bucket:{}", num_buckets) + } + PartitionTransform::MultiBucket { num_buckets } => { + format!("multi_bucket:{}", num_buckets) + } + PartitionTransform::Truncate { width } => format!("truncate:{}", width), + }; + out.push_str(&format!("t={};", t)); + } + (None, Some(e)) => out.push_str(&format!("e={};", e.trim())), + _ => out.push_str("invalid;"), + } + + let rt = datatype_to_json_type(&self.result_type); + out.push_str(&format!("rt={}", rt.r#type.to_lowercase())); + out + } + + /// Transform the input predicate (in CDF format) into a filter of `__manifest`. 
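+    ///
+    /// The `cdf` argument is an OR-of-ANDs: the outer `Vec` is a disjunction
+    /// of clauses and each inner `Vec` is a conjunction of atoms, so a filter
+    /// like `(a = 1 AND b = 2) OR (c = 3)` arrives as `[[a = 1, b = 2], [c = 3]]`.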
+ /// + /// This is a best-effort partition pruning rewrite for a single [`PartitionField`]. + /// It must be conservative: if we cannot safely rewrite a clause, we return a + /// literal TRUE for that clause (i.e. keep all partitions). + /// + /// Per OR-clause (AND of atoms): + /// - If we can rewrite atoms directly against the manifest partition column + /// (identity transform, or well-known `date_part`/`year`/`month`/`day`/`hour` + /// predicates for time transforms), we do so. + /// - Otherwise, if *all* source columns are constrained by equality to literals, + /// we synthesize a single-row [`RecordBatch`], reuse [`PartitionField::value`] + /// to compute the partition value, then rewrite into + /// `partition_field_{field_id} == `. + pub async fn partition_prune(&self, schema: Schema, cdf: &Vec>) -> Result { + // Resolve source column names from schema and source_ids. + let mut source_col_names: Vec = Vec::with_capacity(self.source_ids.len()); + for source_id in &self.source_ids { + let f = schema + .field_by_id(*source_id) + .ok_or_else(|| Error::InvalidInput { + source: format!("Field id {} not found in schema", source_id).into(), + location: location!(), + })?; + source_col_names.push(f.name.clone()); + } + let source_col_set: HashSet = source_col_names.iter().cloned().collect(); + + let manifest_col = format!("partition_field_{}", self.field_id); + let mut manifest_expr = lit(false); + + for clause in cdf { + // Collect atoms that are relevant to this field (they reference any source column). + let mut relevant_atoms: Vec = Vec::new(); + for atom in clause { + if expr_references_any_column(atom, &source_col_set) { + relevant_atoms.push(atom.clone()); + } + } + + // If this OR-clause doesn't restrict this partition field at all, it + // cannot prune partitions. + if relevant_atoms.is_empty() { + manifest_expr = manifest_expr.or(lit(true)); + continue; + } + + // 1) Try direct rewrites (do not require full equality coverage). + let mut clause_pred = lit(true); + let mut rewrote_any = false; + for atom in &relevant_atoms { + let rewritten = match (&self.transform, &self.expression) { + (Some(PartitionTransform::Identity), None) if source_col_names.len() == 1 => { + rewrite_identity_atom(atom, &source_col_names[0], &manifest_col) + } + (Some(PartitionTransform::Year), None) if source_col_names.len() == 1 => { + rewrite_time_transform_atom( + atom, + &source_col_names[0], + &manifest_col, + "year", + ) + } + (Some(PartitionTransform::Month), None) if source_col_names.len() == 1 => { + rewrite_time_transform_atom( + atom, + &source_col_names[0], + &manifest_col, + "month", + ) + } + (Some(PartitionTransform::Day), None) if source_col_names.len() == 1 => { + rewrite_time_transform_atom( + atom, + &source_col_names[0], + &manifest_col, + "day", + ) + } + (Some(PartitionTransform::Hour), None) if source_col_names.len() == 1 => { + rewrite_time_transform_atom( + atom, + &source_col_names[0], + &manifest_col, + "hour", + ) + } + _ => None, + }; + + // If we cannot rewrite this atom, keep it as TRUE (conservative). + if let Some(expr) = rewritten { + rewrote_any = true; + clause_pred = clause_pred.and(expr); + } else { + clause_pred = clause_pred.and(lit(true)); + } + } + + // If we rewrote at least one atom, we can use it for pruning. + if rewrote_any { + manifest_expr = manifest_expr.or(clause_pred); + continue; + } + + // 2) Try equality-driven value computation. 
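+            // For example, with `Bucket { num_buckets: 16 }` over `country`,
+            // the clause `country = 'US'` lets us evaluate the transform on a
+            // synthetic one-row batch {country: "US"} and keep only the
+            // partition whose manifest value equals that bucket number.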
+ // This requires that every relevant atom is an equality between a source column + // and a literal, and that all source columns are covered. + let mut eq_map: HashMap = HashMap::new(); + let mut contradictory = false; + let mut all_relevant_are_eq = true; + + for atom in &relevant_atoms { + if let Some((col_name, scalar)) = extract_eq_on_source_column(atom, &source_col_set) + { + if let Some(existing) = eq_map.get(&col_name) { + if existing != &scalar { + contradictory = true; + break; + } + } + eq_map.insert(col_name, scalar); + } else { + all_relevant_are_eq = false; + break; + } + } + + if contradictory { + // This AND-clause is unsatisfiable; no partitions can match. + manifest_expr = manifest_expr.or(lit(false)); + continue; + } + + let covers_all_sources = source_col_names + .iter() + .all(|name| eq_map.contains_key(name)); + + if all_relevant_are_eq && covers_all_sources { + if let Some(rb) = build_single_row_record_batch(&schema, &eq_map) { + // Compute the partition value using the same logic as table creation. + let arr = self.value(&rb).await?; + if let Ok(scalar) = ScalarValue::try_from_array(&arr, 0) { + let base = col(&manifest_col).eq(lit(scalar)); + manifest_expr = manifest_expr.or(base.or(col(&manifest_col).is_null())); + continue; + } + } + } + + // 3) Fallback: cannot safely prune this clause. + manifest_expr = manifest_expr.or(lit(true)); + } + + Ok(manifest_expr) + } +} + +fn expr_references_any_column(expr: &Expr, cols: &HashSet) -> bool { + match expr { + Expr::Column(c) => cols.contains(&c.name), + Expr::BinaryExpr(b) => { + expr_references_any_column(&b.left, cols) || expr_references_any_column(&b.right, cols) + } + Expr::IsNull(e) | Expr::IsNotNull(e) => expr_references_any_column(e, cols), + Expr::Cast(c) => expr_references_any_column(&c.expr, cols), + Expr::TryCast(c) => expr_references_any_column(&c.expr, cols), + Expr::ScalarFunction(fun) => fun.args.iter().any(|a| expr_references_any_column(a, cols)), + _ => false, + } +} + +fn extract_eq_on_source_column( + atom: &Expr, + source_cols: &HashSet, +) -> Option<(String, ScalarValue)> { + let Expr::BinaryExpr(binary) = atom else { + return None; + }; + if binary.op != Operator::Eq { + return None; + } + match (&*binary.left, &*binary.right) { + (Expr::Column(c), Expr::Literal(v, _)) if source_cols.contains(&c.name) => { + Some((c.name.clone(), v.clone())) + } + (Expr::Literal(v, _), Expr::Column(c)) if source_cols.contains(&c.name) => { + Some((c.name.clone(), v.clone())) + } + _ => None, + } +} + +fn rewrite_identity_atom(atom: &Expr, source_col: &str, manifest_col: &str) -> Option { + match atom { + Expr::BinaryExpr(binary) if is_comparison_op(binary.op) => { + match (&*binary.left, &*binary.right) { + (Expr::Column(c), Expr::Literal(v, _)) if c.name == source_col => { + let base = + Expr::BinaryExpr(lance::deps::datafusion::logical_expr::BinaryExpr { + left: Box::new(col(manifest_col)), + op: binary.op, + right: Box::new(Expr::Literal(v.clone(), None)), + }); + Some(base.or(col(manifest_col).is_null())) + } + (Expr::Literal(v, _), Expr::Column(c)) if c.name == source_col => { + let base = + Expr::BinaryExpr(lance::deps::datafusion::logical_expr::BinaryExpr { + left: Box::new(Expr::Literal(v.clone(), None)), + op: binary.op, + right: Box::new(col(manifest_col)), + }); + Some(base.or(col(manifest_col).is_null())) + } + _ => None, + } + } + Expr::IsNull(e) if matches!(e.as_ref(), Expr::Column(c) if c.name == source_col) => { + Some(col(manifest_col).is_null()) + } + Expr::IsNotNull(e) if matches!(e.as_ref(), 
Expr::Column(c) if c.name == source_col) => { + Some(col(manifest_col).is_not_null()) + } + _ => None, + } +} + +fn rewrite_time_transform_atom( + atom: &Expr, + source_col: &str, + manifest_col: &str, + unit: &str, +) -> Option { + let Expr::BinaryExpr(binary) = atom else { + return None; + }; + if !is_comparison_op(binary.op) { + return None; + } + + let is_matching_transform_call = |expr: &Expr| -> bool { + let Expr::ScalarFunction(fun) = expr else { + return false; + }; + + // Accept either `year(col)` style or `date_part('year', col)` style. + let is_unit_fn = fun.name() == unit && fun.args.len() == 1; + let is_date_part = fun.name() == "date_part" + && fun.args.len() == 2 + && matches!(&fun.args[0], Expr::Literal(v, _) if matches!(v, ScalarValue::Utf8(Some(s)) if s == unit)); + + let col_arg = if is_unit_fn { + fun.args.first() + } else if is_date_part { + fun.args.get(1) + } else { + None + }; + let Some(col_arg) = col_arg else { + return false; + }; + matches!(col_arg, Expr::Column(c) if c.name == source_col) + }; + + // func(col) literal + if is_matching_transform_call(&binary.left) { + let base = Expr::BinaryExpr(lance::deps::datafusion::logical_expr::BinaryExpr { + left: Box::new(col(manifest_col)), + op: binary.op, + right: Box::new(binary.right.as_ref().clone()), + }); + return Some(base.or(col(manifest_col).is_null())); + } + + // literal func(col) + if is_matching_transform_call(&binary.right) { + let base = Expr::BinaryExpr(lance::deps::datafusion::logical_expr::BinaryExpr { + left: Box::new(binary.left.as_ref().clone()), + op: binary.op, + right: Box::new(col(manifest_col)), + }); + return Some(base.or(col(manifest_col).is_null())); + } + + None +} + +fn build_single_row_record_batch( + schema: &Schema, + eq_map: &HashMap, +) -> Option { + // Build an Arrow schema from Lance schema. + let arrow_fields: Vec = schema.fields.iter().map(ArrowField::from).collect(); + let rb_schema: SchemaRef = Arc::new(ArrowSchema::new(arrow_fields)); + + let mut arrays: Vec = Vec::with_capacity(schema.fields.len()); + for f in schema.fields.iter() { + if let Some(scalar) = eq_map.get(&f.name) { + // Ensure the literal's array type matches the field type; otherwise, + // give up pruning for this clause. + let arr = scalar.to_array_of_size(1).ok()?; + if arr.data_type() != &f.data_type() { + return None; + } + arrays.push(arr); + } else { + arrays.push(new_null_array(&f.data_type(), 1)); + } + } + + RecordBatch::try_new(rb_schema, arrays).ok() +} + +/// Evaluate a partition expression using DataFusion and return the resulting +/// Arrow array (single column) for the given record batch. +async fn parse_partition_value_from_expr( + record: &RecordBatch, + expr: &str, +) -> Result> { + let ctx = SessionContext::new(); + ctx.register_udf(MURMUR3_MULTI_UDF.clone()); + ctx.register_batch("record_batch", record.clone())?; + let df = ctx + .sql(&format!("SELECT {} FROM record_batch", expr)) + .await?; + let records = df.collect().await?; + let partition_batch = records.first().ok_or_else(|| Error::Internal { + message: "expect one row of partition value but got nothing".to_string(), + location: location!(), + })?; + let partition_col = partition_batch.column(0); + Ok(Arc::clone(partition_col)) +} + +/// Compute partition values using the transform description by delegating to +/// the expression-based path. The resulting array type matches whatever +/// `parse_partition_value_from_expr` would return for the equivalent SQL +/// expression. 
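+/// For reference, the SQL generated per transform over a source column `c`
+/// (matching the match arms below): identity -> `c`; year/month/day/hour ->
+/// `date_part('<unit>', c)`; bucket(n) / multi_bucket(n) ->
+/// `abs(murmur3_multi(...)) % n`; truncate(w) -> `substring(c, 1, w)` for
+/// strings, `c - (c % w)` otherwise.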
+/// TODO: implement parse logic by code instead of datafusion + expr for better performance. +async fn parse_partition_value_from_transform( + ids: &[i32], + record: &RecordBatch, + transform: &PartitionTransform, +) -> Result> { + if record.num_columns() == 0 { + return Err(Error::InvalidInput { + source: "record must contain at least one column".into(), + location: location!(), + }); + } + + // Map transform to an equivalent SQL expression over the record batch. + let expr = match transform { + PartitionTransform::Identity => first_id_name(&record.schema(), ids)?.to_string(), + PartitionTransform::Year => format!( + "date_part('year', {})", + first_id_name(&record.schema(), ids)? + ), + PartitionTransform::Month => format!( + "date_part('month', {})", + first_id_name(&record.schema(), ids)? + ), + PartitionTransform::Day => format!( + "date_part('day', {})", + first_id_name(&record.schema(), ids)? + ), + PartitionTransform::Hour => format!( + "date_part('hour', {})", + first_id_name(&record.schema(), ids)? + ), + PartitionTransform::Bucket { num_buckets } => { + if *num_buckets <= 0 { + return Err(Error::InvalidInput { + source: format!("num_buckets must be positive, got {}", num_buckets).into(), + location: location!(), + }); + } + format!( + "abs(murmur3_multi({})) % {}", + first_id_name(&record.schema(), ids)?, + num_buckets + ) + } + PartitionTransform::MultiBucket { num_buckets } => { + if *num_buckets <= 0 { + return Err(Error::InvalidInput { + source: format!("num_buckets must be positive, got {}", num_buckets).into(), + location: location!(), + }); + } + let cols: Vec = ids + .iter() + .map(|id| record.schema().field(*id as usize).name().to_string()) + .collect(); + if cols.is_empty() { + return Err(Error::InvalidInput { + source: "source_ids should have at least one element".into(), + location: location!(), + }); + } + format!("abs(murmur3_multi({})) % {}", cols.join(", "), num_buckets) + } + PartitionTransform::Truncate { width } => { + if *width <= 0 { + return Err(Error::InvalidInput { + source: format!("truncate width must be positive, got {}", width).into(), + location: location!(), + }); + } + + let schema = record.schema(); + let id = *ids.first().ok_or_else(|| Error::InvalidInput { + source: "source_ids should have at least one element".into(), + location: location!(), + })? as usize; + let field = schema.fields().get(id).ok_or_else(|| Error::InvalidInput { + source: format!( + "source_id {} is out of bounds for record schema with {} fields", + id, + schema.fields().len() + ) + .into(), + location: location!(), + })?; + + match field.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + format!("substring({}, 1, {})", field.name(), width) + } + _ => format!("{} - ({} % {})", field.name(), field.name(), width), + } + } + }; + + parse_partition_value_from_expr(record, &expr).await +} + +pub(crate) fn scalar_to_bytes(array: &dyn arrow::array::Array, row: usize) -> Result> { + if array.is_null(row) { + return Ok(Vec::new()); + } + + macro_rules! to_bytes_primitive { + ($array_ty:ty, $array:expr, $row:expr) => {{ + let a = + $array + .as_any() + .downcast_ref::<$array_ty>() + .ok_or_else(|| Error::InvalidInput { + source: format!( + "Expected array type '{}' but got '{:?}'", + stringify!($array_ty), + $array.data_type() + ) + .into(), + location: location!(), + })?; + a.value($row).to_le_bytes().to_vec() + }}; + } + + macro_rules! 
to_bytes_utf8_like { + ($array_ty:ty, $array:expr, $row:expr) => {{ + let a = + $array + .as_any() + .downcast_ref::<$array_ty>() + .ok_or_else(|| Error::InvalidInput { + source: format!( + "Expected array type '{}' but got '{:?}'", + stringify!($array_ty), + $array.data_type() + ) + .into(), + location: location!(), + })?; + a.value($row).as_bytes().to_vec() + }}; + } + + macro_rules! to_bytes_binary_like { + ($array_ty:ty, $array:expr, $row:expr) => {{ + let a = + $array + .as_any() + .downcast_ref::<$array_ty>() + .ok_or_else(|| Error::InvalidInput { + source: format!( + "Expected array type '{}' but got '{:?}'", + stringify!($array_ty), + $array.data_type() + ) + .into(), + location: location!(), + })?; + a.value($row).to_vec() + }}; + } + + let dt = array.data_type(); + let bytes = match dt { + DataType::Int8 => to_bytes_primitive!(Int8Array, array, row), + DataType::Int16 => to_bytes_primitive!(Int16Array, array, row), + DataType::Int32 => to_bytes_primitive!(Int32Array, array, row), + DataType::Int64 => to_bytes_primitive!(Int64Array, array, row), + DataType::UInt8 => to_bytes_primitive!(UInt8Array, array, row), + DataType::UInt16 => to_bytes_primitive!(UInt16Array, array, row), + DataType::UInt32 => to_bytes_primitive!(UInt32Array, array, row), + DataType::UInt64 => to_bytes_primitive!(UInt64Array, array, row), + DataType::Float32 => to_bytes_primitive!(Float32Array, array, row), + DataType::Float64 => to_bytes_primitive!(Float64Array, array, row), + DataType::Date32 => to_bytes_primitive!(Date32Array, array, row), + DataType::Date64 => to_bytes_primitive!(Date64Array, array, row), + DataType::Timestamp(arrow_schema::TimeUnit::Second, _) => { + to_bytes_primitive!(TimestampSecondArray, array, row) + } + DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => { + to_bytes_primitive!(TimestampMillisecondArray, array, row) + } + DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => { + to_bytes_primitive!(TimestampMicrosecondArray, array, row) + } + DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => { + to_bytes_primitive!(TimestampNanosecondArray, array, row) + } + DataType::Utf8 => to_bytes_utf8_like!(StringArray, array, row), + DataType::LargeUtf8 => to_bytes_utf8_like!(LargeStringArray, array, row), + DataType::Binary => to_bytes_binary_like!(BinaryArray, array, row), + DataType::LargeBinary => to_bytes_binary_like!(LargeBinaryArray, array, row), + _ => { + let s = array_value_to_string(array, row).map_err(lance_core::Error::from)?; + s.into_bytes() + } + }; + + Ok(bytes) +} + +fn datatype_to_json_type(dt: &DataType) -> lance_namespace::models::JsonArrowDataType { + use lance_namespace::models::JsonArrowDataType; + + let type_name = match dt { + DataType::Boolean => "bool", + DataType::Int8 => "int8", + DataType::Int16 => "int16", + DataType::Int32 => "int32", + DataType::Int64 => "int64", + DataType::UInt8 => "uint8", + DataType::UInt16 => "uint16", + DataType::UInt32 => "uint32", + DataType::UInt64 => "uint64", + DataType::Float16 => "float16", + DataType::Float32 => "float32", + DataType::Float64 => "float64", + DataType::Date32 => "date32", + DataType::Date64 => "date64", + DataType::Utf8 => "utf8", + DataType::Binary => "binary", + other => { + // Fallback to debug string for unsupported types. This keeps the + // conversion infallible while still surfacing the type. 
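+            // Note that such debug strings do not round-trip:
+            // `json_type_to_datatype` below rejects them as unsupported.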
+            return JsonArrowDataType::new(format!("{:?}", other));
+        }
+    };
+
+    JsonArrowDataType::new(type_name.to_string())
+}
+
+fn json_type_to_datatype(
+    json: &lance_namespace::models::JsonArrowDataType,
+) -> lance_core::Result<DataType> {
+    let type_name = json.r#type.to_lowercase();
+    let dt = match type_name.as_str() {
+        "bool" | "boolean" => DataType::Boolean,
+        "int8" => DataType::Int8,
+        "int16" => DataType::Int16,
+        "int32" => DataType::Int32,
+        "int64" => DataType::Int64,
+        "uint8" => DataType::UInt8,
+        "uint16" => DataType::UInt16,
+        "uint32" => DataType::UInt32,
+        "uint64" => DataType::UInt64,
+        "float16" => DataType::Float16,
+        "float32" => DataType::Float32,
+        "float64" => DataType::Float64,
+        "date32" => DataType::Date32,
+        "date64" => DataType::Date64,
+        "utf8" => DataType::Utf8,
+        "binary" => DataType::Binary,
+        other => {
+            return Err(lance_core::Error::Namespace {
+                source: format!("Unsupported partition field result type: {}", other).into(),
+                location: snafu::location!(),
+            });
+        }
+    };
+
+    Ok(dt)
+}
+
+/// Generate a random 16-character base36 identifier (a-z0-9).
+fn random_partition_namespace_id() -> String {
+    const CHARS: &[u8] = b"abcdefghijklmnopqrstuvwxyz0123456789";
+    let mut buf = [0u8; 16];
+    for b in &mut buf {
+        let idx = (rand::random::<u8>() as usize) % CHARS.len();
+        *b = CHARS[idx];
+    }
+    // Safety: all bytes are ASCII alphanumerics.
+    String::from_utf8_lossy(&buf).into_owned()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::dir::manifest::TableInfo;
+    use arrow::array::{BinaryArray, Date32Array, Int32Array, RecordBatch, StringArray};
+    use arrow_schema::{Field, Field as ArrowField, Schema, Schema as ArrowSchema};
+    use lance_core::utils::tempfile::TempStdDir;
+    use lance_namespace::models::JsonArrowDataType;
+    use std::collections::{HashMap, HashSet};
+
+    fn const_bool(expr: &Expr) -> Option<bool> {
+        match expr {
+            Expr::Literal(ScalarValue::Boolean(Some(b)), _) => Some(*b),
+            Expr::BinaryExpr(b) if b.op == Operator::And => {
+                Some(const_bool(&b.left)? && const_bool(&b.right)?)
+            }
+            Expr::BinaryExpr(b) if b.op == Operator::Or => {
+                Some(const_bool(&b.left)? || const_bool(&b.right)?)
+ } + _ => None, + } + } + + fn collect_column_names(expr: &Expr, out: &mut Vec) { + match expr { + Expr::Column(c) => out.push(c.name.clone()), + Expr::BinaryExpr(b) => { + collect_column_names(&b.left, out); + collect_column_names(&b.right, out); + } + Expr::IsNull(e) | Expr::IsNotNull(e) => collect_column_names(e, out), + Expr::Cast(c) => collect_column_names(&c.expr, out), + Expr::TryCast(c) => collect_column_names(&c.expr, out), + Expr::ScalarFunction(fun) => fun.args.iter().for_each(|a| collect_column_names(a, out)), + _ => {} + } + } + + async fn setup_multi_version_namespace( + temp_path: &str, + ) -> ( + PartitionedNamespace, + PartitionSpec, + PartitionSpec, + Vec<(PartitionTable, Vec)>, + Vec<(PartitionTable, Vec)>, + ) { + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("ts", DataType::Date32, true), + ArrowField::new("country", DataType::Utf8, true), + ArrowField::new("business_unit", DataType::Int32, true), + ]); + let schema = lance_core::datatypes::Schema::try_from(&arrow_schema).unwrap(); + + let spec_v1 = PartitionSpec { + id: 1, + fields: vec![ + PartitionField { + field_id: "event_year".to_string(), + source_ids: vec![0], + transform: Some(PartitionTransform::Year), + expression: None, + result_type: DataType::Int32, + }, + PartitionField { + field_id: "country".to_string(), + source_ids: vec![1], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Utf8, + }, + ], + }; + + let ns = PartitionedNamespaceBuilder::new(temp_path) + .schema(schema) + .partition_spec(spec_v1.clone()) + .build() + .await + .unwrap(); + + // v1 tables + let v1_vals_1 = vec![ + ScalarValue::Int32(Some(2020)), + ScalarValue::Utf8(Some("US".to_string())), + ]; + let v1_t1 = ns + .create_partition_table(&spec_v1, &v1_vals_1) + .await + .unwrap(); + + let v1_vals_2 = vec![ + ScalarValue::Int32(Some(2021)), + ScalarValue::Utf8(Some("CN".to_string())), + ]; + let v1_t2 = ns + .create_partition_table(&spec_v1, &v1_vals_2) + .await + .unwrap(); + + // evolve to v2 + let spec_v2 = ns + .update_partition_spec(vec![ + PartitionField { + field_id: "business_unit".to_string(), + source_ids: vec![2], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Int32, + }, + PartitionField { + field_id: "country".to_string(), + source_ids: vec![1], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Utf8, + }, + ]) + .await + .unwrap(); + + // v2 tables + let v2_vals_1 = vec![ + ScalarValue::Int32(Some(1)), + ScalarValue::Utf8(Some("US".to_string())), + ]; + let v2_t1 = ns + .create_partition_table(&spec_v2, &v2_vals_1) + .await + .unwrap(); + + let v2_vals_2 = vec![ + ScalarValue::Int32(Some(2)), + ScalarValue::Utf8(Some("FR".to_string())), + ]; + let v2_t2 = ns + .create_partition_table(&spec_v2, &v2_vals_2) + .await + .unwrap(); + + ( + ns, + spec_v1, + spec_v2, + vec![(v1_t1, v1_vals_1), (v1_t2, v1_vals_2)], + vec![(v2_t1, v2_vals_1), (v2_t2, v2_vals_2)], + ) + } + + #[test] + fn partition_field_json_transform() { + let field = PartitionField { + field_id: "event_year".to_string(), + source_ids: vec![1], + transform: Some(PartitionTransform::Year), + expression: None, + result_type: DataType::Int32, + }; + + let json = field.to_json(); + assert_eq!(json.field_id, "event_year"); + assert!(json.expression.is_none()); + let transform = json.transform.as_ref().expect("transform should be set"); + assert_eq!(transform.r#type, "year"); + + let other_field = 
PartitionField::from_json(&json).expect("from_json should succeed"); + assert_eq!(other_field.field_id, "event_year"); + assert_eq!(other_field.source_ids, vec![1]); + assert_eq!(other_field.transform, Some(PartitionTransform::Year)); + assert_eq!(other_field.expression, None); + assert_eq!(other_field.result_type, DataType::Int32); + } + + #[tokio::test] + async fn test_partition_prune_rewrites_to_manifest_column() { + // Case 1: identity transform (direct rewrite) + { + let arrow_schema = + ArrowSchema::new(vec![ArrowField::new("country", DataType::Utf8, true)]); + let schema = lance_core::datatypes::Schema::try_from(&arrow_schema).unwrap(); + + let field = PartitionField { + field_id: "country".to_string(), + source_ids: vec![0], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Utf8, + }; + + let filter = col("country").eq(lit("US")); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema, &cdf).await.unwrap(); + + let mut cols = Vec::new(); + collect_column_names(&manifest_filter, &mut cols); + assert!(cols.contains(&"partition_field_country".to_string())); + assert!(!cols.contains(&"country".to_string())); + } + + // Case 2: time transform (direct rewrite via SQL-parsed scalar functions) + { + let arrow_schema = + ArrowSchema::new(vec![ArrowField::new("ts", DataType::Date32, true)]); + let schema = lance_core::datatypes::Schema::try_from(&arrow_schema).unwrap(); + let field = PartitionField { + field_id: "event_year".to_string(), + source_ids: vec![0], + transform: Some(PartitionTransform::Year), + expression: None, + result_type: DataType::Int32, + }; + + // date_part('year', ts) = 2020 + let filter = + super::parse_filter_expr_from_sql("date_part('year', ts) = 2020", &arrow_schema) + .await + .unwrap(); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema, &cdf).await.unwrap(); + + let mut cols = Vec::new(); + collect_column_names(&manifest_filter, &mut cols); + assert!(cols.contains(&"partition_field_event_year".to_string())); + assert!(!cols.contains(&"ts".to_string())); + } + } + + #[tokio::test] + async fn test_partition_prune_bucket_computes_partition_value() { + // Case 1: single source_id bucket transform, can prune by computing partition value. + { + let arrow_schema = + ArrowSchema::new(vec![ArrowField::new("country", DataType::Utf8, true)]); + let schema = lance_core::datatypes::Schema::try_from(&arrow_schema).unwrap(); + + let field = PartitionField { + field_id: "country_bucket".to_string(), + source_ids: vec![0], + transform: Some(PartitionTransform::Bucket { num_buckets: 16 }), + expression: None, + result_type: DataType::Int64, + }; + + let filter = col("country").eq(lit("US")); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema, &cdf).await.unwrap(); + + let mut cols = Vec::new(); + collect_column_names(&manifest_filter, &mut cols); + assert!(cols.contains(&"partition_field_country_bucket".to_string())); + assert!(!cols.contains(&"country".to_string())); + } + + // Case 2: multi source_ids, can prune only when all sources are constrained by equality. 
+ { + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("country", DataType::Utf8, true), + ArrowField::new("business_unit", DataType::Int32, true), + ]); + let schema = lance_core::datatypes::Schema::try_from(&arrow_schema).unwrap(); + + let field = PartitionField { + field_id: "mb".to_string(), + source_ids: vec![0, 1], + transform: Some(PartitionTransform::MultiBucket { num_buckets: 16 }), + expression: None, + result_type: DataType::Int64, + }; + + // 2.1 can prune + let filter = col("country") + .eq(lit("US")) + .and(col("business_unit").eq(lit(1i32))); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema.clone(), &cdf).await.unwrap(); + + assert_ne!(const_bool(&manifest_filter), Some(true)); + let mut cols = Vec::new(); + collect_column_names(&manifest_filter, &mut cols); + assert!(cols.contains(&"partition_field_mb".to_string())); + assert!(!cols.contains(&"country".to_string())); + assert!(!cols.contains(&"business_unit".to_string())); + + // 2.2 cannot prune: missing equality on one source + let filter = col("country").eq(lit("US")); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema.clone(), &cdf).await.unwrap(); + assert_eq!(const_bool(&manifest_filter), Some(true)); + + // 2.3 cannot prune: non-equality predicate present + let filter = col("country") + .eq(lit("US")) + .and(col("business_unit").gt(lit(1i32))); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema.clone(), &cdf).await.unwrap(); + assert_eq!(const_bool(&manifest_filter), Some(true)); + + // 2.4 contradiction: no partitions can match + let filter = col("country") + .eq(lit("US")) + .and(col("country").eq(lit("CN"))) + .and(col("business_unit").eq(lit(1i32))); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema, &cdf).await.unwrap(); + assert_eq!(const_bool(&manifest_filter), Some(false)); + } + } + + #[test] + fn partition_field_json_expression() { + let json = JsonPartitionField { + field_id: "country".to_string(), + source_ids: vec![2], + transform: None, + expression: Some("col0".to_string()), + result_type: Box::new(JsonArrowDataType::new("utf8".to_string())), + }; + + let field = PartitionField::from_json(&json).expect("from_json should succeed"); + assert_eq!(field.field_id, "country"); + assert_eq!(field.source_ids, vec![2]); + assert!(field.transform.is_none()); + assert_eq!(field.expression.as_deref(), Some("col0")); + assert_eq!(field.result_type, DataType::Utf8); + + let json2 = field.to_json(); + assert!(json2.transform.is_none()); + assert_eq!(json2.expression.as_deref(), Some("col0")); + assert_eq!(json2.result_type.r#type.to_lowercase(), "utf8"); + } + + #[test] + fn partition_field_json_requires_exactly_one_of_transform_or_expression() { + let json = JsonPartitionField { + field_id: "bad".to_string(), + source_ids: vec![1], + transform: Some(Box::new(JsonPartitionTransform { + r#type: "identity".to_string(), + num_buckets: None, + width: None, + })), + expression: Some("col0".to_string()), + result_type: Box::new(JsonArrowDataType::new("int32".to_string())), + }; + + let err = PartitionField::from_json(&json).expect_err("should fail"); + assert!(err + .to_string() + .contains("Exactly one of transform or expression")); + } + + #[test] + fn partition_spec_json() { + let field1 = PartitionField { + field_id: "event_date".to_string(), + source_ids: vec![1], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: 
DataType::Date32, + }; + let field2 = PartitionField { + field_id: "country".to_string(), + source_ids: vec![2], + transform: None, + expression: Some("col0".to_string()), + result_type: DataType::Utf8, + }; + + let spec = PartitionSpec { + id: 1, + fields: vec![field1, field2], + }; + + let json_spec = spec.to_json(); + assert_eq!(json_spec.id, 1); + assert_eq!(json_spec.fields.len(), 2); + + let other_spec = PartitionSpec::from_json(&json_spec).expect("from_json should succeed"); + assert_eq!(other_spec.id, spec.id); + assert_eq!(other_spec.fields.len(), spec.fields.len()); + for (a, b) in other_spec.fields.iter().zip(spec.fields.iter()) { + assert_eq!(a.field_id, b.field_id); + assert_eq!(a.source_ids, b.source_ids); + assert_eq!(a.transform, b.transform); + assert_eq!(a.expression, b.expression); + assert_eq!(a.result_type, b.result_type); + } + } + + #[tokio::test] + async fn test_update_partition_spec_reuse_and_manifest_update() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // Minimal namespace schema (not used by update_partition_spec today, but required by constructor) + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("c0", DataType::Date32, true), + ArrowField::new("c1", DataType::Utf8, true), + ArrowField::new("c2", DataType::Int32, true), + ]); + let schema = lance_core::datatypes::Schema::try_from(&arrow_schema).unwrap(); + + let initial_spec = PartitionSpec { + id: 1, + fields: vec![ + PartitionField { + field_id: "event_year".to_string(), + source_ids: vec![0], + transform: Some(PartitionTransform::Year), + expression: None, + result_type: DataType::Int32, + }, + PartitionField { + field_id: "country".to_string(), + source_ids: vec![1], + transform: None, + expression: Some("col0".to_string()), + result_type: DataType::Utf8, + }, + ], + }; + let event_year_id_v1 = initial_spec + .fields + .iter() + .find(|f| f.transform == Some(PartitionTransform::Year)) + .unwrap() + .field_id + .clone(); + + let ns = PartitionedNamespaceBuilder::new(temp_path) + .schema(schema) + .partition_spec(initial_spec) + .build() + .await + .unwrap(); + + let spec_v2 = ns + .update_partition_spec(vec![ + // Same signature as event_year but different incoming id should be overridden + PartitionField { + field_id: "should_be_overridden".to_string(), + source_ids: vec![0], + transform: Some(PartitionTransform::Year), + expression: None, + result_type: DataType::Int32, + }, + // New field keeps requested id + PartitionField { + field_id: "business_unit".to_string(), + source_ids: vec![2], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Int32, + }, + // Existing signature should reuse field_id + PartitionField { + field_id: "ignored".to_string(), + source_ids: vec![1], + transform: None, + expression: Some("col0".to_string()), + result_type: DataType::Utf8, + }, + ]) + .await + .unwrap(); + assert_eq!(spec_v2.id, 2); + let event_year_id_v2 = spec_v2 + .fields + .iter() + .find(|f| f.transform == Some(PartitionTransform::Year)) + .unwrap() + .field_id + .clone(); + assert_eq!(event_year_id_v2, event_year_id_v1); + + // Verify __manifest table metadata has both partition_spec_v1 and partition_spec_v2 + let meta = ns.manifest.get_metadata().await.unwrap(); + assert!(meta.contains_key("partition_spec_v1")); + assert!(meta.contains_key("partition_spec_v2")); + + // Verify __manifest schema has columns for partition fields + let manifest_cols: HashSet = ns + .manifest + .get_extended_properties_keys() + 
.await + .unwrap() + .into_iter() + .collect(); + assert!(manifest_cols.contains(&format!("partition_field_{}", event_year_id_v1))); + assert!(manifest_cols.contains("partition_field_country")); + assert!(manifest_cols.contains("partition_field_business_unit")); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_identity() { + let array = Int32Array::from(vec![1]); + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new( + "col0", + DataType::Int32, + false, + )])), + vec![Arc::new(array)], + ) + .unwrap(); + + let ids = vec![0]; + let transform = PartitionTransform::Identity; + let expr = "col0"; + + let v_expr = parse_partition_value_from_expr(&batch, expr).await.unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_year_date32() { + let value: i32 = 19723; + let array = Date32Array::from(vec![value]); + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new( + "col0", + DataType::Date32, + false, + )])), + vec![Arc::new(array)], + ) + .unwrap(); + + let ids = vec![0]; + let transform = PartitionTransform::Year; + let expr = "date_part('year', col0)"; + + let v_expr = parse_partition_value_from_expr(&batch, expr).await.unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_truncate_utf8() { + let array = StringArray::from(vec!["abcdef"]); + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("col0", DataType::Utf8, false)])), + vec![Arc::new(array)], + ) + .unwrap(); + + let ids = vec![0]; + let width = 3; + let transform = PartitionTransform::Truncate { width }; + let expr = "substring(col0, 1, 3)"; + + let v_expr = parse_partition_value_from_expr(&batch, expr).await.unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_truncate_int32() { + let array = Int32Array::from(vec![17]); + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new( + "col0", + DataType::Int32, + false, + )])), + vec![Arc::new(array)], + ) + .unwrap(); + + let ids = vec![0]; + let width = 5; + let transform = PartitionTransform::Truncate { width }; + let expr = "col0 - (col0 % 5)"; + + let v_expr = parse_partition_value_from_expr(&batch, expr).await.unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_bucket_binary() { + let data: Vec<&[u8]> = vec![b"abc".as_ref()]; + let array = BinaryArray::from(data); + let batch = 
RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new( + "col0", + DataType::Binary, + false, + )])), + vec![Arc::new(array)], + ) + .unwrap(); + + let ids = vec![0]; + let num_buckets = 8; + let transform = PartitionTransform::Bucket { num_buckets }; + let expr = format!("abs(murmur3_multi(col0)) % {}", num_buckets); + + let v_expr = parse_partition_value_from_expr(&batch, &expr) + .await + .unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_multi_bucket_utf8() { + let col0 = StringArray::from(vec!["ab"]); + let col1 = StringArray::from(vec!["12"]); + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("col0", DataType::Utf8, false), + Field::new("col1", DataType::Utf8, false), + ])), + vec![Arc::new(col0), Arc::new(col1)], + ) + .unwrap(); + + let ids = vec![0, 1]; + let num_buckets = 16; + let transform = PartitionTransform::MultiBucket { num_buckets }; + let expr = format!("abs(murmur3_multi(col0, col1)) % {}", num_buckets); + + let v_expr = parse_partition_value_from_expr(&batch, &expr) + .await + .unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[test] + fn random_partition_namespace_id_is_base36() { + let id = random_partition_namespace_id(); + assert_eq!(id.len(), 16); + for ch in id.chars() { + assert!( + ch.is_ascii_lowercase() || ch.is_ascii_digit(), + "unexpected character in id: {}", + ch + ); + } + } + + #[tokio::test] + async fn test_resolve_partition_table_multiple_partition_spec_versions() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let (ns, spec_v1, spec_v2, v1_tables, v2_tables) = + setup_multi_version_namespace(temp_path).await; + + for (t, vals) in v1_tables.iter() { + let resolved = ns + .resolve_partition_table(&spec_v1, vals) + .await + .unwrap() + .expect("should resolve v1 table"); + assert_eq!(&resolved, t); + assert_eq!(resolved.id.first().map(|s| s.as_str()), Some("v1")); + } + + for (t, vals) in v2_tables.iter() { + let resolved = ns + .resolve_partition_table(&spec_v2, vals) + .await + .unwrap() + .expect("should resolve v2 table"); + assert_eq!(&resolved, t); + assert_eq!(resolved.id.first().map(|s| s.as_str()), Some("v2")); + } + + // missing partition should return None + let missing = vec![ + ScalarValue::Int32(Some(1999)), + ScalarValue::Utf8(Some("US".to_string())), + ]; + let resolved = ns + .resolve_partition_table(&spec_v1, &missing) + .await + .unwrap(); + assert!(resolved.is_none()); + } + + #[tokio::test] + async fn test_create_partition_table_multiple_versions_manifest_properties() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // create partition table + let (ns, _spec_v1, _spec_v2, v1_tables, v2_tables) = + setup_multi_version_namespace(temp_path).await; + + // verify created partition tables + let objects = ns + .manifest + .query_manifest(col("object_type").eq(lit("table"))) + .await + .unwrap(); + + let mut tables_by_id: HashMap = HashMap::new(); + let mut v1_count = 0usize; + let mut 
v2_count = 0usize; + for obj in objects { + let ManifestObject::Table(t) = obj else { + continue; + }; + if t.name != "dataset" { + continue; + } + if t.namespace.first().map(|s| s.as_str()) == Some("v1") { + v1_count += 1; + } + if t.namespace.first().map(|s| s.as_str()) == Some("v2") { + v2_count += 1; + } + + let mut id = t.namespace.clone(); + id.push(t.name.clone()); + tables_by_id.insert(id.join("."), t); + } + + assert_eq!(v1_count, v1_tables.len()); + assert_eq!(v2_count, v2_tables.len()); + + // Check one v1 table has expected extended props and no v2-only field + let (t_v1, vals_v1) = &v1_tables[0]; + let tbl = tables_by_id + .get(&t_v1.id.join(".")) + .expect("v1 table should exist in manifest"); + let props = tbl.properties.as_ref().expect("properties should exist"); + let v1_year = crate::dir::manifest::scalar_to_str(&vals_v1[0]) + .unwrap() + .unwrap(); + assert_eq!( + props + .get(&format!("{}partition_field_event_year", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + Some(v1_year.as_str()) + ); + assert_eq!( + props + .get(&format!("{}partition_field_country", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + Some("US") + ); + assert!(!props.contains_key(&format!("{}partition_field_business_unit", EXTENDED_PREFIX))); + + // Check one v2 table has expected extended props including v2-only field + let (t_v2, vals_v2) = &v2_tables[0]; + let tbl = tables_by_id + .get(&t_v2.id.join(".")) + .expect("v2 table should exist in manifest"); + let props = tbl.properties.as_ref().expect("properties should exist"); + let v2_bu = crate::dir::manifest::scalar_to_str(&vals_v2[0]) + .unwrap() + .unwrap(); + let v2_co = crate::dir::manifest::scalar_to_str(&vals_v2[1]) + .unwrap() + .unwrap(); + assert_eq!( + props + .get(&format!("{}partition_field_event_year", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + None + ); + assert_eq!( + props + .get(&format!("{}partition_field_business_unit", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + Some(v2_bu.as_str()) + ); + assert_eq!( + props + .get(&format!("{}partition_field_country", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + Some(v2_co.as_str()) + ); + } + + #[tokio::test] + async fn test_plan_scan_multiple_partition_spec_versions() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let (ns, _spec_v1, _spec_v2, v1_tables, v2_tables) = + setup_multi_version_namespace(temp_path).await; + + let v1_us_2020 = &v1_tables[0].0; + let v2_us_2020_bu1 = &v2_tables[0].0; + let v1_cn_2021 = &v1_tables[1].0; + let v2_fr_2022_bu2 = &v2_tables[1].0; + + // (country = 'US') should match exactly the two US/2020 tables. + let filter = col("country").eq(lit("US")); + let planned = ns.plan_scan(&filter).await.unwrap(); + let got: HashSet = planned.into_iter().map(|(t, _)| t.id.join(".")).collect(); + + let expected: HashSet = [v1_us_2020.id.join("."), v2_us_2020_bu1.id.join(".")] + .into_iter() + .collect(); + assert_eq!(got, expected); + + // business_unit = 1 should include v2 bu=1 table and all v1 tables (NULL => conservative keep), but not v2 bu=2. + let filter = col("business_unit").eq(lit(1i32)); + let planned = ns.plan_scan(&filter).await.unwrap(); + let got: HashSet = planned.into_iter().map(|(t, _)| t.id.join(".")).collect(); + let expected: HashSet = [ + v2_us_2020_bu1.id.join("."), + v1_us_2020.id.join("."), + v1_cn_2021.id.join("."), + ] + .into_iter() + .collect(); + assert_eq!(got, expected); + + // (business_unit = 1) AND (country = 'US') should prune away v1 CN table. 
+ let filter = col("business_unit") + .eq(lit(1i32)) + .and(col("country").eq(lit("US"))); + let planned = ns.plan_scan(&filter).await.unwrap(); + let got: HashSet = planned.into_iter().map(|(t, _)| t.id.join(".")).collect(); + let expected: HashSet = [v1_us_2020.id.join("."), v2_us_2020_bu1.id.join(".")] + .into_iter() + .collect(); + assert_eq!(got, expected); + + // (year(ts) = 2020) should match v1 2020 and all v2 tables. + let arrow_schema = ArrowSchema::from(&ns.schema()); + let filter = parse_filter_expr_from_sql("date_part('year', ts)=2020", &arrow_schema) + .await + .unwrap(); + let planned = ns.plan_scan(&filter).await.unwrap(); + let got: HashSet = planned.into_iter().map(|(t, _)| t.id.join(".")).collect(); + + let expected: HashSet = [ + v1_us_2020.id.join("."), + v2_fr_2022_bu2.id.join("."), + v2_us_2020_bu1.id.join("."), + ] + .into_iter() + .collect(); + assert_eq!(got, expected); + } +} diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index b63331c8a66..a190a1bcf81 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -2872,6 +2872,7 @@ mod tests { mode: Some("create".to_string()), identity: None, context: None, + properties: None, }; let result = namespace.create_table(create_table_req, table_data).await; assert!(result.is_ok(), "Failed to create table: {:?}", result); diff --git a/rust/lance-namespace-impls/src/udf.rs b/rust/lance-namespace-impls/src/udf.rs new file mode 100644 index 00000000000..40bb5193631 --- /dev/null +++ b/rust/lance-namespace-impls/src/udf.rs @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use crate::partition::scalar_to_bytes; +/// DataFusion UDFs used by partitioned namespace implementation. +use arrow::array::{Array, ArrayRef, Int32Builder}; +use arrow_schema::DataType; +use datafusion_functions::utils::make_scalar_function; +use lance::deps::datafusion::error::DataFusionError; +use lance::deps::datafusion::logical_expr::{ScalarUDF, Signature, SimpleScalarUDF, Volatility}; +use std::sync::{Arc, LazyLock}; + +/// A variadic murmur3 UDF. +/// +/// - Accepts any number of arguments (>= 1) +/// - Accepts any argument types (bytes are derived via `scalar_to_bytes`) +/// - Skips NULL arguments; returns NULL if all arguments are NULL for a row +fn murmur3_multi() -> ScalarUDF { + let function = Arc::new(make_scalar_function( + |args: &[ArrayRef]| { + if args.is_empty() { + return Err(DataFusionError::Execution( + "murmur3_multi expects at least 1 argument".to_string(), + )); + } + + let len = args[0].len(); + for a in args.iter().skip(1) { + if a.len() != len { + return Err(DataFusionError::Execution( + "All arguments to murmur3_multi must have the same length".to_string(), + )); + } + } + + let mut builder = Int32Builder::new(); + for row in 0..len { + let mut buf = Vec::new(); + let mut has_value = false; + + for col in args { + let array = col.as_ref(); + if array.is_null(row) { + continue; + } + has_value = true; + let value_bytes = scalar_to_bytes(array, row) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; + buf.extend_from_slice(&value_bytes); + } + + if !has_value { + builder.append_null(); + continue; + } + + let hash = murmur3::murmur3_32(&mut std::io::Cursor::new(&buf), 0)? 
as i32;
+                builder.append_value(hash);
+            }
+
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        },
+        vec![],
+    ));
+
+    ScalarUDF::from(SimpleScalarUDF::new_with_signature(
+        "murmur3_multi",
+        Signature::variadic_any(Volatility::Immutable),
+        DataType::Int32,
+        function,
+    ))
+}
+
+pub static MURMUR3_MULTI_UDF: LazyLock<ScalarUDF> = LazyLock::new(murmur3_multi);
diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs
index 20c5138af7e..2947e9b1a4c 100644
--- a/rust/lance/src/dataset.rs
+++ b/rust/lance/src/dataset.rs
@@ -806,6 +806,7 @@ impl Dataset {
                     transaction_id: fallback_resp.transaction_id,
                     location: fallback_resp.location,
                     storage_options: fallback_resp.storage_options,
+                    properties: fallback_resp.properties,
                 }
             }
             Err(e) => {