diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index 8a6ad448d895d..f5f5d249319e2 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -703,10 +703,13 @@ impl TableFunctionImpl for StatisticsCacheFunc { } } -// Implementation of the `list_files_cache` table function in datafusion-cli. +/// Implementation of the `list_files_cache` table function in datafusion-cli. +/// +/// This function returns the cached results of running a LIST command on a +/// particular object store path for a table. The object metadata is returned as +/// a List of Structs, with one Struct for each object. DataFusion uses these +/// cached results to plan queries against external tables. /// -/// This function returns the cached results of running a LIST command on a particular object store path for a table. The object metadata is returned as a List of Structs, with one Struct for each object. -/// DataFusion uses these cached results to plan queries against external tables. /// # Schema /// ```sql /// > describe select * from list_files_cache(); @@ -788,7 +791,7 @@ impl TableFunctionImpl for ListFilesCacheFunc { Field::new("metadata", DataType::Struct(nested_fields.clone()), true); let schema = Arc::new(Schema::new(vec![ - Field::new("table", DataType::Utf8, false), + Field::new("table", DataType::Utf8, true), Field::new("path", DataType::Utf8, false), Field::new("metadata_size_bytes", DataType::UInt64, false), // expires field in ListFilesEntry has type Instant when set, from which we cannot get "the number of seconds", hence using Duration instead of Timestamp as data type. @@ -821,7 +824,7 @@ impl TableFunctionImpl for ListFilesCacheFunc { let mut current_offset: i32 = 0; for (path, entry) in list_files_cache.list_entries() { - table_arr.push(path.table.map_or("NULL".to_string(), |t| t.to_string())); + table_arr.push(path.table.map(|t| t.to_string())); path_arr.push(path.path.to_string()); metadata_size_bytes_arr.push(entry.size_bytes as u64); // calculates time left before entry expires diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index 86af691fd7248..98f61a8528aa0 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -161,9 +161,7 @@ impl TableProviderFactory for ListingTableFactory { } None => format!("*.{}", cmd.file_type.to_lowercase()), }; - table_path = table_path - .with_glob(glob.as_ref())? - .with_table_ref(cmd.name.clone()); + table_path = table_path.with_glob(glob.as_ref())?; } let schema = options.infer_schema(session_state, &table_path).await?; let df_schema = Arc::clone(&schema).to_dfschema()?; diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs index 2428275ac3c36..f2149727b1a59 100644 --- a/datafusion/datasource/src/url.rs +++ b/datafusion/datasource/src/url.rs @@ -42,7 +42,7 @@ pub struct ListingTableUrl { prefix: Path, /// An optional glob expression used to filter files glob: Option, - + /// Optional table reference for the table this url belongs to table_ref: Option, } @@ -340,17 +340,19 @@ impl ListingTableUrl { } /// Returns a copy of current [`ListingTableUrl`] with a specified `glob` - pub fn with_glob(self, glob: &str) -> Result { - let glob = - Pattern::new(glob).map_err(|e| DataFusionError::External(Box::new(e)))?; - Self::try_new(self.url, Some(glob)) + pub fn with_glob(mut self, glob: &str) -> Result { + self.glob = + Some(Pattern::new(glob).map_err(|e| DataFusionError::External(Box::new(e)))?); + Ok(self) } + /// Set the table reference for this [`ListingTableUrl`] pub fn with_table_ref(mut self, table_ref: TableReference) -> Self { self.table_ref = Some(table_ref); self } + /// Return the table reference for this [`ListingTableUrl`] pub fn get_table_ref(&self) -> &Option { &self.table_ref } diff --git a/datafusion/execution/src/cache/cache_manager.rs b/datafusion/execution/src/cache/cache_manager.rs index 162074d909ead..7c1fec9c05054 100644 --- a/datafusion/execution/src/cache/cache_manager.rs +++ b/datafusion/execution/src/cache/cache_manager.rs @@ -100,6 +100,7 @@ pub trait ListFilesCache: /// Retrieves the information about the entries currently cached. fn list_entries(&self) -> HashMap; + /// Drop all entries for the given table reference. fn drop_table_entries(&self, table_ref: &Option) -> Result<()>; } diff --git a/datafusion/execution/src/cache/list_files_cache.rs b/datafusion/execution/src/cache/list_files_cache.rs index 858219e5b883f..c9e833ed49c65 100644 --- a/datafusion/execution/src/cache/list_files_cache.rs +++ b/datafusion/execution/src/cache/list_files_cache.rs @@ -149,6 +149,11 @@ pub const DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT: usize = 1024 * 1024; // 1MiB /// The default cache TTL for the [`DefaultListFilesCache`] pub const DEFAULT_LIST_FILES_CACHE_TTL: Option = None; // Infinite +/// Key for [`DefaultListFilesCache`] +/// +/// Each entry is scoped to its use within a specific table so that the cache +/// can differentiate between identical paths in different tables, and +/// table-level cache invalidation. #[derive(PartialEq, Eq, Hash, Clone, Debug)] pub struct TableScopedPath { pub table: Option,