diff --git a/Cargo.lock b/Cargo.lock index ae57f51..b576242 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2366,6 +2366,7 @@ dependencies = [ "rand 0.8.5", "reqwest", "ring", + "rustls-pemfile", "serde", "serde_json", "snafu", diff --git a/Cargo.toml b/Cargo.toml index e60eb94..9c819d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ tokio = { version="1.44.2", features = ["rt-multi-thread"]} predicates = "3.1.3" reqwest = "0.12.15" tempfile = "3" -object_store = { version="0.11.2", features = ["aws"] } +object_store = { version="0.11.2", features = ["aws", "gcp"] } url = "2.5.4" [dev-dependencies] diff --git a/README.md b/README.md index 9ce232b..5fc3b36 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,8 @@ dfkit is an extensive suite of command-line functions to easily view, query, and ## Highlights Here's a high level overview of some of the features in dfkit: -- Supports viewing and manipulating files stored locally, from URLs, and S3 storage. +- Supports viewing, querying, and manipulating files stored locally, on the web, +or in cloud storage services such as Amazon S3 and Google Cloud Storage.
- Works with CSV, JSON, Parquet, and Avro files - Ultra-fast performance powered by Apache Arrow and DataFusion - Transform data with SQL or with several other built-in functions diff --git a/src/utils.rs b/src/utils.rs index cfca775..b496e29 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use tempfile::NamedTempFile; use thiserror::Error; use object_store::aws::AmazonS3Builder; +use object_store::gcp::GoogleCloudStorageBuilder; use url::Url; #[derive(Debug, PartialEq, Eq)] @@ -23,6 +24,7 @@ pub enum StorageType { Local, Url, S3, + GCS } #[derive(Error, Debug)] @@ -95,6 +97,8 @@ pub fn storage_type(file_path: &Path) -> Result { Ok(StorageType::Url) } else if path_str.starts_with("s3://") { Ok(StorageType::S3) + } else if path_str.starts_with("gs://") { + Ok(StorageType::GCS) } else if file_path.is_absolute() { Ok(StorageType::Local) } else { @@ -140,6 +144,27 @@ pub async fn register_table( let store= Arc::from(AmazonS3Builder::from_env() .with_bucket_name(bucket).build()?); + ctx.runtime_env() + .register_object_store(&url, store); + + let file_format = file_type(&file_path.to_path_buf())?; + (file_format, path_str.to_string()) + } + StorageType::GCS => { + let path_str = file_path + .to_str() + .ok_or(DfKitError::FileParse(FileParseError::InvalidExtension))?; + + let url = Url::parse(path_str)?; + let bucket = url.host_str() + .ok_or_else(|| DfKitError::CustomError("Missing bucket in GCS URL".into()))?; + + let store = Arc::from( + GoogleCloudStorageBuilder::from_env() + .with_bucket_name(bucket) + .build()? + ); + ctx.runtime_env() .register_object_store(&url, store);