1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -21,7 +21,7 @@ tokio = { version="1.44.2", features = ["rt-multi-thread"]}
predicates = "3.1.3"
reqwest = "0.12.15"
tempfile = "3"
-object_store = { version="0.11.2", features = ["aws"] }
+object_store = { version="0.11.2", features = ["aws", "gcp"] }
url = "2.5.4"

[dev-dependencies]
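For context, the new `gcp` feature is what exposes `object_store`'s GCS client. A minimal sketch (not part of this PR; the bucket name is a placeholder) of what the feature unlocks:

```rust
// Compiles only when object_store is built with the "gcp" feature.
use object_store::gcp::GoogleCloudStorageBuilder;

fn main() -> Result<(), object_store::Error> {
    // from_env() pre-populates the builder from environment variables
    // (e.g. a service-account key); "example-bucket" is a placeholder.
    let _gcs = GoogleCloudStorageBuilder::from_env()
        .with_bucket_name("example-bucket")
        .build()?;
    Ok(())
}
```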
3 changes: 2 additions & 1 deletion README.md
@@ -8,7 +8,8 @@ dfkit is an extensive suite of command-line functions to easily view, query, and
## Highlights
Here's a high-level overview of some of the features in dfkit:

-- Supports viewing and manipulating files stored locally, from URLs, and S3 storage.
+- Supports viewing, querying, and manipulating files stored locally, on the web,
+  or in cloud storage services such as Amazon S3 and Google Cloud Storage.
- Works with CSV, JSON, Parquet, and Avro files
- Ultra-fast performance powered by Apache Arrow and DataFusion
- Transform data with SQL or with several other built-in functions
25 changes: 25 additions & 0 deletions src/utils.rs
@@ -8,6 +8,7 @@ use std::sync::Arc;
use tempfile::NamedTempFile;
use thiserror::Error;
use object_store::aws::AmazonS3Builder;
+use object_store::gcp::GoogleCloudStorageBuilder;
use url::Url;

#[derive(Debug, PartialEq, Eq)]
@@ -23,6 +24,7 @@ pub enum StorageType {
Local,
Url,
S3,
+GCS,
}

#[derive(Error, Debug)]
@@ -95,6 +97,8 @@ pub fn storage_type(file_path: &Path) -> Result<StorageType, DfKitError> {
Ok(StorageType::Url)
} else if path_str.starts_with("s3://") {
Ok(StorageType::S3)
} else if path_str.starts_with("gs://") {
Ok(StorageType::GCS)
} else if file_path.is_absolute() {
Ok(StorageType::Local)
} else {
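The new `gs://` branch mirrors the existing `s3://` check. A minimal sketch of the expected dispatch, written as a test and assuming `storage_type`, `StorageType`, and `DfKitError` from `src/utils.rs` are in scope:

```rust
use std::path::Path;

// Sketch only: storage_type(), StorageType, and DfKitError come from
// dfkit's src/utils.rs; this test is illustrative, not part of the PR.
#[test]
fn detects_storage_schemes() -> Result<(), DfKitError> {
    assert_eq!(storage_type(Path::new("gs://bucket/data.parquet"))?, StorageType::GCS);
    assert_eq!(storage_type(Path::new("s3://bucket/data.parquet"))?, StorageType::S3);
    Ok(())
}
```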
@@ -140,6 +144,27 @@ pub async fn register_table(
let store = Arc::from(AmazonS3Builder::from_env()
.with_bucket_name(bucket).build()?);

ctx.runtime_env()
.register_object_store(&url, store);

let file_format = file_type(&file_path.to_path_buf())?;
(file_format, path_str.to_string())
}
+StorageType::GCS => {
+    let path_str = file_path
+        .to_str()
+        .ok_or(DfKitError::FileParse(FileParseError::InvalidExtension))?;
+
+    let url = Url::parse(path_str)?;
+    let bucket = url.host_str()
+        .ok_or_else(|| DfKitError::CustomError("Missing bucket in GCS URL".into()))?;
+
+    let store = Arc::from(
+        GoogleCloudStorageBuilder::from_env()
+            .with_bucket_name(bucket)
+            .build()?
+    );
+
+    ctx.runtime_env()
+        .register_object_store(&url, store);

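Putting the new arm together: a standalone sketch of registering a GCS store with DataFusion the way `register_table` now does. Names here are illustrative (`register_gcs` is a hypothetical helper, not PR code), and credentials are assumed to come from the environment, such as a `GOOGLE_SERVICE_ACCOUNT` variable read by `from_env()`:

```rust
use std::sync::Arc;
use datafusion::prelude::SessionContext;
use object_store::gcp::GoogleCloudStorageBuilder;
use url::Url;

// Sketch of the GCS registration flow added in this PR, outside dfkit's
// own error types.
async fn register_gcs(ctx: &SessionContext, path: &str) -> Result<(), Box<dyn std::error::Error>> {
    let url = Url::parse(path)?; // e.g. "gs://bucket/table.parquet"
    let bucket = url
        .host_str()
        .ok_or("missing bucket in GCS URL")?;

    let store = Arc::new(
        GoogleCloudStorageBuilder::from_env()
            .with_bucket_name(bucket)
            .build()?,
    );

    // After this call, DataFusion resolves any gs://<bucket>/... path
    // through the registered store.
    ctx.runtime_env().register_object_store(&url, store);
    Ok(())
}
```

With the store registered, the rest of `register_table` can detect the file format and register the table against the `gs://` path exactly as the S3 branch does.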