Skip to content

Make Disk Cache Configurable by environment variable #196

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 13 additions & 2 deletions chunk_cache/src/disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use cas_types::{ChunkRange, Key};
use error_printer::ErrorPrinter;
use file_utils::SafeFileCreator;
use merklehash::MerkleHash;
use tracing::{debug, warn};
use tracing::{debug, info, warn};
#[cfg(feature = "analysis")]
use utils::output_bytes;

Expand Down Expand Up @@ -147,6 +147,12 @@ impl DiskCache {
let mut num_items = 0;
let max_num_bytes = 2 * capacity;

// short circuit if no capacity to cache.
if capacity == 0 {
info!("cache capacity is 0, not loading any cache state");
return Ok(CacheState::new(state, 0, 0));
}

let Some(cache_root_readdir) = read_dir(cache_root)? else {
return Ok(CacheState::new(state, 0, 0));
};
Expand All @@ -168,7 +174,7 @@ impl DiskCache {
continue;
};

// loop throught key directories inside prefix directory
// loop through key directories inside prefix directory
for key_dir in key_prefix_readdir {
let key_dir = match is_ok_dir(key_dir) {
Ok(Some(dirent)) => dirent,
Expand Down Expand Up @@ -812,6 +818,11 @@ mod tests {

const RANDOM_SEED: u64 = 9089 << 20 | 120043;

/// Pins the default cache capacity so an accidental change is caught.
#[test]
fn test_default_capacity() {
    // 10 GB expressed in bytes, using 1 GB = 1024^3 bytes.
    const TEN_GB_IN_BYTES: u64 = 10 * 1024 * 1024 * 1024;

    assert_eq!(DEFAULT_CAPACITY, TEN_GB_IN_BYTES, "DEFAULT_CAPACITY should be 10GB");
}

#[test]
fn test_get_cache_empty() {
let mut rng = StdRng::seed_from_u64(RANDOM_SEED);
Expand Down
6 changes: 2 additions & 4 deletions chunk_cache/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
mod cache_manager;
mod disk;
pub mod disk;
pub mod error;

use std::path::PathBuf;

pub use cache_manager::get_cache;
use cas_types::{ChunkRange, Key};
pub use disk::test_utils::*;
pub use disk::DiskCache;
pub use disk::{DiskCache, DEFAULT_CAPACITY};
use error::ChunkCacheError;
use mockall::automock;

use crate::disk::DEFAULT_CAPACITY;

/// ChunkCache is a trait for storing and fetching Xorb ranges.
/// implementors are expected to return bytes for a key and a given chunk range
/// (no compression or further deserialization should be required)
Expand Down
1 change: 1 addition & 0 deletions data/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ path = "src/bin/xtool.rs"
cas_client = { path = "../cas_client" }
cas_object = { path = "../cas_object" }
cas_types = { path = "../cas_types" }
chunk_cache = { path = "../chunk_cache" }
merkledb = { path = "../merkledb" }
merklehash = { path = "../merklehash" }
mdb_shard = { path = "../mdb_shard" }
Expand Down
3 changes: 2 additions & 1 deletion data/src/configurations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::str::FromStr;

use cas_client::CacheConfig;
use cas_object::CompressionScheme;
use chunk_cache::disk::DEFAULT_CAPACITY;
use utils::auth::AuthConfig;

use crate::errors::Result;
Expand Down Expand Up @@ -112,7 +113,7 @@ impl TranslatorConfig {
prefix: "default".into(),
cache_config: Some(CacheConfig {
cache_directory: path.join("cache"),
cache_size: 10 * 1024 * 1024 * 1024, // 10 GiB
cache_size: DEFAULT_CAPACITY,
}),
staging_directory: None,
},
Expand Down
52 changes: 51 additions & 1 deletion data/src/data_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::{env, fs};

use cas_client::CacheConfig;
use cas_object::CompressionScheme;
use chunk_cache::disk::DEFAULT_CAPACITY;
use dirs::home_dir;
use lazy_static::lazy_static;
use merkledb::constants::IDEAL_CAS_BLOCK_SIZE;
Expand All @@ -32,6 +33,14 @@ const MAX_CONCURRENT_DOWNLOADS: usize = 8; // Download is not CPU-bound
const DEFAULT_CAS_ENDPOINT: &str = "http://localhost:8080";
const READ_BLOCK_SIZE: usize = 1024 * 1024;

fn get_configured_cache_size() -> u64 {
env::var("HF_XET_CACHE_SIZE_GB")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should use new configurable_constants! macro for this here, if needs to rebase off main

.ok()
.and_then(|s| s.parse::<u64>().ok())
.map(|size| size << 30) // Convert GB to bytes
.unwrap_or(DEFAULT_CAPACITY)
}

pub fn default_config(
endpoint: String,
xorb_compression: Option<CompressionScheme>,
Expand Down Expand Up @@ -84,7 +93,7 @@ pub fn default_config(
prefix: "default".into(),
cache_config: Some(CacheConfig {
cache_directory: cache_path.join("chunk-cache"),
cache_size: 10 * 1024 * 1024 * 1024, // 10 GiB
cache_size: get_configured_cache_size(),
}),
staging_directory: None,
},
Expand Down Expand Up @@ -237,6 +246,47 @@ mod tests {

use super::*;

/// Checks that `HF_XET_CACHE_SIZE_GB` drives both `get_configured_cache_size`
/// and the `cache_size` produced by `default_config`, and that unset or
/// unparsable values fall back to `DEFAULT_CAPACITY`.
#[test]
#[serial(xet_cache_size)]
fn test_cache_size_env_var() {
    const CACHE_SIZE_VAR: &str = "HF_XET_CACHE_SIZE_GB";

    /// Puts the variable back to its pre-test value when dropped, so the
    /// process environment is not left modified even if an assertion panics.
    struct RestoreCacheSizeVar(Option<std::ffi::OsString>);

    impl Drop for RestoreCacheSizeVar {
        fn drop(&mut self) {
            match self.0.take() {
                Some(previous) => env::set_var(CACHE_SIZE_VAR, previous),
                None => env::remove_var(CACHE_SIZE_VAR),
            }
        }
    }

    let _restore = RestoreCacheSizeVar(env::var_os(CACHE_SIZE_VAR));

    // (value of the env var, expected cache size in bytes)
    let test_cases: [(Option<&str>, u64); 6] = [
        (Some("0"), 0),                      // 0GB (disable chunk-cache)
        (Some("5"), 5 << 30),                // 5GB
        (Some("20"), 20 << 30),              // 20GB
        (Some("1"), 1 << 30),                // 1GB
        (None, DEFAULT_CAPACITY),            // Default when not set
        (Some("invalid"), DEFAULT_CAPACITY), // Default when invalid value
    ];

    for (env_value, expected_size) in test_cases {
        // Set or unset the environment variable for this case.
        match env_value {
            Some(value) => env::set_var(CACHE_SIZE_VAR, value),
            None => env::remove_var(CACHE_SIZE_VAR),
        }

        // Read the configured size once and reuse it in the failure message.
        let actual_size = get_configured_cache_size();
        assert_eq!(
            actual_size,
            expected_size,
            "Cache size mismatch for env var value {:?}. Expected {}GB, got {}GB",
            env_value,
            expected_size >> 30,
            actual_size >> 30
        );

        // The same value must flow through into the default configuration.
        let endpoint = "http://localhost:8080".to_string();
        let result = default_config(endpoint, None, None, None);

        assert!(result.is_ok());
        // `_tempdir` is kept bound so it is not dropped before the checks below.
        let (config, _tempdir) = result.unwrap();
        assert!(config.cas_storage_config.cache_config.is_some());
        assert_eq!(config.cas_storage_config.cache_config.unwrap().cache_size, expected_size);
    }
}

#[test]
#[serial(default_config_env)]
fn test_default_config_with_hf_home() {
Expand Down