Skip to content

Commit f411547

Browse files
committed
Remove cache manager, only have one config for cache size
1 parent ee95c46 commit f411547

File tree

3 files changed

+45
-104
lines changed

3 files changed

+45
-104
lines changed

pyiceberg/manifest.py

Lines changed: 23 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050
PrimitiveType,
5151
StringType,
5252
StructType,
53-
strtobool,
5453
)
5554
from pyiceberg.utils.config import Config
5655

@@ -893,79 +892,30 @@ def __hash__(self) -> int:
893892
return hash(self.manifest_path)
894893

895894

896-
class _ManifestCacheManager:
897-
"""Manages the manifest cache with lazy initialization from config."""
898-
899-
_DEFAULT_SIZE = 128
900-
901-
def __init__(self) -> None:
902-
self._cache: LRUCache[str, ManifestFile] | None = None
903-
self._initialized = False
904-
self._lock = threading.RLock()
905-
906-
def get_cache(self) -> LRUCache[str, ManifestFile] | None:
907-
"""Return the cache if enabled, else None. Initializes from config on first call."""
908-
with self._lock:
909-
if self._initialized:
910-
return self._cache
911-
912-
config = Config().config
913-
914-
# Extract nested config
915-
manifest_val = config.get("manifest")
916-
manifest_config: dict[str, Any] = manifest_val if isinstance(manifest_val, dict) else {}
917-
cache_val = manifest_config.get("cache")
918-
cache_config: dict[str, Any] = cache_val if isinstance(cache_val, dict) else {}
919-
920-
# Parse and validate enabled flag
921-
enabled_raw = cache_config.get("enabled")
922-
enabled = True
923-
if enabled_raw is not None:
924-
try:
925-
enabled = bool(strtobool(str(enabled_raw)))
926-
except (ValueError, AttributeError) as err:
927-
raise ValueError(
928-
f"manifest.cache.enabled should be a boolean or left unset. Current value: {enabled_raw}"
929-
) from err
930-
931-
# Parse and validate cache size
932-
size_raw = cache_config.get("size")
933-
size = self._DEFAULT_SIZE
934-
if size_raw is not None:
935-
try:
936-
size = int(str(size_raw))
937-
except (ValueError, TypeError) as err:
938-
raise ValueError(
939-
f"manifest.cache.size should be a positive integer or left unset. Current value: {size_raw}"
940-
) from err
941-
if size < 1:
942-
raise ValueError(f"manifest.cache.size must be >= 1. Current value: {size}")
943-
944-
if enabled:
945-
self._cache = LRUCache(maxsize=size)
946-
self._initialized = True
947-
return self._cache
948-
949-
def clear(self) -> None:
950-
"""Clear the cache contents. No-op if cache is disabled."""
951-
cache = self.get_cache()
952-
if cache is not None:
953-
with self._lock:
954-
cache.clear()
955-
956-
957-
# Module-level cache manager instance
958-
_manifest_cache_manager = _ManifestCacheManager()
959-
960-
961-
def _get_manifest_cache() -> LRUCache[str, ManifestFile] | None:
962-
"""Return the manifest cache if enabled, else None. Initializes from config on first call."""
963-
return _manifest_cache_manager.get_cache()
895+
_DEFAULT_MANIFEST_CACHE_SIZE = 128
896+
_manifest_cache_lock = threading.RLock()
897+
898+
899+
def _init_manifest_cache() -> LRUCache[str, ManifestFile] | None:
900+
"""Initialize the manifest cache from config."""
901+
manifest_cache_size = Config().get_int("manifest-cache-size")
902+
if manifest_cache_size is None:
903+
manifest_cache_size = _DEFAULT_MANIFEST_CACHE_SIZE
904+
if manifest_cache_size < 0:
905+
raise ValueError(f"manifest-cache-size must be >= 0. Current value: {manifest_cache_size}")
906+
if manifest_cache_size == 0:
907+
return None
908+
return LRUCache(maxsize=manifest_cache_size)
909+
910+
911+
_manifest_cache = _init_manifest_cache()
964912

965913

966914
def clear_manifest_cache() -> None:
967915
"""Clear the manifest cache. No-op if cache is disabled."""
968-
_manifest_cache_manager.clear()
916+
if _manifest_cache is not None:
917+
with _manifest_cache_lock:
918+
_manifest_cache.clear()
969919

970920

971921
def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]:
@@ -995,12 +945,12 @@ def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]:
995945
file = io.new_input(manifest_list)
996946
manifest_files = list(read_manifest_list(file))
997947

998-
cache = _get_manifest_cache()
999-
if cache is None:
948+
if _manifest_cache is None:
1000949
return tuple(manifest_files)
1001950

1002951
result = []
1003-
with _manifest_cache_manager._lock:
952+
with _manifest_cache_lock:
953+
cache = _manifest_cache
1004954
for manifest_file in manifest_files:
1005955
manifest_path = manifest_file.manifest_path
1006956
if manifest_path in cache:

tests/benchmark/test_memory_benchmark.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@
3232
import pyarrow as pa
3333
import pytest
3434

35+
from pyiceberg import manifest as manifest_module
3536
from pyiceberg.catalog.memory import InMemoryCatalog
36-
from pyiceberg.manifest import _get_manifest_cache, clear_manifest_cache
37+
from pyiceberg.manifest import clear_manifest_cache
3738

3839

3940
def generate_test_dataframe() -> pa.Table:
@@ -95,7 +96,7 @@ def test_manifest_cache_memory_growth(memory_catalog: InMemoryCatalog) -> None:
9596
# Sample memory at intervals
9697
if (i + 1) % 10 == 0:
9798
current, _ = tracemalloc.get_traced_memory()
98-
cache = _get_manifest_cache()
99+
cache = manifest_module._manifest_cache
99100
cache_size = len(cache) if cache is not None else 0
100101

101102
memory_samples.append((i + 1, current, cache_size))
@@ -151,7 +152,7 @@ def test_memory_after_gc_with_cache_cleared(memory_catalog: InMemoryCatalog) ->
151152

152153
gc.collect()
153154
before_clear_memory, _ = tracemalloc.get_traced_memory()
154-
cache = _get_manifest_cache()
155+
cache = manifest_module._manifest_cache
155156
cache_size_before = len(cache) if cache is not None else 0
156157
print(f" Memory before clear: {before_clear_memory / 1024:.1f} KB")
157158
print(f" Cache size: {cache_size_before}")
@@ -193,7 +194,6 @@ def test_manifest_cache_deduplication_efficiency() -> None:
193194
FileFormat,
194195
ManifestEntry,
195196
ManifestEntryStatus,
196-
_get_manifest_cache,
197197
_manifests,
198198
clear_manifest_cache,
199199
write_manifest,
@@ -269,7 +269,7 @@ def test_manifest_cache_deduplication_efficiency() -> None:
269269
_manifests(io, list_path)
270270

271271
# Analyze cache efficiency
272-
cache = _get_manifest_cache()
272+
cache = manifest_module._manifest_cache
273273
cache_entries = len(cache) if cache is not None else 0
274274
# List i contains manifests 0..i, so only the first num_lists manifests are actually used
275275
manifests_actually_used = num_lists

tests/utils/test_manifest.py

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
ManifestEntryStatus,
3535
ManifestFile,
3636
PartitionFieldSummary,
37-
_get_manifest_cache,
3837
_manifests,
3938
clear_manifest_cache,
4039
read_manifest_list,
@@ -50,9 +49,9 @@
5049

5150
@pytest.fixture(autouse=True)
5251
def reset_global_manifests_cache() -> None:
53-
# Reset cache state before each test so config is re-read
54-
manifest_module._manifest_cache_manager._cache = None
55-
manifest_module._manifest_cache_manager._initialized = False
52+
with manifest_module._manifest_cache_lock:
53+
manifest_module._manifest_cache = manifest_module._init_manifest_cache()
54+
clear_manifest_cache()
5655

5756

5857
def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: dict[str, str]) -> None:
@@ -808,7 +807,7 @@ def test_manifest_cache_deduplicates_manifest_files() -> None:
808807

809808
# Verify cache size - should only have 3 unique ManifestFile objects
810809
# instead of 1 + 2 + 3 = 6 objects as with the old approach
811-
cache = _get_manifest_cache()
810+
cache = manifest_module._manifest_cache
812811
assert cache is not None, "Manifest cache should be enabled for this test"
813812
assert len(cache) == 3, f"Cache should contain exactly 3 unique ManifestFile objects, but has {len(cache)}"
814813

@@ -883,7 +882,7 @@ def test_manifest_cache_efficiency_with_many_overlapping_lists() -> None:
883882
# With the new approach, we should have exactly N objects
884883

885884
# Verify cache has exactly N unique entries
886-
cache = _get_manifest_cache()
885+
cache = manifest_module._manifest_cache
887886
assert cache is not None, "Manifest cache should be enabled for this test"
888887
assert len(cache) == num_manifests, (
889888
f"Cache should contain exactly {num_manifests} ManifestFile objects, "
@@ -991,15 +990,15 @@ def test_clear_manifest_cache() -> None:
991990
_manifests(io, list_path)
992991

993992
# Verify cache has entries
994-
cache = _get_manifest_cache()
993+
cache = manifest_module._manifest_cache
995994
assert cache is not None, "Cache should be enabled"
996995
assert len(cache) > 0, "Cache should have entries after reading manifests"
997996

998997
# Clear the cache
999998
clear_manifest_cache()
1000999

10011000
# Verify cache is empty but still enabled
1002-
cache_after = _get_manifest_cache()
1001+
cache_after = manifest_module._manifest_cache
10031002
assert cache_after is not None, "Cache should still be enabled after clear"
10041003
assert len(cache_after) == 0, "Cache should be empty after clear"
10051004

@@ -1008,22 +1007,19 @@ def test_clear_manifest_cache() -> None:
10081007
"env_vars,expected_enabled,expected_size",
10091008
[
10101009
({}, True, 128), # defaults
1011-
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "true"}, True, 128),
1012-
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "false"}, False, 128),
1013-
({"PYICEBERG_MANIFEST__CACHE__SIZE": "64"}, True, 64),
1014-
({"PYICEBERG_MANIFEST__CACHE__SIZE": "256"}, True, 256),
1015-
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "false", "PYICEBERG_MANIFEST__CACHE__SIZE": "64"}, False, 64),
1010+
({"PYICEBERG_MANIFEST_CACHE_SIZE": "64"}, True, 64),
1011+
({"PYICEBERG_MANIFEST_CACHE_SIZE": "256"}, True, 256),
1012+
({"PYICEBERG_MANIFEST_CACHE_SIZE": "0"}, False, 0), # size=0 disables cache
10161013
],
10171014
)
10181015
def test_manifest_cache_config_valid_values(env_vars: dict[str, str], expected_enabled: bool, expected_size: int) -> None:
10191016
"""Test that valid config values are applied correctly."""
10201017
import os
10211018

10221019
with mock.patch.dict(os.environ, env_vars, clear=False):
1023-
# Reset cache state so config is re-read
1024-
manifest_module._manifest_cache_manager._cache = None
1025-
manifest_module._manifest_cache_manager._initialized = False
1026-
cache = _get_manifest_cache()
1020+
with manifest_module._manifest_cache_lock:
1021+
manifest_module._manifest_cache = manifest_module._init_manifest_cache()
1022+
cache = manifest_module._manifest_cache
10271023

10281024
if expected_enabled:
10291025
assert cache is not None, "Cache should be enabled"
@@ -1035,20 +1031,15 @@ def test_manifest_cache_config_valid_values(env_vars: dict[str, str], expected_e
10351031
@pytest.mark.parametrize(
10361032
"env_vars,expected_error_substring",
10371033
[
1038-
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "maybe"}, "manifest.cache.enabled should be a boolean"),
1039-
({"PYICEBERG_MANIFEST__CACHE__ENABLED": "invalid"}, "manifest.cache.enabled should be a boolean"),
1040-
({"PYICEBERG_MANIFEST__CACHE__SIZE": "abc"}, "manifest.cache.size should be a positive integer"),
1041-
({"PYICEBERG_MANIFEST__CACHE__SIZE": "0"}, "manifest.cache.size must be >= 1"),
1042-
({"PYICEBERG_MANIFEST__CACHE__SIZE": "-5"}, "manifest.cache.size must be >= 1"),
1034+
({"PYICEBERG_MANIFEST_CACHE_SIZE": "abc"}, "manifest-cache-size should be an integer"),
1035+
({"PYICEBERG_MANIFEST_CACHE_SIZE": "-5"}, "manifest-cache-size must be >= 0"),
10431036
],
10441037
)
10451038
def test_manifest_cache_config_invalid_values(env_vars: dict[str, str], expected_error_substring: str) -> None:
10461039
"""Test that invalid config values raise ValueError with appropriate message."""
10471040
import os
10481041

10491042
with mock.patch.dict(os.environ, env_vars, clear=False):
1050-
# Reset cache state so config is re-read
1051-
manifest_module._manifest_cache_manager._cache = None
1052-
manifest_module._manifest_cache_manager._initialized = False
10531043
with pytest.raises(ValueError, match=expected_error_substring):
1054-
_get_manifest_cache()
1044+
with manifest_module._manifest_cache_lock:
1045+
manifest_module._manifest_cache = manifest_module._init_manifest_cache()

0 commit comments

Comments
 (0)