From 0a9eb2c10a0430c41b16c4cb755811abe9b9d8d5 Mon Sep 17 00:00:00 2001 From: chenzihong-gavin Date: Wed, 3 Dec 2025 15:32:05 +0800 Subject: [PATCH 1/3] refactor: replace sqlite with rocksdb --- graphgen/models/__init__.py | 2 +- graphgen/models/storage/__init__.py | 1 + graphgen/models/storage/rocksdb_cache.py | 32 +++++++++++++++++++ .../operators/read/parallel_file_scanner.py | 5 ++- requirements.txt | 4 ++- 5 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 graphgen/models/storage/rocksdb_cache.py diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 00a5cc56..3ef1ff69 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -33,5 +33,5 @@ from .searcher.web.bing_search import BingSearch from .searcher.web.google_search import GoogleSearch from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter -from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage +from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage, RocksDBCache from .tokenizer import Tokenizer diff --git a/graphgen/models/storage/__init__.py b/graphgen/models/storage/__init__.py index 56338984..1e8f8341 100644 --- a/graphgen/models/storage/__init__.py +++ b/graphgen/models/storage/__init__.py @@ -1,2 +1,3 @@ from .json_storage import JsonKVStorage, JsonListStorage from .networkx_storage import NetworkXStorage +from .rocksdb_cache import RocksDBCache diff --git a/graphgen/models/storage/rocksdb_cache.py b/graphgen/models/storage/rocksdb_cache.py new file mode 100644 index 00000000..556c3c11 --- /dev/null +++ b/graphgen/models/storage/rocksdb_cache.py @@ -0,0 +1,32 @@ +from pathlib import Path +from typing import Any, Iterator, Optional + +# rocksdict is a lightweight C wrapper around RocksDB for Python, pylint may not recognize it +# pylint: disable=no-name-in-module +from rocksdict import Rdict + + +class RocksDBCache: + def __init__(self, cache_dir: str): + self.db_path = Path(cache_dir) + self.db = Rdict(str(self.db_path)) + + def get(self, key: str) -> Optional[Any]: + return self.db.get(key) + + def set(self, key: str, value: Any): + self.db[key] = value + + def delete(self, key: str): + try: + del self.db[key] + except KeyError: + pass + + def close(self): + if hasattr(self, "db") and self.db is not None: + self.db.close() + self.db = None + + def __iter__(self) -> Iterator[str]: + return iter(self.db.keys()) diff --git a/graphgen/operators/read/parallel_file_scanner.py b/graphgen/operators/read/parallel_file_scanner.py index 890a50a9..73b477c3 100644 --- a/graphgen/operators/read/parallel_file_scanner.py +++ b/graphgen/operators/read/parallel_file_scanner.py @@ -4,8 +4,7 @@ from pathlib import Path from typing import Any, Dict, List, Set, Union -from diskcache import Cache - +from graphgen.models import RocksDBCache from graphgen.utils import logger @@ -13,7 +12,7 @@ class ParallelFileScanner: def __init__( self, cache_dir: str, allowed_suffix, rescan: bool = False, max_workers: int = 4 ): - self.cache = Cache(cache_dir) + self.cache = RocksDBCache(os.path.join(cache_dir, "file_paths_cache")) self.allowed_suffix = set(allowed_suffix) if allowed_suffix else None self.rescan = rescan self.max_workers = max_workers diff --git a/requirements.txt b/requirements.txt index fa2b1efc..85fc43e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,13 +20,15 @@ requests fastapi trafilatura aiohttp -diskcache socksio leidenalg igraph python-louvain +# storage +rocksdict + # KG rdflib From 4e9fb76bd996076e613a7a60cedfe541ff5b2856 Mon Sep 17 00:00:00 2001 From: chenzihong <58508660+ChenZiHong-Gavin@users.noreply.github.com> Date: Wed, 3 Dec 2025 15:34:55 +0800 Subject: [PATCH 2/3] Potential fix for pull request finding 'Empty except' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- graphgen/models/storage/rocksdb_cache.py | 1 + 1 file changed, 1 insertion(+) diff --git a/graphgen/models/storage/rocksdb_cache.py b/graphgen/models/storage/rocksdb_cache.py index 556c3c11..258cab35 100644 --- a/graphgen/models/storage/rocksdb_cache.py +++ b/graphgen/models/storage/rocksdb_cache.py @@ -21,6 +21,7 @@ def delete(self, key: str): try: del self.db[key] except KeyError: + # If the key does not exist, do nothing (deletion is idempotent for caches) pass def close(self): From 43c801f905cc11594a2bce9de33a2f658c6af7e2 Mon Sep 17 00:00:00 2001 From: chenzihong <58508660+ChenZiHong-Gavin@users.noreply.github.com> Date: Wed, 3 Dec 2025 15:39:50 +0800 Subject: [PATCH 3/3] Update graphgen/models/storage/rocksdb_cache.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- graphgen/models/storage/rocksdb_cache.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/graphgen/models/storage/rocksdb_cache.py b/graphgen/models/storage/rocksdb_cache.py index 258cab35..2345b5b5 100644 --- a/graphgen/models/storage/rocksdb_cache.py +++ b/graphgen/models/storage/rocksdb_cache.py @@ -29,5 +29,15 @@ def close(self): self.db.close() self.db = None + def __del__(self): + # Ensure the database is closed when the object is destroyed + self.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + def __iter__(self) -> Iterator[str]: return iter(self.db.keys())