Skip to content

Commit 0a9eb2c

Browse files
refactor: replace sqlite with rocksdb
1 parent 837a1ee commit 0a9eb2c

File tree

5 files changed

+39
-5
lines changed

5 files changed

+39
-5
lines changed

graphgen/models/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,5 @@
3333
from .searcher.web.bing_search import BingSearch
3434
from .searcher.web.google_search import GoogleSearch
3535
from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
36-
from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
36+
from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage, RocksDBCache
3737
from .tokenizer import Tokenizer
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from .json_storage import JsonKVStorage, JsonListStorage
22
from .networkx_storage import NetworkXStorage
3+
from .rocksdb_cache import RocksDBCache
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from pathlib import Path
2+
from typing import Any, Iterator, Optional
3+
4+
# rocksdict is a lightweight Python binding for RocksDB shipped as a compiled extension; pylint may not recognize its names
5+
# pylint: disable=no-name-in-module
6+
from rocksdict import Rdict
7+
8+
9+
class RocksDBCache:
    """Small key-value cache backed by a RocksDB database (via ``rocksdict``).

    The cache exposes a minimal ``get`` / ``set`` / ``delete`` API, can be
    iterated to obtain its keys, and can be used as a context manager so the
    underlying database handle is always released::

        with RocksDBCache("/tmp/cache/db") as cache:
            cache.set("k", "v")
    """

    def __init__(self, cache_dir: str):
        """Open (or create) the database stored at ``cache_dir``.

        Args:
            cache_dir: Directory that holds the database files.
        """
        self.db_path = Path(cache_dir)
        # The database directory itself is left to the backend, but missing
        # intermediate directories are created up front so that opening a
        # nested path (e.g. ``<cache_dir>/file_paths_cache``) does not fail
        # on a fresh checkout.
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.db = Rdict(str(self.db_path))

    def __enter__(self) -> "RocksDBCache":
        """Return the cache itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the database when leaving the ``with`` block."""
        self.close()

    def get(self, key: str) -> Optional[Any]:
        """Return the value stored under ``key``, or ``None`` if absent."""
        return self.db.get(key)

    def set(self, key: str, value: Any):
        """Store ``value`` under ``key``, replacing any previous value."""
        self.db[key] = value

    def delete(self, key: str):
        """Remove ``key`` from the cache; a missing key is silently ignored."""
        try:
            del self.db[key]
        except KeyError:
            # Deleting an absent key is deliberately a no-op.
            pass

    def close(self):
        """Release the database handle. Safe to call more than once."""
        # ``getattr`` also covers an instance whose ``__init__`` failed
        # before ``self.db`` was assigned.
        db = getattr(self, "db", None)
        if db is not None:
            db.close()
            self.db = None

    def __iter__(self) -> Iterator[str]:
        """Iterate over the keys currently stored in the cache."""
        return iter(self.db.keys())

graphgen/operators/read/parallel_file_scanner.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,15 @@
44
from pathlib import Path
55
from typing import Any, Dict, List, Set, Union
66

7-
from diskcache import Cache
8-
7+
from graphgen.models import RocksDBCache
98
from graphgen.utils import logger
109

1110

1211
class ParallelFileScanner:
1312
def __init__(
1413
self, cache_dir: str, allowed_suffix, rescan: bool = False, max_workers: int = 4
1514
):
16-
self.cache = Cache(cache_dir)
15+
self.cache = RocksDBCache(os.path.join(cache_dir, "file_paths_cache"))
1716
self.allowed_suffix = set(allowed_suffix) if allowed_suffix else None
1817
self.rescan = rescan
1918
self.max_workers = max_workers

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ requests
2020
fastapi
2121
trafilatura
2222
aiohttp
23-
diskcache
2423
socksio
2524

2625
leidenalg
2726
igraph
2827
python-louvain
2928

29+
# storage
30+
rocksdict
31+
3032
# KG
3133
rdflib
3234

0 commit comments

Comments
 (0)