import os
from typing import Any, Dict, Optional

+import ray
+
from graphgen.bases import BaseLLMWrapper
+from graphgen.common.init_storage import get_actor_handle
from graphgen.models import Tokenizer


-class LLMFactory:
+class LLMServiceActor:
    """
-    A factory class to create LLM wrapper instances based on the specified backend.
-    Supported backends include:
-    - http_api: HTTPClient
-    - openai_api: OpenAIClient
-    - ollama_api: OllamaClient
-    - huggingface: HuggingFaceWrapper
-    - sglang: SGLangWrapper
+    A Ray actor class that wraps an LLM wrapper instance for distributed usage.
    """

-    @staticmethod
-    def create_llm_wrapper(backend: str, config: Dict[str, Any]) -> BaseLLMWrapper:
-        # add tokenizer
-        tokenizer: Tokenizer = Tokenizer(
-            os.environ.get("TOKENIZER_MODEL", "cl100k_base"),
-        )
+    def __init__(self, backend: str, config: Dict[str, Any]):
+        self.backend = backend
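+        # The tokenizer model name defaults to "cl100k_base" unless the
+        # TOKENIZER_MODEL environment variable overrides it.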
+        tokenizer_model = os.environ.get("TOKENIZER_MODEL", "cl100k_base")
+        tokenizer = Tokenizer(model_name=tokenizer_model)
        config["tokenizer"] = tokenizer
+
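+        # Backend-specific clients are imported lazily inside each branch, so the
+        # dependencies of backends that are not selected are never imported.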
        if backend == "http_api":
            from graphgen.models.llm.api.http_client import HTTPClient

-            return HTTPClient(**config)
-        if backend in ("openai_api", "azure_openai_api"):
+            self.llm_instance = HTTPClient(**config)
+        elif backend in ("openai_api", "azure_openai_api"):
            from graphgen.models.llm.api.openai_client import OpenAIClient

            # pass in concrete backend to the OpenAIClient so that internally we can distinguish
            # between OpenAI and Azure OpenAI
-            return OpenAIClient(**config, backend=backend)
-        if backend == "ollama_api":
+            self.llm_instance = OpenAIClient(**config, backend=backend)
+        elif backend == "ollama_api":
            from graphgen.models.llm.api.ollama_client import OllamaClient

-            return OllamaClient(**config)
-        if backend == "huggingface":
+            self.llm_instance = OllamaClient(**config)
+        elif backend == "huggingface":
            from graphgen.models.llm.local.hf_wrapper import HuggingFaceWrapper

-            return HuggingFaceWrapper(**config)
-        if backend == "sglang":
+            self.llm_instance = HuggingFaceWrapper(**config)
+        elif backend == "sglang":
            from graphgen.models.llm.local.sglang_wrapper import SGLangWrapper

-            return SGLangWrapper(**config)
+            self.llm_instance = SGLangWrapper(**config)
+
+        elif backend == "vllm":
+            from graphgen.models.llm.local.vllm_wrapper import VLLMWrapper
+
+            self.llm_instance = VLLMWrapper(**config)
+        else:
+            raise NotImplementedError(f"Backend {backend} is not implemented yet.")
+
+    async def generate_answer(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> str:
+        return await self.llm_instance.generate_answer(text, history, **extra)
+
+    async def generate_topk_per_token(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> list:
+        return await self.llm_instance.generate_topk_per_token(text, history, **extra)

-        # if backend == "vllm":
-        #     from graphgen.models.llm.local.vllm_wrapper import VLLMWrapper
-        #
-        #     return VLLMWrapper(**config)
+    async def generate_inputs_prob(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> list:
+        return await self.llm_instance.generate_inputs_prob(text, history, **extra)

-        raise NotImplementedError(f"Backend {backend} is not implemented yet.")
+    def ready(self) -> bool:
+        """A simple method to check if the actor is ready."""
+        return True
+
+
+class LLMServiceProxy(BaseLLMWrapper):
+    """
+    A proxy class that forwards LLM calls to the LLMServiceActor for distributed LLM operations.
+    """
+
+    def __init__(self, actor_name: str):
+        super().__init__()
+        self.actor_handle = get_actor_handle(actor_name)
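+        # Keep a tokenizer on the proxy itself so that token counting can happen
+        # in-process instead of requiring a round trip to the actor.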
+        self._create_local_tokenizer()
+
+    async def generate_answer(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> str:
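+        # .remote() returns a Ray ObjectRef, which can be awaited directly in async code.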
+        object_ref = self.actor_handle.generate_answer.remote(text, history, **extra)
+        return await object_ref
+
+    async def generate_topk_per_token(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> list:
+        object_ref = self.actor_handle.generate_topk_per_token.remote(
+            text, history, **extra
+        )
+        return await object_ref
+
+    async def generate_inputs_prob(
+        self, text: str, history: Optional[list[str]] = None, **extra: Any
+    ) -> list:
+        object_ref = self.actor_handle.generate_inputs_prob.remote(
+            text, history, **extra
+        )
+        return await object_ref
+
+    def _create_local_tokenizer(self):
+        tokenizer_model = os.environ.get("TOKENIZER_MODEL", "cl100k_base")
+        self.tokenizer = Tokenizer(model_name=tokenizer_model)
+
+
+class LLMFactory:
+    """
+    A factory class that creates an LLM wrapper for the specified backend, hosts it in a
+    named Ray actor, and returns an LLMServiceProxy for accessing it.
+    Supported backends include:
+    - http_api: HTTPClient
+    - openai_api / azure_openai_api: OpenAIClient
+    - ollama_api: OllamaClient
+    - huggingface: HuggingFaceWrapper
+    - sglang: SGLangWrapper
+    - vllm: VLLMWrapper
+    """
+
+    @staticmethod
+    def create_llm(
+        model_type: str, backend: str, config: Dict[str, Any]
+    ) -> BaseLLMWrapper:
+        if not config:
+            raise ValueError(
+                f"No configuration provided for LLM {model_type} with backend {backend}."
+            )
+
+        actor_name = f"Actor_LLM_{model_type}"
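+        # ray.get_actor raises ValueError when no actor with this name exists yet,
+        # in which case a new detached actor is created below.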
+        try:
+            ray.get_actor(actor_name)
+        except ValueError:
+            print(f"Creating Ray actor for LLM {model_type} with backend {backend}.")
+            num_gpus = config.pop("num_gpus", 0)
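+            # lifetime="detached" keeps the actor alive independently of this driver,
+            # and get_if_exists=True returns the existing actor if another process
+            # created one with the same name in the meantime.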
+            actor = (
+                ray.remote(LLMServiceActor)
+                .options(
+                    name=actor_name,
+                    num_gpus=num_gpus,
+                    lifetime="detached",
+                    get_if_exists=True,
+                )
+                .remote(backend, config)
+            )
+
+            # wait for actor to be ready
+            ray.get(actor.ready.remote())
+
+        return LLMServiceProxy(actor_name)


def _load_env_group(prefix: str) -> Dict[str, Any]:
@@ -78,8 +173,5 @@ def init_llm(model_type: str) -> Optional[BaseLLMWrapper]:
    if not config:
        return None
    backend = config.pop("backend")
-    llm_wrapper = LLMFactory.create_llm_wrapper(backend, config)
+    llm_wrapper = LLMFactory.create_llm(model_type, backend, config)
    return llm_wrapper
-
-
-# TODO: use ray serve when loading large models to avoid re-loading in each actor