InternScience
diff --git a/‎graphgen/bases/base_storage.py‎
Lines changed: 0 additions & 17 deletions b/‎graphgen/bases/base_storage.py‎
Lines changed: 0 additions & 17 deletions
diff --git a/‎graphgen/bases/datatypes.py‎
Lines changed: 7 additions & 0 deletions b/‎graphgen/bases/datatypes.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎graphgen/common/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎graphgen/common/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎graphgen/common/init_llm.py‎
Lines changed: 2 additions & 0 deletions b/‎graphgen/common/init_llm.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎graphgen/common/init_storage.py‎
Lines changed: 28 additions & 0 deletions b/‎graphgen/common/init_storage.py‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎graphgen/engine.py‎
Lines changed: 41 additions & 9 deletions b/‎graphgen/engine.py‎
Lines changed: 41 additions & 9 deletions
diff --git a/‎graphgen/operators/build_kg/build_kg_service.py‎
Lines changed: 11 additions & 7 deletions b/‎graphgen/operators/build_kg/build_kg_service.py‎
Lines changed: 11 additions & 7 deletions
diff --git a/graphgen/operators/evaluate.py b/graphgen/operators/evaluate/evaluate.py
rename from graphgen/operators/evaluate.py
rename to graphgen/operators/evaluate/evaluate.py
diff --git a/graphgen/operators/quiz_and_judge/judge.py b/graphgen/operators/judge/judge.py
rename from graphgen/operators/quiz_and_judge/judge.py
rename to graphgen/operators/judge/judge.py
diff --git a/graphgen/operators/quiz_and_judge/quiz.py b/graphgen/operators/quiz/quiz.py
rename from graphgen/operators/quiz_and_judge/quiz.py
rename to graphgen/operators/quiz/quiz.py
@@ -16,23 +16,6 @@ def query_done_callback(self):
         """commit the storage operations after querying"""
 
 
-class BaseListStorage(Generic[T], StorageNameSpace):
-    def all_items(self) -> list[T]:
-        raise NotImplementedError
-
-    def get_by_index(self, index: int) -> Union[T, None]:
-        raise NotImplementedError
-
-    def append(self, data: T):
-        raise NotImplementedError
-
-    def upsert(self, data: list[T]):
-        raise NotImplementedError
-
-    def drop(self):
-        raise NotImplementedError
-
-
 class BaseKVStorage(Generic[T], StorageNameSpace):
     def all_keys(self) -> list[str]:
         raise NotImplementedError
 
@@ -62,6 +62,9 @@ class Node(BaseModel):
     dependencies: List[str] = Field(
         default_factory=list, description="list of dependent node ids"
     )
+    execution_params: dict = Field(
+        default_factory=dict, description="execution parameters like replicas, batch_size"
+    )
 
     @classmethod
     @field_validator("type")
@@ -73,6 +76,10 @@ def validate_type(cls, v: str) -> str:
 
 
 class Config(BaseModel):
+    global_params: dict = Field(
+        default_factory=dict, description="global context for the computation graph"
+    )
+
     nodes: List[Node] = Field(
         ..., min_length=1, description="list of nodes in the computation graph"
     )
 
@@ -0,0 +1,2 @@
+from .init_llm import init_llm
+from .init_storage import init_storage
@@ -29,6 +29,7 @@ def create_llm_wrapper(backend: str, config: Dict[str, Any]) -> BaseLLMWrapper:
             return HTTPClient(**config)
         if backend in ("openai_api", "azure_openai_api"):
             from graphgen.models.llm.api.openai_client import OpenAIClient
+
             # pass in concrete backend to the OpenAIClient so that internally we can distinguish
             # between OpenAI and Azure OpenAI
             return OpenAIClient(**config, backend=backend)
@@ -80,4 +81,5 @@ def init_llm(model_type: str) -> Optional[BaseLLMWrapper]:
     llm_wrapper = LLMFactory.create_llm_wrapper(backend, config)
     return llm_wrapper
 
+
 # TODO: use ray serve when loading large models to avoid re-loading in each actor
@@ -0,0 +1,28 @@
+from graphgen.models import JsonKVStorage, NetworkXStorage
+
+
+class StorageFactory:
+    """
+    Build storage objects from a backend identifier.
+
+    Recognised backends:
+        key-value storage:
+            - json_kv: JsonKVStorage
+        graph storage:
+            - networkx: NetworkXStorage
+    """
+
+    @staticmethod
+    def create_storage(backend: str, working_dir: str, namespace: str):
+        """
+        Return a storage instance for ``backend``.
+
+        ``working_dir`` is passed positionally and ``namespace`` by keyword
+        to the selected storage class.
+
+        Raises:
+            NotImplementedError: if ``backend`` is not a recognised identifier.
+        """
+        # Reject unknown identifiers up front so the construction path is flat.
+        if backend not in ("json_kv", "networkx"):
+            raise NotImplementedError(
+                f"Storage backend '{backend}' is not implemented yet."
+            )
+
+        storage_cls = JsonKVStorage if backend == "json_kv" else NetworkXStorage
+        return storage_cls(working_dir, namespace=namespace)
+
+
+def init_storage(backend: str, working_dir: str, namespace: str):
+    """Module-level shortcut that delegates to ``StorageFactory.create_storage``."""
+    return StorageFactory.create_storage(
+        backend=backend, working_dir=working_dir, namespace=namespace
+    )
@@ -15,6 +15,7 @@ def __init__(
         self, config: Dict[str, Any], functions: Dict[str, Callable], **ray_init_kwargs
     ):
         self.config = Config(**config)
+        self.global_params = self.config.global_params
         self.functions = functions
         self.datasets: Dict[str, ray.data.Dataset] = {}
 
@@ -90,28 +91,59 @@ def _get_input_dataset(
         return main_ds.union(*other_dss)
 
     def _execute_node(self, node: Node, initial_ds: ray.data.Dataset):
+        def _filter_kwargs(
+            func_or_class: Callable,
+            global_params: Dict[str, Any],
+            func_params: Dict[str, Any],
+        ) -> Dict[str, Any]:
+            """
+            Select the keyword arguments that ``func_or_class`` can accept.
+
+            1. global_params: forwarded only when the key names a parameter
+               of the signature that can be bound by keyword.
+            2. func_params: forwarded when the key names such a parameter,
+               or unconditionally when the signature declares ``**kwargs``;
+               they override global_params on key collisions.
+
+            Returns an empty dict when ``inspect.signature`` raises
+            ``ValueError`` (no signature available for the callable).
+            """
+            try:
+                sig = inspect.signature(func_or_class)
+            except ValueError:
+                return {}
+
+            params = sig.parameters.values()
+            has_var_keywords = any(
+                p.kind == inspect.Parameter.VAR_KEYWORD for p in params
+            )
+            # Only these kinds can be bound by keyword. The names of
+            # ``*args`` / ``**kwargs`` themselves and positional-only
+            # parameters must not count as valid keys, otherwise a key such
+            # as "kwargs" would be forwarded as a literal keyword argument.
+            keyword_kinds = (
+                inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                inspect.Parameter.KEYWORD_ONLY,
+            )
+            valid_keys = {p.name for p in params if p.kind in keyword_kinds}
+
+            final_kwargs = {
+                k: v for k, v in global_params.items() if k in valid_keys
+            }
+            # Node-level params are applied last so they win over globals.
+            final_kwargs.update(
+                (k, v)
+                for k, v in func_params.items()
+                if has_var_keywords or k in valid_keys
+            )
+            return final_kwargs
+
         if node.op_name not in self.functions:
             raise ValueError(f"Operator {node.op_name} not found for node {node.id}")
 
+        op_handler = self.functions[node.op_name]
+        node_params = _filter_kwargs(op_handler, self.global_params, node.params or {})
+
         if node.type == "source":
-            op_handler = self.functions[node.op_name]
-            node_params = node.params
             self.datasets[node.id] = op_handler(**node_params)
             return
 
         input_ds = self._get_input_dataset(node, initial_ds)
 
-        op_handler = self.functions[node.op_name]
-        node_params = node.params
-
         if inspect.isclass(op_handler):
-            replicas = node_params.pop("replicas", 1)
+            execution_params = node.execution_params or {}
+            replicas = execution_params.get("replicas", 1)
             batch_size = (
-                int(node_params.pop("batch_size"))
-                if "batch_size" in node_params
+                int(execution_params.get("batch_size"))
+                if "batch_size" in execution_params
                 else "default"
             )
-            compute_resources = node_params.pop("compute_resources", {})
+            compute_resources = execution_params.get("compute_resources", {})
 
             if node.type == "aggregate":
                 self.datasets[node.id] = input_ds.repartition(1).map_batches(
 
@@ -1,27 +1,32 @@
 from typing import List
+
 import pandas as pd
 
-from graphgen.bases import BaseLLMWrapper, BaseGraphStorage
+from graphgen.bases import BaseGraphStorage, BaseLLMWrapper
 from graphgen.bases.datatypes import Chunk
 from graphgen.common import init_llm, init_storage
 from graphgen.utils import logger
-from .build_text_kg import build_text_kg
+
 from .build_mm_kg import build_mm_kg
+from .build_text_kg import build_text_kg
 
 
 class BuildKGService:
-    def __init__(self):
+    def __init__(self, working_dir: str = "cache"):
         self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
         self.graph_storage: BaseGraphStorage = init_storage(
-            backend="networkx", working_dir="cache",namespace="graph")
+            backend="networkx", working_dir=working_dir, namespace="graph"
+        )
 
     def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
         docs = batch.to_dict(orient="records")
         docs = [Chunk.from_dict(doc["_chunk_id"], doc) for doc in docs]
-        return pd.DataFrame(self.build_kg(docs))
 
+        # consume the chunks and build kg
+        self.build_kg(docs)
+        return pd.DataFrame()
 
-    def build_kg(self, chunks: List[Chunk]) -> List:
+    def build_kg(self, chunks: List[Chunk]) -> None:
         """
         Build knowledge graph (KG) and merge into kg_instance
         """
@@ -52,4 +57,3 @@ def build_kg(self, chunks: List[Chunk]) -> List:
             )
 
         self.graph_storage.index_done_callback()
-        return [{"_chunk_id": chunk.id} for chunk in chunks]
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from .init_llm import init_llm`
	`2`	`+from .init_storage import init_storage`