refactor: reimplement json_reader on top of Ray Data

ChenZiHong-Gavin · ChenZiHong-Gavin · commit 36e80ef8b5c9 · 2025-11-21T21:01:11.000+08:00
diff --git a/graphgen/models/reader/csv_reader.py b/graphgen/models/reader/csv_reader.py
@@ -28,9 +28,6 @@ def read(
         """
 
         ds = ray.data.read_csv(input_path, override_num_blocks=override_num_blocks)
-
         ds = ds.map_batches(self._validate_batch, batch_format="pandas")
-
         ds = ds.filter(self._should_keep_item)
-
         return ds
diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py
@@ -1,5 +1,7 @@
-import json
-from typing import Any, Dict, List
+from typing import List, Union
+
+import ray
+from ray.data import Dataset
 
 from graphgen.bases.base_reader import BaseReader
 
@@ -12,15 +14,19 @@ class JSONReader(BaseReader):
         - if type is "text", "content" column must be present.
     """
 
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        with open(file_path, "r", encoding="utf-8") as f:
-            data = json.load(f)
-            if isinstance(data, list):
-                for doc in data:
-                    assert "type" in doc, f"Missing 'type' in document: {doc}"
-                    if doc.get("type") == "text" and self.text_column not in doc:
-                        raise ValueError(
-                            f"Missing '{self.text_column}' in document: {doc}"
-                        )
-                return self.filter(data)
-            raise ValueError("JSON file must contain a list of documents.")
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+        parallelism: int = 4,
+    ) -> Dataset:
+        """
+        Read JSON file(s) into a Ray Dataset, then validate and filter the rows.
+        :param input_path: Path to JSON file or list of JSON files.
+        :param parallelism: Number of output blocks requested from the read.
+        :return: Ray Dataset after `_validate_batch` and `_should_keep_item`.
+        """
+        # Ray deprecated `parallelism`; forward it as `override_num_blocks`.
+        ds = ray.data.read_json(input_path, override_num_blocks=parallelism)
+        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
+        ds = ds.filter(self._should_keep_item)
+        return ds