
Commit ac99aa8

refactor: refactor pickle_reader using ray data
1 parent ba865c8 commit ac99aa8

File tree

2 files changed (+68, -35 lines)


graphgen/models/reader/pdf_reader.py

Lines changed: 0 additions & 19 deletions
@@ -247,22 +247,3 @@ def _check_bin() -> None:
             "MinerU is not installed or not found in PATH. Please install it from pip: \n"
             "pip install -U 'mineru[core]'"
         ) from exc
-
-
-if __name__ == "__main__":
-    reader = PDFReader(
-        output_dir="./output",
-        method="auto",
-        backend="pipeline",
-        device="cpu",
-        lang="en",
-        formula=True,
-        table=True,
-    )
-    dataset = reader.read(
-        "/home/PJLAB/chenzihong/Project/graphgen/resources/input_examples/pdf_demo.pdf",
-        parallelism=2,
-    )
-
-    for item in dataset.take_all():
-        print(item)
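
With the inline smoke test removed from pdf_reader.py, the same check can be kept as a standalone script. Below is a minimal sketch assembled from the deleted lines; the script name, import path, and input path are assumptions, not part of this commit:

# run_pdf_reader_demo.py - hypothetical standalone replacement for the
# removed __main__ block; the arguments mirror the deleted demo code.
from graphgen.models.reader.pdf_reader import PDFReader

reader = PDFReader(
    output_dir="./output",
    method="auto",
    backend="pipeline",
    device="cpu",
    lang="en",
    formula=True,
    table=True,
)

# read() returns a Ray Dataset; parallelism controls the number of read tasks
dataset = reader.read("resources/input_examples/pdf_demo.pdf", parallelism=2)

for item in dataset.take_all():
    print(item)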
graphgen/models/reader/pickle_reader.py

Lines changed: 68 additions & 16 deletions
@@ -1,30 +1,82 @@
 import pickle
-from typing import Any, Dict, List
+from typing import List, Optional, Union
+
+import pandas as pd
+import ray
+from ray.data import Dataset
 
 from graphgen.bases.base_reader import BaseReader
+from graphgen.utils import logger
 
 
 class PickleReader(BaseReader):
     """
-    Read pickle files, requiring the top-level object to be List[Dict[str, Any]].
-
-    Columns:
+    Read pickle files whose deserialized top-level object is List[Dict[str, Any]].
+    Each pickle file should contain a list of dictionaries with at least:
     - type: The type of the document (e.g., "text", "image", etc.)
     - if type is "text", "content" column must be present.
+
+    Note: uses ray.data.read_binary_files, since ray.data.read_pickle is not available.
+    For Ray >= 2.5, consider using read_pickle if it is available in your version.
     """
 
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        with open(file_path, "rb") as f:
-            data = pickle.load(f)
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+        override_num_blocks: Optional[int] = None,
+    ) -> Dataset:
+        """
+        Read pickle files using Ray Data.
+
+        :param input_path: Path to a pickle file or a list of pickle files.
+        :param override_num_blocks: Number of blocks for Ray Dataset reading.
+        :return: Ray Dataset containing the validated documents.
+        """
+        if not ray.is_initialized():
+            ray.init()
+
+        # Use read_binary_files as a reliable alternative to read_pickle
+        ds = ray.data.read_binary_files(
+            input_path, override_num_blocks=override_num_blocks, include_paths=True
+        )
+
+        # Deserialize pickle files and flatten into individual records
+        def deserialize_batch(batch: pd.DataFrame) -> pd.DataFrame:
+            all_records = []
+            for _, row in batch.iterrows():
+                try:
+                    # Load pickle data from bytes
+                    data = pickle.loads(row["bytes"])
+
+                    # Validate structure
+                    if not isinstance(data, list):
+                        logger.error(
+                            "Pickle file %s must contain a list, got %s", row["path"], type(data)
+                        )
+                        continue
+
+                    if not all(isinstance(item, dict) for item in data):
+                        logger.error(
+                            "Pickle file %s must contain a list of dictionaries", row["path"]
+                        )
+                        continue
+
+                    # Flatten: each dict in the list becomes a separate row
+                    all_records.extend(data)
+                except Exception as e:
+                    logger.error(
+                        "Failed to deserialize pickle file %s: %s", row["path"], str(e)
+                    )
+                    continue
+
+            return pd.DataFrame(all_records)
 
-        if not isinstance(data, list):
-            raise ValueError("Pickle file must contain a list of documents.")
+        # Apply deserialization and flattening
+        ds = ds.map_batches(deserialize_batch, batch_format="pandas")
 
-        for doc in data:
-            if not isinstance(doc, dict):
-                raise ValueError("Every item in the list must be a dict.")
-            assert "type" in doc, f"Missing 'type' in document: {doc}"
-            if doc.get("type") == "text" and self.text_column not in doc:
-                raise ValueError(f"Missing '{self.text_column}' in document: {doc}")
+        # Validate the schema
+        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
 
-        return self.filter(data)
+        # Filter valid items
+        ds = ds.filter(self._should_keep_item)
+        return ds
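
To exercise the new reader end to end, a small script can write a schema-compliant pickle file and read it back through Ray Data. A minimal sketch, assuming PickleReader can be constructed with no arguments (BaseReader's constructor is not shown in this diff) and that the module lives at graphgen/models/reader/pickle_reader.py; file names are illustrative:

# demo_pickle_reader.py - hypothetical usage sketch
import pickle

from graphgen.models.reader.pickle_reader import PickleReader

# A list of dicts matching the documented schema: each record carries a
# "type", and "content" is required when type is "text".
docs = [
    {"type": "text", "content": "hello world"},
    {"type": "text", "content": "another document"},
]
with open("demo_docs.pkl", "wb") as f:
    pickle.dump(docs, f)

reader = PickleReader()  # assumption: no required constructor arguments
ds = reader.read("demo_docs.pkl")  # returns a ray.data.Dataset

# After the flattening map_batches step, each dict is one dataset row
for row in ds.take_all():
    print(row)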
