Commit 8bcbe51

refactor: refactor readers with ray data
1 parent 31c5a64 commit 8bcbe51

File tree

11 files changed: +429 -223 lines changed

graphgen/bases/base_reader.py

Lines changed: 55 additions & 40 deletions
```diff
@@ -1,8 +1,10 @@
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 
+import pandas as pd
 import requests
+from ray.data import Dataset
 
 
 class BaseReader(ABC):
@@ -14,52 +16,65 @@ def __init__(self, text_column: str = "content"):
         self.text_column = text_column
 
     @abstractmethod
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
+    def read(self, input_path: Union[str, List[str]]) -> Dataset:
         """
         Read data from the specified file path.
 
-        :param file_path: Path to the input file.
-        :return: List of dictionaries containing the data.
+        :param input_path: Path to the input file or list of file paths.
+        :return: Ray Dataset containing the read data.
         """
 
-    @staticmethod
-    def filter(data: List[dict]) -> List[dict]:
+    def _should_keep_item(self, item: Dict[str, Any]) -> bool:
+        """
+        Determine whether to keep the given item based on the text column.
+
+        :param item: Dictionary representing a data entry.
+        :return: True if the item should be kept, False otherwise.
         """
-        Filter out entries with empty or missing text in the specified column.
+        item_type = item.get("type")
+        assert item_type in [
+            "text",
+            "image",
+            "table",
+            "equation",
+            "protein",
+        ], f"Unsupported item type: {item_type}"
+        if item_type == "text":
+            content = item.get(self.text_column, "").strip()
+            return bool(content)
+        return True
 
-        :param data: List of dictionaries containing the data.
-        :return: Filtered list of dictionaries.
+    def _validate_batch(self, batch: pd.DataFrame) -> pd.DataFrame:
+        """
+        Validate data format.
         """
+        if "type" not in batch.columns:
+            raise ValueError(f"Missing 'type' column. Found: {list(batch.columns)}")
 
-        def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
-            """
-            Check if an image exists at the given local path or URL.
-            :param path_or_url: Local file path or remote URL of the image.
-            :param timeout: Timeout for remote URL requests in seconds.
-            :return: True if the image exists, False otherwise.
-            """
-            if not path_or_url:
-                return False
-            if not path_or_url.startswith(("http://", "https://", "ftp://")):
-                path = path_or_url.replace("file://", "", 1)
-                path = os.path.abspath(path)
-                return os.path.isfile(path)
-            try:
-                resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
-                return resp.status_code == 200
-            except requests.RequestException:
-                return False
+        if "text" in batch["type"].values:
+            if self.text_column not in batch.columns:
+                raise ValueError(
+                    f"Missing '{self.text_column}' column for text documents"
+                )
 
-        filtered_data = []
-        for item in data:
-            if item.get("type") == "text":
-                content = item.get("content", "").strip()
-                if content:
-                    filtered_data.append(item)
-            elif item.get("type") in ("image", "table", "equation"):
-                img_path = item.get("img_path")
-                if _image_exists(img_path):
-                    filtered_data.append(item)
-            else:
-                filtered_data.append(item)
-        return filtered_data
+        return batch
+
+    @staticmethod
+    def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
+        """
+        Check if an image exists at the given local path or URL.
+        :param path_or_url: Local file path or remote URL of the image.
+        :param timeout: Timeout for remote URL requests in seconds.
+        :return: True if the image exists, False otherwise.
+        """
+        if not path_or_url:
+            return False
+        if not path_or_url.startswith(("http://", "https://", "ftp://")):
+            path = path_or_url.replace("file://", "", 1)
+            path = os.path.abspath(path)
+            return os.path.isfile(path)
+        try:
+            resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
+            return resp.status_code == 200
+        except requests.RequestException:
+            return False
```
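
Taken together, every concrete reader now shares the same three-stage Ray Data pipeline: read the source into a lazy Dataset, validate each pandas batch with `_validate_batch`, and drop unusable rows with `_should_keep_item`. A minimal sketch of how a new reader would plug into these inherited hooks (the `TSVReader` class is hypothetical, not part of this commit):

```python
import ray
from pyarrow import csv as pa_csv
from ray.data import Dataset

from graphgen.bases.base_reader import BaseReader


class TSVReader(BaseReader):
    """Hypothetical subclass, shown only to illustrate the shared hooks."""

    def read(self, input_path, parallelism=None) -> Dataset:
        # Stage 1: read files into a lazy Ray Dataset.
        ds = ray.data.read_csv(
            input_path,
            parse_options=pa_csv.ParseOptions(delimiter="\t"),
            override_num_blocks=parallelism,
        )
        # Stage 2: schema checks run once per pandas batch.
        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
        # Stage 3: row-level filtering (empty text rows are dropped).
        ds = ds.filter(self._should_keep_item)
        return ds
```

Note that `_image_exists` survives as a static helper but is no longer called on the filtering path: `_should_keep_item` only checks the text column, so image/table/equation rows now pass through without an existence check.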

graphgen/models/reader/__init__.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,6 +1,5 @@
 from .csv_reader import CSVReader
 from .json_reader import JSONReader
-from .jsonl_reader import JSONLReader
 from .parquet_reader import ParquetReader
 from .pdf_reader import PDFReader
 from .pickle_reader import PickleReader
```
graphgen/models/reader/csv_reader.py

Lines changed: 19 additions & 11 deletions
```diff
@@ -1,6 +1,7 @@
-from typing import Any, Dict, List
+from typing import List, Union
 
-import pandas as pd
+import ray
+from ray.data import Dataset
 
 from graphgen.bases.base_reader import BaseReader
 
@@ -13,13 +14,20 @@ class CSVReader(BaseReader):
     - if type is "text", "content" column must be present.
     """
 
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+        parallelism: int = None,
+    ) -> Dataset:
+        """
+        Read CSV files and return Ray Dataset.
 
-        df = pd.read_csv(file_path)
-        for _, row in df.iterrows():
-            assert "type" in row, f"Missing 'type' column in document: {row.to_dict()}"
-            if row["type"] == "text" and self.text_column not in row:
-                raise ValueError(
-                    f"Missing '{self.text_column}' in document: {row.to_dict()}"
-                )
-        return self.filter(df.to_dict(orient="records"))
+        :param input_path: Path to CSV file or list of CSV files.
+        :param parallelism: Number of blocks for Ray Dataset reading.
+        :return: Ray Dataset containing validated and filtered data.
+        """
+
+        ds = ray.data.read_csv(input_path, override_num_blocks=parallelism)
+        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
+        ds = ds.filter(self._should_keep_item)
+        return ds
```
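
A usage sketch for the new CSV path, assuming a small local `sample.csv` (file name and contents are illustrative):

```python
import pandas as pd

from graphgen.models.reader import CSVReader

# Illustrative input: one valid text row, one whitespace-only row.
pd.DataFrame(
    [
        {"type": "text", "content": "hello world"},
        {"type": "text", "content": "   "},
    ]
).to_csv("sample.csv", index=False)

reader = CSVReader()
ds = reader.read("sample.csv")  # lazy Ray Dataset
print(ds.count())      # 1 -- the whitespace-only row is filtered out
print(ds.take_all())   # [{'type': 'text', 'content': 'hello world'}]
```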
graphgen/models/reader/json_reader.py

Lines changed: 21 additions & 15 deletions
```diff
@@ -1,26 +1,32 @@
-import json
-from typing import Any, Dict, List
+from typing import List, Union
+
+import ray
+from ray.data import Dataset
 
 from graphgen.bases.base_reader import BaseReader
 
 
 class JSONReader(BaseReader):
     """
-    Reader for JSON files.
+    Reader for JSON and JSONL files.
     Columns:
     - type: The type of the document (e.g., "text", "image", etc.)
     - if type is "text", "content" column must be present.
     """
 
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        with open(file_path, "r", encoding="utf-8") as f:
-            data = json.load(f)
-        if isinstance(data, list):
-            for doc in data:
-                assert "type" in doc, f"Missing 'type' in document: {doc}"
-                if doc.get("type") == "text" and self.text_column not in doc:
-                    raise ValueError(
-                        f"Missing '{self.text_column}' in document: {doc}"
-                    )
-            return self.filter(data)
-        raise ValueError("JSON file must contain a list of documents.")
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+        parallelism: int = 4,
+    ) -> Dataset:
+        """
+        Read JSON file and return Ray Dataset.
+        :param input_path: Path to JSON/JSONL file or list of JSON/JSONL files.
+        :param parallelism: Number of parallel workers for reading files.
+        :return: Ray Dataset containing validated and filtered data.
+        """
+
+        ds = ray.data.read_json(input_path, override_num_blocks=parallelism)
+        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
+        ds = ds.filter(self._should_keep_item)
+        return ds
```
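
`ray.data.read_json` handles both regular JSON and line-delimited JSONL input, which is why the docstring now says "JSON and JSONL files" and the standalone `JSONLReader` below is deleted. A usage sketch with illustrative paths:

```python
from graphgen.models.reader import JSONReader

reader = JSONReader()

# Both array-style .json and line-delimited .jsonl files work here.
ds = reader.read(["docs.json", "docs.jsonl"], parallelism=4)
for row in ds.take(3):
    print(row["type"], row.get("content", ""))
```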

graphgen/models/reader/jsonl_reader.py

Lines changed: 0 additions & 30 deletions
This file was deleted; JSONL reading is now handled by JSONReader (see its updated docstring above).
graphgen/models/reader/parquet_reader.py

Lines changed: 21 additions & 10 deletions
```diff
@@ -1,6 +1,7 @@
-from typing import Any, Dict, List
+from typing import List, Union
 
-import pandas as pd
+import ray
+from ray.data import Dataset
 
 from graphgen.bases.base_reader import BaseReader
 
@@ -13,12 +14,22 @@ class ParquetReader(BaseReader):
     - if type is "text", "content" column must be present.
     """
 
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        df = pd.read_parquet(file_path)
-        data: List[Dict[str, Any]] = df.to_dict(orient="records")
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+        parallelism: int = None,
+    ) -> Dataset:
+        """
+        Read Parquet files using Ray Data.
 
-        for doc in data:
-            assert "type" in doc, f"Missing 'type' in document: {doc}"
-            if doc.get("type") == "text" and self.text_column not in doc:
-                raise ValueError(f"Missing '{self.text_column}' in document: {doc}")
-        return self.filter(data)
+        :param input_path: Path to Parquet file or list of Parquet files.
+        :param parallelism: Number of blocks for Ray Dataset reading.
+        :return: Ray Dataset containing validated documents.
+        """
+        if not ray.is_initialized():
+            ray.init()
+
+        ds = ray.data.read_parquet(input_path, override_num_blocks=parallelism)
+        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
+        ds = ds.filter(self._should_keep_item)
+        return ds
```
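
`ParquetReader` is the only reader in this commit that guards `ray.init()` itself; the others presumably rely on Ray auto-initializing on first use. A usage sketch with an illustrative file:

```python
import pandas as pd

from graphgen.models.reader import ParquetReader

# Illustrative input with a mixed schema.
pd.DataFrame(
    [
        {"type": "text", "content": "alpha"},
        {"type": "image", "img_path": "figure.png"},
    ]
).to_parquet("sample.parquet")

reader = ParquetReader()
ds = reader.read("sample.parquet", parallelism=2)
print(ds.to_pandas())  # both rows survive: image rows skip the text check
```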

graphgen/models/reader/pdf_reader.py

Lines changed: 36 additions & 20 deletions
```diff
@@ -5,6 +5,9 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+import ray
+from ray.data import Dataset
+
 from graphgen.bases.base_reader import BaseReader
 from graphgen.models.reader.txt_reader import TXTReader
 from graphgen.utils import logger, pick_device
@@ -62,19 +65,32 @@ def __init__(
         self.parser = MinerUParser()
         self.txt_reader = TXTReader()
 
-    def read(self, file_path: str, **override) -> List[Dict[str, Any]]:
-        """
-        file_path
-        **override: override MinerU parameters
-        """
-        pdf_path = Path(file_path).expanduser().resolve()
-        if not pdf_path.is_file():
-            raise FileNotFoundError(pdf_path)
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+        parallelism: int = 4,
+        **override,
+    ) -> Dataset:
+
+        # Ensure input_path is a list
+        if isinstance(input_path, str):
+            input_path = [input_path]
+
+        paths_ds = ray.data.from_items(input_path)
+
+        def process_pdf(row: Dict[str, Any]) -> List[Dict[str, Any]]:
+            try:
+                pdf_path = row["item"]
+                kwargs = {**self._default_kwargs, **override}
+                return self._call_mineru(Path(pdf_path), kwargs)
+            except Exception as e:
+                logger.error("Failed to process %s: %s", row, e)
+                return []
 
-        kwargs = {**self._default_kwargs, **override}
+        docs_ds = paths_ds.flat_map(process_pdf)
+        docs_ds = docs_ds.filter(self._should_keep_item)
 
-        mineru_result = self._call_mineru(pdf_path, kwargs)
-        return self.filter(mineru_result)
+        return docs_ds
 
     def _call_mineru(
         self, pdf_path: Path, kwargs: Dict[str, Any]
@@ -161,18 +177,18 @@ def _try_load_cached_result(
 
         base = os.path.dirname(json_file)
         results = []
-        for item in data:
+        for it in data:
             for key in ("img_path", "table_img_path", "equation_img_path"):
-                rel_path = item.get(key)
+                rel_path = it.get(key)
                 if rel_path:
-                    item[key] = str(Path(base).joinpath(rel_path).resolve())
-                if item["type"] == "text":
-                    item["content"] = item["text"]
-                    del item["text"]
+                    it[key] = str(Path(base).joinpath(rel_path).resolve())
+                if it["type"] == "text":
+                    it["content"] = it["text"]
+                    del it["text"]
             for key in ("page_idx", "bbox", "text_level"):
-                if item.get(key) is not None:
-                    del item[key]
-            results.append(item)
+                if it.get(key) is not None:
+                    del it[key]
+            results.append(it)
         return results
 
     @staticmethod
```
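
A usage sketch for the new PDF path; it assumes MinerU and its model weights are available, and the constructor defaults and file names are illustrative:

```python
from graphgen.models.reader import PDFReader

reader = PDFReader()
docs = reader.read(["paper1.pdf", "paper2.pdf"])

# Each PDF is parsed in its own flat_map task; a failure in one file
# is logged and contributes zero rows instead of aborting the run.
print(docs.count())
print(docs.take(2))
```

As the hunk above shows, `parallelism` is accepted but not forwarded to `ray.data.from_items` in this diff; fan-out comes from Ray scheduling `flat_map` tasks over the input paths.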
