Changes from all commits (30 commits)
899cf01
feat: add bucket for map and all-reduce
ChenZiHong-Gavin Nov 20, 2025
8d7f2c5
feat: stream reading files
ChenZiHong-Gavin Nov 20, 2025
3927239
fix: fix params in collect_ops
ChenZiHong-Gavin Nov 20, 2025
166ecaf
refactor: refactor engine to dataflow orchestration
ChenZiHong-Gavin Nov 20, 2025
fa23d32
fix: use default if input_stream is not given
ChenZiHong-Gavin Nov 20, 2025
543c5d9
refactor: rename chunk_document.py
ChenZiHong-Gavin Nov 20, 2025
e18fe7a
feat: handle async generator
ChenZiHong-Gavin Nov 20, 2025
9cc1479
wip
ChenZiHong-Gavin Nov 20, 2025
c26bd9d
Merge branch 'main' of https://github.com/open-sciencelab/GraphGen in…
ChenZiHong-Gavin Nov 20, 2025
4b3d9d9
feat: adapt read, chunk, build_kg operators to new optypes
ChenZiHong-Gavin Nov 20, 2025
8eebad1
fix: async_lock when using blast search
ChenZiHong-Gavin Nov 20, 2025
1e69b0e
feat: add search config
ChenZiHong-Gavin Nov 20, 2025
6112d83
feat: add search scripts
ChenZiHong-Gavin Nov 21, 2025
54181d5
Merge branch 'main' of https://github.com/open-sciencelab/GraphGen in…
ChenZiHong-Gavin Nov 21, 2025
a7a0155
style: disable pylint error for catching too general exceptions
ChenZiHong-Gavin Nov 21, 2025
bc487fb
feat: add parallel scan_files
ChenZiHong-Gavin Nov 21, 2025
bd2f7c4
refactor: refactor txt_reader using ray data
ChenZiHong-Gavin Nov 21, 2025
0422bd0
refactor: refactor csv_reader using ray data
ChenZiHong-Gavin Nov 21, 2025
36e80ef
refactor: refactor json_reader using ray data
ChenZiHong-Gavin Nov 21, 2025
db8252c
refactor: refactor parquet_reader using ray data
ChenZiHong-Gavin Nov 21, 2025
97f7e75
refactor: refactor pdf_reader using ray data
ChenZiHong-Gavin Nov 21, 2025
ba865c8
Merge branch 'main' of https://github.com/open-sciencelab/GraphGen in…
ChenZiHong-Gavin Nov 21, 2025
ac99aa8
refactor: refactor pickle_reader using ray data
ChenZiHong-Gavin Nov 22, 2025
3d9185a
refactor: refactor rdf_reader using ray data
ChenZiHong-Gavin Nov 22, 2025
d5924f0
fix: fix scanning file path
ChenZiHong-Gavin Nov 24, 2025
1e71080
fix: fix read_files
ChenZiHong-Gavin Nov 24, 2025
00551e3
fix: fix pylint problems
ChenZiHong-Gavin Nov 24, 2025
cb2833c
fix: fix for pull request finding 'Empty except'
ChenZiHong-Gavin Nov 24, 2025
f391c24
perf: optimize read_files.py by deleting implementation of ray.data.D…
ChenZiHong-Gavin Nov 24, 2025
e2bf272
Merge branch 'feature/map-and-all-reduce' of https://github.com/open-…
ChenZiHong-Gavin Nov 24, 2025
1 change: 1 addition & 0 deletions .pylintrc
@@ -452,6 +452,7 @@ disable=raw-checker-failed,
         R0917, # Too many positional arguments (6/5) (too-many-positional-arguments)
         C0103,
         E0401,
+        W0703, # Catching too general exception Exception
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
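For context, W0703 is pylint's broad-except check. A minimal sketch (illustrative only, not code from this PR) of the pattern it flags, which no longer needs per-line suppressions once the rule is disabled globally:

import requests


def head_ok(url: str) -> bool:
    # Catching Exception (rather than, say, requests.RequestException)
    # is exactly what W0703 "Catching too general exception Exception" flags.
    try:
        return requests.head(url, timeout=3).status_code == 200
    except Exception:
        return False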
95 changes: 55 additions & 40 deletions graphgen/bases/base_reader.py
@@ -1,8 +1,10 @@
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 
+import pandas as pd
 import requests
+from ray.data import Dataset
 
 
 class BaseReader(ABC):
@@ -14,52 +16,65 @@ def __init__(self, text_column: str = "content"):
         self.text_column = text_column
 
     @abstractmethod
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
+    def read(self, input_path: Union[str, List[str]]) -> Dataset:
         """
         Read data from the specified file path.
 
-        :param file_path: Path to the input file.
-        :return: List of dictionaries containing the data.
+        :param input_path: Path to the input file or list of file paths.
+        :return: Ray Dataset containing the read data.
         """
 
-    @staticmethod
-    def filter(data: List[dict]) -> List[dict]:
+    def _should_keep_item(self, item: Dict[str, Any]) -> bool:
         """
-        Filter out entries with empty or missing text in the specified column.
+        Determine whether to keep the given item based on the text column.
 
-        :param data: List of dictionaries containing the data.
-        :return: Filtered list of dictionaries.
+        :param item: Dictionary representing a data entry.
+        :return: True if the item should be kept, False otherwise.
         """
+        item_type = item.get("type")
+        assert item_type in [
+            "text",
+            "image",
+            "table",
+            "equation",
+            "protein",
+        ], f"Unsupported item type: {item_type}"
+        if item_type == "text":
+            content = item.get(self.text_column, "").strip()
+            return bool(content)
+        return True
 
-        def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
-            """
-            Check if an image exists at the given local path or URL.
-            :param path_or_url: Local file path or remote URL of the image.
-            :param timeout: Timeout for remote URL requests in seconds.
-            :return: True if the image exists, False otherwise.
-            """
-            if not path_or_url:
-                return False
-            if not path_or_url.startswith(("http://", "https://", "ftp://")):
-                path = path_or_url.replace("file://", "", 1)
-                path = os.path.abspath(path)
-                return os.path.isfile(path)
-            try:
-                resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
-                return resp.status_code == 200
-            except requests.RequestException:
-                return False
+    def _validate_batch(self, batch: pd.DataFrame) -> pd.DataFrame:
+        """
+        Validate data format.
+        """
+        if "type" not in batch.columns:
+            raise ValueError(f"Missing 'type' column. Found: {list(batch.columns)}")
 
-        filtered_data = []
-        for item in data:
-            if item.get("type") == "text":
-                content = item.get("content", "").strip()
-                if content:
-                    filtered_data.append(item)
-            elif item.get("type") in ("image", "table", "equation"):
-                img_path = item.get("img_path")
-                if _image_exists(img_path):
-                    filtered_data.append(item)
-            else:
-                filtered_data.append(item)
-        return filtered_data
+        if "text" in batch["type"].values:
+            if self.text_column not in batch.columns:
+                raise ValueError(
+                    f"Missing '{self.text_column}' column for text documents"
+                )
+
+        return batch
+
+    @staticmethod
+    def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
+        """
+        Check if an image exists at the given local path or URL.
+        :param path_or_url: Local file path or remote URL of the image.
+        :param timeout: Timeout for remote URL requests in seconds.
+        :return: True if the image exists, False otherwise.
+        """
+        if not path_or_url:
+            return False
+        if not path_or_url.startswith(("http://", "https://", "ftp://")):
+            path = path_or_url.replace("file://", "", 1)
+            path = os.path.abspath(path)
+            return os.path.isfile(path)
+        try:
+            resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
+            return resp.status_code == 200
+        except requests.RequestException:
+            return False
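To make the new reader contract concrete, here is a minimal hypothetical subclass; the class name and the specific ray.data calls are assumptions for illustration, not one of the repository's actual readers:

from typing import List, Union

import ray.data
from ray.data import Dataset

from graphgen.bases.base_reader import BaseReader


class PlainTextReader(BaseReader):
    """Sketch: emit one text item per input line."""

    def read(self, input_path: Union[str, List[str]]) -> Dataset:
        # ray.data.read_text yields one row per line, in a "text" column.
        ds = ray.data.read_text(input_path)
        # Normalize rows to the {"type", text_column} schema the base class checks.
        ds = ds.map(lambda row: {"type": "text", self.text_column: row["text"]})
        # Validate the schema per batch, then drop empty text rows.
        ds = ds.map_batches(self._validate_batch, batch_format="pandas")
        return ds.filter(self._should_keep_item)

A design note on the change itself: filtering moves from a list-based @staticmethod to a per-item predicate (_should_keep_item) and a per-batch check (_validate_batch), which are the shapes Ray Data's filter and map_batches operators compose with.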
4 changes: 2 additions & 2 deletions graphgen/bases/base_splitter.py
@@ -33,7 +33,7 @@ def split_text(self, text: str) -> List[str]:
         """
         Split the input text into smaller chunks.
 
-        :param text: The input text to be split.
+        :param text: The input text to be chunked.
         :return: A list of text chunks.
         """
 
@@ -111,7 +111,7 @@ def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
 def _split_text_with_regex(
     text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
 ) -> List[str]:
-    # Now that we have the separator, split the text
+    # Now that we have the separator, chunk the text
     if separator:
         if keep_separator:
             # The parentheses in the pattern keep the delimiters in the result.
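Since the hunk above cuts off at the keep_separator branch, a standalone sketch of the splitting behavior may help; this is reconstructed from the signature and the capturing-parentheses comment, an assumption rather than the repository's exact implementation:

import re
from typing import List, Literal, Union


def split_with_separator(
    text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
) -> List[str]:
    if separator:
        if keep_separator:
            # A capturing group makes re.split return the delimiters too.
            parts = re.split(f"({separator})", text)
            if keep_separator == "end":
                # Attach each separator to the chunk before it.
                parts = ["".join(p) for p in zip(parts[0::2], parts[1::2] + [""])]
            else:  # True or "start": attach each separator to the chunk after it.
                merged = ["".join(p) for p in zip(parts[1::2], parts[2::2])]
                parts = ([parts[0]] + merged) if parts[0] else merged
        else:
            parts = re.split(separator, text)
    else:
        parts = list(text)
    return [p for p in parts if p]


# Example: split_with_separator("a.b.c", r"\.", "end") -> ["a.", "b.", "c"]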
2 changes: 1 addition & 1 deletion graphgen/configs/search_config.yaml
@@ -1,7 +1,7 @@
 pipeline:
   - name: read
     params:
-      input_file: resources/input_examples/search_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 
   - name: search
     params:
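The demo file's exact fields are not shown in this diff, but given the reader schema above, each line of a .jsonl input would plausibly be a self-contained JSON object along these lines (hypothetical example):

{"type": "text", "content": "Example document text to search and expand."}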