diff --git a/graphgen/models/reader/csv_reader.py b/graphgen/models/reader/csv_reader.py index 97b26c68..bc865a3b 100644 --- a/graphgen/models/reader/csv_reader.py +++ b/graphgen/models/reader/csv_reader.py @@ -6,11 +6,20 @@ class CSVReader(BaseReader): + """ + Reader for CSV files. + Columns: + - type (required): The type of the document (e.g., "text", "image", etc.) + - if type is "text", the text column (self.text_column) must be present. + """ + def read(self, file_path: str) -> List[Dict[str, Any]]: df = pd.read_csv(file_path) for _, row in df.iterrows(): - if "type" in row and row["type"] == "text" and self.text_column not in row: + if "type" not in row: + raise ValueError(f"Missing 'type' column in document: {row.to_dict()}") + if row["type"] == "text" and self.text_column not in row: raise ValueError( f"Missing '{self.text_column}' in document: {row.to_dict()}" ) diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py index 943fbcab..8253041c 100644 --- a/graphgen/models/reader/json_reader.py +++ b/graphgen/models/reader/json_reader.py @@ -5,11 +5,20 @@ class JSONReader(BaseReader): + """ + Reader for JSON files. + Columns: + - type (required): The type of the document (e.g., "text", "image", etc.) + - if type is "text", the text column (self.text_column) must be present. + """ + def read(self, file_path: str) -> List[Dict[str, Any]]: with open(file_path, "r", encoding="utf-8") as f: data = json.load(f) if isinstance(data, list): for doc in data: + if "type" not in doc: + raise ValueError(f"Missing 'type' in document: {doc}") if doc.get("type") == "text" and self.text_column not in doc: raise ValueError( f"Missing '{self.text_column}' in document: {doc}" diff --git a/graphgen/models/reader/jsonl_reader.py b/graphgen/models/reader/jsonl_reader.py index be9f1cca..31bc3195 100644 --- a/graphgen/models/reader/jsonl_reader.py +++ b/graphgen/models/reader/jsonl_reader.py @@ -6,12 +6,21 @@ class JSONLReader(BaseReader): + """ + Reader for JSONL files. + Columns: + - type (required): The type of the document (e.g., "text", "image", etc.)
+ - if type is "text", the text column (self.text_column) must be present. + """ + def read(self, file_path: str) -> List[Dict[str, Any]]: docs = [] with open(file_path, "r", encoding="utf-8") as f: for line in f: try: doc = json.loads(line) + if "type" not in doc: + raise ValueError(f"Missing 'type' in document: {doc}") if doc.get("type") == "text" and self.text_column not in doc: raise ValueError( f"Missing '{self.text_column}' in document: {doc}" diff --git a/graphgen/models/reader/parquet_reader.py b/graphgen/models/reader/parquet_reader.py index 34194a1b..a325b876 100644 --- a/graphgen/models/reader/parquet_reader.py +++ b/graphgen/models/reader/parquet_reader.py @@ -8,6 +8,9 @@ class ParquetReader(BaseReader): """ Read parquet files, requiring the schema to be restored to List[Dict[str, Any]]. + Columns: + - type (required): The type of the document (e.g., "text", "image", etc.) + - if type is "text", the text column (self.text_column) must be present. """ def read(self, file_path: str) -> List[Dict[str, Any]]: @@ -15,6 +18,8 @@ def read(self, file_path: str) -> List[Dict[str, Any]]: data: List[Dict[str, Any]] = df.to_dict(orient="records") for doc in data: + if "type" not in doc: + raise ValueError(f"Missing 'type' in document: {doc}") if doc.get("type") == "text" and self.text_column not in doc: raise ValueError(f"Missing '{self.text_column}' in document: {doc}") return self.filter(data) diff --git a/graphgen/models/reader/pickle_reader.py b/graphgen/models/reader/pickle_reader.py index 96cf8bca..1a11dc11 100644 --- a/graphgen/models/reader/pickle_reader.py +++ b/graphgen/models/reader/pickle_reader.py @@ -7,6 +7,10 @@ class PickleReader(BaseReader): """ Read pickle files, requiring the top-level object to be List[Dict[str, Any]]. + + Columns: + - type (required): The type of the document (e.g., "text", "image", etc.) + - if type is "text", the text column (self.text_column) must be present.
""" def read(self, file_path: str) -> List[Dict[str, Any]]: @@ -19,6 +23,8 @@ def read(self, file_path: str) -> List[Dict[str, Any]]: for doc in data: if not isinstance(doc, dict): raise ValueError("Every item in the list must be a dict.") + if "type" not in doc: + raise ValueError(f"Missing 'type' in document: {doc}") if doc.get("type") == "text" and self.text_column not in doc: raise ValueError(f"Missing '{self.text_column}' in document: {doc}")