Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion graphgen/models/reader/csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,19 @@


class CSVReader(BaseReader):
"""
Reader for CSV files.
Columns:
- type: The type of the document (e.g., "text", "image", etc.)
- if type is "text", "content" column must be present.
"""

def read(self, file_path: str) -> List[Dict[str, Any]]:

df = pd.read_csv(file_path)
for _, row in df.iterrows():
if "type" in row and row["type"] == "text" and self.text_column not in row:
assert "type" in row, f"Missing 'type' column in document: {row.to_dict()}"
if row["type"] == "text" and self.text_column not in row:
raise ValueError(
f"Missing '{self.text_column}' in document: {row.to_dict()}"
)
Expand Down
8 changes: 8 additions & 0 deletions graphgen/models/reader/json_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,19 @@


class JSONReader(BaseReader):
"""
Reader for JSON files.
Columns:
- type: The type of the document (e.g., "text", "image", etc.)
- if type is "text", "content" column must be present.
"""

def read(self, file_path: str) -> List[Dict[str, Any]]:
with open(file_path, "r", encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, list):
for doc in data:
assert "type" in doc, f"Missing 'type' in document: {doc}"
if doc.get("type") == "text" and self.text_column not in doc:
raise ValueError(
f"Missing '{self.text_column}' in document: {doc}"
Expand Down
8 changes: 8 additions & 0 deletions graphgen/models/reader/jsonl_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,20 @@


class JSONLReader(BaseReader):
"""
Reader for JSONL files.
Columns:
- type: The type of the document (e.g., "text", "image", etc.)
- if type is "text", "content" column must be present.
"""

def read(self, file_path: str) -> List[Dict[str, Any]]:
docs = []
with open(file_path, "r", encoding="utf-8") as f:
for line in f:
try:
doc = json.loads(line)
assert "type" in doc, f"Missing 'type' in document: {doc}"
if doc.get("type") == "text" and self.text_column not in doc:
raise ValueError(
f"Missing '{self.text_column}' in document: {doc}"
Expand Down
4 changes: 4 additions & 0 deletions graphgen/models/reader/parquet_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,17 @@
class ParquetReader(BaseReader):
    """
    Read parquet files, requiring the schema to be restored to List[Dict[str, Any]].

    Columns:
    - type: The type of the document (e.g., "text", "image", etc.)
    - if type is "text", the column named by ``self.text_column`` must be present.
    """

    def read(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Load a parquet file into a list of records and validate each one.

        :param file_path: Path of the parquet file, handed to ``pd.read_parquet``.
        :return: Whatever ``self.filter`` returns for the validated records.
        :raises AssertionError: If a record has no "type" key.
        :raises ValueError: If a record of type "text" lacks ``self.text_column``.
        """
        df = pd.read_parquet(file_path)
        data: List[Dict[str, Any]] = df.to_dict(orient="records")

        for doc in data:
            # Raised explicitly instead of via an ``assert`` statement: asserts
            # are removed when Python runs with -O, which would silently skip
            # this input validation. The exception type and message are kept
            # unchanged so existing callers observe the same failure.
            if "type" not in doc:
                raise AssertionError(f"Missing 'type' in document: {doc}")
            if doc["type"] == "text" and self.text_column not in doc:
                raise ValueError(f"Missing '{self.text_column}' in document: {doc}")
        return self.filter(data)
5 changes: 5 additions & 0 deletions graphgen/models/reader/pickle_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
class PickleReader(BaseReader):
"""
Read pickle files, requiring the top-level object to be List[Dict[str, Any]].

Columns:
- type: The type of the document (e.g., "text", "image", etc.)
- if type is "text", "content" column must be present.
"""

def read(self, file_path: str) -> List[Dict[str, Any]]:
Expand All @@ -19,6 +23,7 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
for doc in data:
if not isinstance(doc, dict):
raise ValueError("Every item in the list must be a dict.")
assert "type" in doc, f"Missing 'type' in document: {doc}"
if doc.get("type") == "text" and self.text_column not in doc:
raise ValueError(f"Missing '{self.text_column}' in document: {doc}")

Expand Down