Skip to content

Commit 90f3a72

Browse files
fix: support content type for input data
1 parent ee2e59a commit 90f3a72

File tree

5 files changed

+13
-13
lines changed

5 files changed

+13
-13
lines changed

graphgen/models/reader/csv_reader.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ class CSVReader(BaseReader):
99
def read(self, file_path: str) -> List[Dict[str, Any]]:
1010

1111
df = pd.read_csv(file_path)
12-
if self.text_column not in df.columns:
13-
raise ValueError(f"Missing '{self.text_column}' column in CSV file.")
14-
return df.to_dict(orient="records")
12+
for _, row in df.iterrows():
13+
if row.get("type") == "text" and self.text_column not in row:
14+
raise ValueError(
15+
f"Missing '{self.text_column}' in document: {row.to_dict()}"
16+
)
17+
return self.filter(df.to_dict(orient="records"))

graphgen/models/reader/json_reader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
1010
data = json.load(f)
1111
if isinstance(data, list):
1212
for doc in data:
13-
if self.text_column not in doc:
13+
if doc["type"] == "text" and self.text_column not in doc:
1414
raise ValueError(
1515
f"Missing '{self.text_column}' in document: {doc}"
1616
)
17-
return data
17+
return self.filter(data)
1818
raise ValueError("JSON file must contain a list of documents.")

graphgen/models/reader/jsonl_reader.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,11 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
1212
for line in f:
1313
try:
1414
doc = json.loads(line)
15-
if self.text_column in doc:
16-
docs.append(doc)
17-
else:
15+
if doc["type"] == "text" and self.text_column not in doc:
1816
raise ValueError(
1917
f"Missing '{self.text_column}' in document: {doc}"
2018
)
19+
docs.append(doc)
2120
except json.JSONDecodeError as e:
2221
logger.error("Error decoding JSON line: %s. Error: %s", line, e)
23-
return docs
22+
return self.filter(docs)

graphgen/models/reader/pdf_reader.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def read(self, file_path: str, **override) -> List[Dict[str, Any]]:
7474
kwargs = {**self._default_kwargs, **override}
7575

7676
mineru_result = self._call_mineru(pdf_path, kwargs)
77-
return mineru_result
77+
return self.filter(mineru_result)
7878

7979
def _call_mineru(
8080
self, pdf_path: Path, kwargs: Dict[str, Any]
@@ -172,8 +172,6 @@ def _try_load_cached_result(
172172
for key in ("page_idx", "bbox", "text_level"):
173173
if item.get(key) is not None:
174174
del item[key]
175-
if item["type"] == "text" and not item["content"].strip():
176-
continue
177175
results.append(item)
178176
return results
179177

graphgen/models/reader/txt_reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
1111
line = line.strip()
1212
if line:
1313
docs.append({self.text_column: line})
14-
return docs
14+
return self.filter(docs)

0 commit comments

Comments
 (0)