Skip to content

feat: add tiered fallback for pdf #289

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
15 changes: 14 additions & 1 deletion _test_unstructured_client/unit/test_pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest
from pypdf import PdfReader

from unstructured_client._hooks.custom.pdf_utils import check_pdf, PDFValidationError
from unstructured_client._hooks.custom.pdf_utils import check_pdf, read_pdf, PDFValidationError
from _test_unstructured_client.unit_utils import sample_docs_path


Expand All @@ -23,6 +23,7 @@ def test_check_pdf_with_valid_pdf():
assert isinstance(result, PdfReader)


# TODO(klaijan) - add pdf file when file is ready
@pytest.mark.parametrize(
("pdf_name", "expected_error_message"),
[
Expand Down Expand Up @@ -51,3 +52,15 @@ def test_check_pdf_raises_pdf_validation_error(
check_pdf(pdf)

assert exc_info.value.message == expected_error_message


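# TODO(klaijan) - this test is disabled by wrapping it in a string literal; re-enable it once the sample PDF is available (the ".pdf" path below looks like a placeholder - confirm the real file name).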
"""
def test_check_read_pdf():
pdf_path = sample_docs_path(".pdf")
with open(pdf_path, "rb") as f:
pdf_content = f.read()
pdf = read_pdf(pdf_content)
result = check_pdf(pdf)
assert isinstance(result, PdfReader)
"""
50 changes: 47 additions & 3 deletions src/unstructured_client/_hooks/custom/pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,43 @@ def __init__(self, message: str):


def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
    """Read a PDF, falling back to extracting a PDF payload embedded in the input.

    The input is first parsed as-is via ``read_pdf_raw``. If that fails, the
    raw bytes are searched for a ``%PDF-`` header and a ``%%EOF`` trailer, and
    the span between them is parsed and validated with ``check_pdf``.

    Args:
        pdf_file: The PDF content, either as bytes or a binary file-like object.

    Returns:
        A PdfReader if either attempt succeeds, None otherwise.

    Raises:
        IOError: If ``pdf_file`` is neither bytes nor an object with ``read()``,
            or if reading the stream fails.
    """
    reader = read_pdf_raw(pdf_file=pdf_file)
    if reader is not None:
        return reader

    # TODO(klaijan) - remove once debugged
    pdf_logger.debug("Primary PdfReader parse failed, attempting multipart and raw extraction fallbacks.")

    # Load the raw bytes so they can be searched for the PDF markers.
    if isinstance(pdf_file, bytes):
        raw = pdf_file
    elif hasattr(pdf_file, "read"):
        try:
            # The primary parse attempt may have moved the stream position.
            pdf_file.seek(0)
            raw = pdf_file.read()
        except Exception as e:
            raise IOError(f"Failed to read file stream: {e}") from e
    else:
        raise IOError("Expected bytes or a file-like object with 'read()' method")

    header = b"%PDF-"
    trailer = b"%%EOF"

    try:
        start = raw.find(header)
        if start != -1:
            # Use the LAST trailer: linearized and incrementally-updated PDFs
            # contain several %%EOF markers, and cutting at the first one would
            # drop most of the document.
            trailer_pos = raw.rfind(trailer)
            if trailer_pos > start:
                end = trailer_pos + len(trailer)
            else:
                # No trailer after the header: keep everything up to the end of
                # the data and let the non-strict parser decide.
                end = len(raw)
            sliced = raw[start:end]
            pdf = PdfReader(io.BytesIO(sliced), strict=False)
            return check_pdf(pdf)
    except Exception as e:
        # Best-effort fallback: any failure here just means "not a usable PDF".
        pdf_logger.debug("%%PDF- slicing fallback failed: %s", e)

    return None


def read_pdf_raw(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
    """Reads the given PDF file and validates it.

    Args:
        pdf_file: The PDF content, either as bytes or a binary file-like object.

    Returns:
        The PdfReader object if the file is a PDF that passes ``check_pdf``,
        None otherwise.
    """
    try:
        if isinstance(pdf_file, bytes):
            # PdfReader needs a stream, so wrap raw bytes in an in-memory buffer.
            content = cast(bytes, pdf_file)
            pdf_file = io.BytesIO(content)
        reader = PdfReader(pdf_file, strict=False)
        return check_pdf(reader)
    except (PdfReadError, UnicodeDecodeError) as e:
        pdf_logger.debug("Read pdf failed: %s", e)
        return None
    except PDFValidationError as e:
        pdf_logger.debug("Check pdf failed: %s", e)
        return None
    except Exception as e:
        # Deliberate catch-all: callers treat any failure as "not a PDF".
        pdf_logger.debug("An unexpected error occurred: %s", e)
        return None


Expand Down