Skip to content

Commit a8e484c

Browse files
fix: Return error for invalid PDFs (#277)
- Adds a function that checks PDF files before the request is sent and raises an appropriate error when the file is invalid.
1 parent a8d0075 commit a8e484c

File tree

10 files changed

+204
-2
lines changed

10 files changed

+204
-2
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.37.0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
* Raises an error with an appropriate message when the given PDF file is invalid (corrupted or encrypted).
9+
110
## 0.30.0
211

312
### Enhancements

_sample_docs/failing-encrypted.pdf

936 Bytes
Binary file not shown.

_sample_docs/failing-invalid.pdf

70 Bytes
Binary file not shown.

_sample_docs/failing-missing-pages.pdf

160 Bytes
Binary file not shown.

_sample_docs/failing-missing-root.pdf

4.74 KB
Binary file not shown.

_test_unstructured_client/integration/test_integration.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,3 +348,31 @@ def test_partition_strategy_vlm_anthropic(split_pdf, vlm_model, vlm_model_provid
348348
assert response.status_code == 200
349349
assert len(response.elements) > 0
350350
assert response.elements[0]["metadata"]["partitioner_type"] == "vlm_partition"
351+
352+
353+
def test_returns_422_for_invalid_pdf(
    caplog: pytest.LogCaptureFixture,
    doc_path: Path,
    client: UnstructuredClient,
):
    """Partitioning an invalid PDF raises HTTPValidationError and logs the 422 details.

    The sample file is sent with ``strategy="fast"`` and ``split_pdf_page=True``.
    """
    pdf_name = "failing-invalid.pdf"
    pdf_bytes = (doc_path / pdf_name).read_bytes()

    partition_parameters = shared.PartitionParameters(
        files=shared.Files(content=pdf_bytes, file_name=pdf_name),
        strategy="fast",
        split_pdf_page=True,
    )
    req = operations.PartitionRequest(partition_parameters=partition_parameters)

    with pytest.raises(HTTPValidationError):
        client.general.partition(request=req)

    # The error description and status code are checked via the captured logs.
    logged = caplog.text
    assert "File does not appear to be a valid PDF" in logged
    assert "422" in logged
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from __future__ import annotations
2+
3+
import io
4+
5+
import pytest
6+
from pypdf import PdfReader
7+
8+
from unstructured_client._hooks.custom.pdf_utils import check_pdf, PDFValidationError
9+
from _test_unstructured_client.unit_utils import sample_docs_path
10+
11+
12+
def _open_pdf(pdf_path: str) -> PdfReader:
    """Load the file at ``pdf_path`` into memory and wrap it in a PdfReader."""
    with open(pdf_path, "rb") as pdf_file:
        buffer = io.BytesIO(pdf_file.read())
    return PdfReader(buffer)
16+
17+
18+
def test_check_pdf_with_valid_pdf():
    """check_pdf accepts a well-formed sample PDF and returns a PdfReader."""
    reader = _open_pdf(sample_docs_path("list-item-example-1.pdf"))

    assert isinstance(check_pdf(reader), PdfReader)
24+
25+
26+
@pytest.mark.parametrize(
    ("pdf_name", "expected_error_message"),
    [
        (
            "failing-encrypted.pdf",
            "File is encrypted. Please decrypt it with password.",
        ),
        (
            "failing-missing-root.pdf",
            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
        ),
        (
            "failing-missing-pages.pdf",
            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
        ),
    ],
)
def test_check_pdf_raises_pdf_validation_error(
    pdf_name: str, expected_error_message: str
):
    """Each broken sample PDF makes check_pdf raise PDFValidationError with the exact message."""
    pdf = _open_pdf(sample_docs_path(pdf_name))

    with pytest.raises(PDFValidationError) as exc_info:
        check_pdf(pdf)

    raised = exc_info.value
    assert raised.message == expected_error_message

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
from unittest.mock import MagicMock, patch
99

1010
import httpx
11+
from httpx import RequestError
1112
import pytest
1213
import requests
1314
from requests_toolbelt import MultipartDecoder
1415

16+
from _test_unstructured_client.unit_utils import sample_docs_path
1517
from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
1618
from unstructured_client._hooks.custom.form_utils import (
1719
FormData,
@@ -29,6 +31,7 @@
2931
SplitPdfHook,
3032
get_optimal_split_size, run_tasks,
3133
)
34+
from unstructured_client._hooks.types import BeforeRequestContext
3235
from unstructured_client.models import shared
3336

3437

@@ -462,3 +465,64 @@ def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path
462465
mock_path.assert_called_once_with(mock_dir)
463466
mock_path_instance.exists.assert_called_once()
464467
assert result == str(Path(mock_dir).resolve())
468+
469+
470+
def test_before_request_raises_request_error_when_pdf_check_fails():
    """before_request turns a PDFValidationError from pdf_utils.check_pdf into a RequestError."""
    # Message carried by the simulated validation failure.
    error_message = "File does not appear to be a valid PDF."

    # Hook under test, initialised with a mock client.
    hook = SplitPdfHook()
    hook.sdk_init(base_url="http://localhost:8888", client=MagicMock())

    # Hook context for the partition operation.
    mock_hook_ctx = MagicMock()
    mock_hook_ctx.operation_id = "partition"

    # Outgoing multipart request aimed at localhost.
    mock_request = MagicMock()
    mock_request.headers = {"Content-Type": "multipart/form-data"}
    mock_request.url.host = "localhost"

    # Form data carrying a PDF upload with page splitting enabled.
    mock_pdf_file = MagicMock()
    mock_pdf_file.read.return_value = b"mock_pdf_content"
    mock_form_data = {
        "split_pdf_page": "true",
        "files": {
            "filename": "test.pdf",
            "content_type": "application/pdf",
            "file": mock_pdf_file,
        },
    }

    # Stand-in for the reader object that read_pdf hands to check_pdf.
    mock_pdf_reader = MagicMock()

    with patch(
        "unstructured_client._hooks.custom.request_utils.get_multipart_stream_fields",
        return_value=mock_form_data,
    ) as mock_get_fields, patch(
        "unstructured_client._hooks.custom.pdf_utils.read_pdf",
        return_value=mock_pdf_reader,
    ) as mock_read_pdf, patch(
        "unstructured_client._hooks.custom.pdf_utils.check_pdf",
        side_effect=pdf_utils.PDFValidationError(error_message),
    ) as mock_check_pdf, patch(
        "unstructured_client._hooks.custom.request_utils.get_base_url",
        return_value="http://localhost:8888",
    ):
        with pytest.raises(RequestError) as exc_info:
            hook.before_request(mock_hook_ctx, mock_request)

        # The RequestError carries the validation message and the original request.
        raised = exc_info.value
        assert str(raised) == error_message
        assert raised.request == mock_request

        # Each patched helper was called exactly once with the expected argument.
        mock_get_fields.assert_called_once_with(mock_request)
        mock_read_pdf.assert_called_once_with(mock_pdf_file)
        mock_check_pdf.assert_called_once_with(mock_pdf_reader)

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing import cast, Optional, BinaryIO, Union
66

77
from pypdf import PdfReader
8-
from pypdf.errors import PdfReadError
8+
from pypdf.errors import FileNotDecryptedError, PdfReadError
99

1010
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
1111

@@ -16,6 +16,15 @@
1616
pdf_logger = logging.getLogger("pypdf")
1717
pdf_logger.setLevel(logging.ERROR)
1818

19+
20+
class PDFValidationError(Exception):
    """Raised when a PDF fails validation.

    The human-readable description is available both as ``str(error)`` and
    through the ``message`` attribute.
    """

    def __init__(self, message: str):
        super().__init__(message)
        self.message = message
26+
27+
1928
def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
2029
"""Reads the given PDF file.
2130
@@ -33,3 +42,34 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
3342
return PdfReader(pdf_file, strict=False)
3443
except (PdfReadError, UnicodeDecodeError):
3544
return None
45+
46+
47+
def check_pdf(pdf: PdfReader) -> PdfReader:
    """Validate an already-opened PDF and return it unchanged.

    The reader is probed, in this order, for:
      - encryption (by reading the metadata),
      - a corrupted root object,
      - corrupted pages (every page is loaded).

    Args:
        pdf: The reader to validate.

    Returns:
        The same ``pdf`` object when every probe succeeds.

    Raises:
        PDFValidationError: If the file is encrypted or corrupted.
    """
    try:
        # Raises if the file is encrypted.
        _ = pdf.metadata

        # Raises if the file's root object is corrupted.
        _ = pdf.root_object

        # Raises if the file's pages are corrupted.
        list(pdf.pages)
    except FileNotDecryptedError as err:
        # Handled before the PdfReadError clause so encrypted files get
        # their own dedicated message.
        raise PDFValidationError(
            "File is encrypted. Please decrypt it with password.",
        ) from err
    except PdfReadError as err:
        raise PDFValidationError(
            f"File does not appear to be a valid PDF. Error: {err}",
        ) from err

    return pdf

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import aiofiles
1717
import httpx
1818
import nest_asyncio # type: ignore
19-
from httpx import AsyncClient
19+
from httpx import AsyncClient, RequestError
2020
from pypdf import PdfReader, PdfWriter
2121

2222
from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
@@ -303,6 +303,14 @@ def before_request(
303303
if pdf is None:
304304
return request
305305

306+
try:
307+
pdf = pdf_utils.check_pdf(pdf)
308+
except pdf_utils.PDFValidationError as e:
309+
raise RequestError(
310+
message=e.message,
311+
request=request,
312+
) from e
313+
306314
starting_page_number = form_utils.get_starting_page_number(
307315
form_data,
308316
key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,

0 commit comments

Comments
 (0)