Thanks to visit codestin.com
Credit goes to github.com

Skip to content

feat: improve PDF validation error handling with FileValidationError … #280

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
## 0.37.3

### Enhancements
* Improve PDF validation error handling by introducing a `FileValidationError` base class for better error abstraction

### Features

### Fixes
* Replace RequestError with PDFValidationError for invalid PDF files to provide more accurate error context

## 0.37.0

### Enhancements
Expand Down
12 changes: 5 additions & 7 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from unittest.mock import MagicMock, patch

import httpx
from httpx import RequestError
import pytest
import requests
from requests_toolbelt import MultipartDecoder
Expand Down Expand Up @@ -467,8 +466,8 @@ def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path
assert result == str(Path(mock_dir).resolve())


def test_before_request_raises_request_error_when_pdf_check_fails():
"""Test that before_request raises RequestError when pdf_utils.check_pdf throws PDFValidationError."""
def test_before_request_raises_pdf_validation_error_when_pdf_check_fails():
"""Test that before_request raises PDFValidationError when pdf_utils.check_pdf throws PDFValidationError."""
hook = SplitPdfHook()

# Initialize the hook with a mock client
Expand Down Expand Up @@ -514,13 +513,12 @@ def test_before_request_raises_request_error_when_pdf_check_fails():
mock_check_pdf.side_effect = pdf_utils.PDFValidationError(error_message)
mock_get_base_url.return_value = "http://localhost:8888"

# Call the method under test and verify it raises RequestError
with pytest.raises(RequestError) as exc_info:
# Call the method under test and verify it raises PDFValidationError
with pytest.raises(pdf_utils.PDFValidationError) as exc_info:
hook.before_request(mock_hook_ctx, mock_request)

# Verify the exception has the correct message and request object
# Verify the exception has the correct message
assert str(exc_info.value) == error_message
assert exc_info.value.request == mock_request

# Verify that the mocked functions were called as expected
mock_get_fields.assert_called_once_with(mock_request)
Expand Down
2 changes: 1 addition & 1 deletion gen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ generation:
oAuth2ClientCredentialsEnabled: false
oAuth2PasswordEnabled: false
python:
version: 0.37.2
version: 0.37.3
additionalDependencies:
dev:
deepdiff: '>=6.0'
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file is autogenerated. We can set the version manually in gen.yaml, and pyproject.toml should pick it up the next time the SDK is rebuilt.

Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "unstructured-client"
version = "0.37.2"
version = "0.37.3"
description = "Python Client SDK for Unstructured API"
authors = [{ name = "Unstructured" },]
readme = "README-PYPI.md"
Expand Down
8 changes: 4 additions & 4 deletions src/unstructured_client/_hooks/custom/pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pypdf.errors import FileNotDecryptedError, PdfReadError

from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
from unstructured_client._hooks.custom.validation_errors import FileValidationError

logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)

Expand All @@ -17,12 +18,11 @@
pdf_logger.setLevel(logging.ERROR)


class PDFValidationError(Exception):
"""Base exception for PDF validation errors."""
class PDFValidationError(FileValidationError):
"""Exception for PDF validation errors."""

def __init__(self, message: str):
self.message = message
super().__init__(self.message)
super().__init__(message, file_type="PDF")


def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
Expand Down
10 changes: 2 additions & 8 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import aiofiles
import httpx
import nest_asyncio # type: ignore
from httpx import AsyncClient, RequestError
from httpx import AsyncClient
from pypdf import PdfReader, PdfWriter

from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
Expand Down Expand Up @@ -303,13 +303,7 @@ def before_request(
if pdf is None:
return request

try:
pdf = pdf_utils.check_pdf(pdf)
except pdf_utils.PDFValidationError as e:
raise RequestError(
message=e.message,
request=request,
) from e
pdf = pdf_utils.check_pdf(pdf)

starting_page_number = form_utils.get_starting_page_number(
form_data,
Expand Down
16 changes: 16 additions & 0 deletions src/unstructured_client/_hooks/custom/validation_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""File validation error classes for the Unstructured client."""

from typing import Optional


class FileValidationError(Exception):
    """Base exception for file validation errors.

    This exception should be raised when a file fails validation
    checks before being processed by the API.

    Args:
        message: Human-readable description of the validation failure. It is
            also passed to ``Exception.__init__``, so ``str(error)`` returns it.
        file_type: Optional label for the kind of file that failed
            validation (for example ``"PDF"``); ``None`` when not given.
    """

    def __init__(self, message: str, file_type: Optional[str] = None):
        # Kept as attributes so handlers can read them without parsing str(error).
        self.message: str = message
        self.file_type: Optional[str] = file_type
        super().__init__(self.message)