Thanks to visit codestin.com
Credit goes to github.com

Skip to content

feat: add support for tall pages in pdfs by splitting them horizontally #292

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 208 additions & 2 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import asyncio
import copy
import io
import json
import logging
Expand All @@ -13,6 +14,8 @@
from pathlib import Path
from typing import Any, Coroutine, Optional, Tuple, Union, cast, Generator, BinaryIO

from PIL import Image

import aiofiles
import httpx
import nest_asyncio # type: ignore
Expand Down Expand Up @@ -55,6 +58,7 @@
MAX_PAGES_PER_SPLIT = 20
HI_RES_STRATEGY = 'hi_res'
MAX_PAGE_LENGTH = 4000
TALL_PAGE_ASPECT_RATIO_THRESHOLD = 1.5


async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]:
Expand Down Expand Up @@ -304,6 +308,10 @@ def before_request(
return request

pdf = pdf_utils.check_pdf(pdf)

original_page_count = len(pdf.pages)
pdf = self._split_tall_pages(pdf)
image_processing_performed = (len(pdf.pages) != original_page_count)

starting_page_number = form_utils.get_starting_page_number(
form_data,
Expand Down Expand Up @@ -349,10 +357,30 @@ def before_request(
num_pages=page_count, concurrency_level=concurrency_level
)

# If the doc is small enough, and we aren't slicing it with a page range:
# If the doc is small enough,
# and we aren't slicing it with a page range,
# and no image processing (horizontal slicing) was performed:
# do not split, just continue with the original request
if split_size >= page_count and page_count == len(pdf.pages):
if split_size >= page_count and page_count == len(pdf.pages) and not image_processing_performed:
return request

# If image processing was performed, we need to send the processed PDF even for single pages
if image_processing_performed and len(pdf.pages) == 1:
# Create a single chunk with the processed PDF
processed_pdf_data = io.BytesIO()
pdf_writer = PdfWriter()
pdf_writer.add_page(pdf.pages[0])
pdf_writer.write(processed_pdf_data)
processed_pdf_data.seek(0)

# Create new request with processed PDF
processed_request = request_utils.create_pdf_chunk_request(
form_data=form_data,
pdf_chunk=(processed_pdf_data, 1),
filename=pdf_file_meta["filename"],
original_request=request,
)
return processed_request

pdf = self._trim_large_pages(pdf, form_data)

Expand Down Expand Up @@ -445,6 +473,184 @@ async def call_api_partial(

return response

def _split_tall_pages(self, pdf: PdfReader) -> PdfReader:
"""Checks for and splits pages that are disproportionately tall."""
# Initial analysis of the PDF structure
writer = PdfWriter()
any_page_split = False

for page in pdf.pages:
height = float(page.mediabox.height)
width = float(page.mediabox.width)

if width == 0: # Avoid division by zero for invalid pages
writer.add_page(page)
continue

aspect_ratio = height / width
logger.info(f"Page aspect ratio: {aspect_ratio:.2f} (threshold: {TALL_PAGE_ASPECT_RATIO_THRESHOLD})")

if aspect_ratio <= TALL_PAGE_ASPECT_RATIO_THRESHOLD:
writer.add_page(page)
continue

any_page_split = True
num_splits = math.ceil(aspect_ratio / TALL_PAGE_ASPECT_RATIO_THRESHOLD)
logger.info(f"Target splits: {num_splits} parts")

try:
split_pages = self._split_page_with_image_processing(page, num_splits)
if split_pages and len(split_pages) > 1:
logger.info(f"Image processing succeeded: {len(split_pages)} parts")
for split_page in split_pages:
writer.add_page(split_page)
else:
logger.warning("Image processing failed - no valid splits returned")
self._add_media_box_split_pages(writer, page, num_splits, height)
except Exception as e:
logger.error(f"Image processing exception: {e}")
self._add_media_box_split_pages(writer, page, num_splits, height)

if not any_page_split:
return pdf

# If we split any pages, return a new PdfReader from the modified content
buffer = io.BytesIO()
writer.write(buffer)
buffer.seek(0)
return PdfReader(buffer)

def _split_page_with_image_processing(self, page, num_splits):
"""Split a page by extracting and processing its images."""
if "/Resources" not in page or "/XObject" not in page["/Resources"]:
return None

xobjects = page["/Resources"]["/XObject"]

for obj_name, obj in xobjects.items():
if hasattr(obj, 'get_object'):
obj = obj.get_object()

if obj.get("/Subtype") == "/Image":
width = int(obj.get("/Width", 0))
height = int(obj.get("/Height", 0))
original_pixels = width * height

image_data = self._extract_image_data(obj)
if not image_data:
continue

try:
pil_image = Image.open(io.BytesIO(image_data))
except Exception:
continue

# Calculate target resolution to stay under API limits
# API limit is ~179M pixels, let's target 80M pixels total for safety margin
target_pixels_total = 80_000_000
target_pixels_per_split = target_pixels_total // num_splits

# Calculate scale factor if we need to reduce resolution
scale_factor = 1.0
if original_pixels > target_pixels_per_split:
scale_factor = (target_pixels_per_split / original_pixels) ** 0.5

# Apply scaling
new_width = int(pil_image.width * scale_factor)
new_height = int(pil_image.height * scale_factor)
pil_image = pil_image.resize((new_width, new_height), Image.Resampling.LANCZOS)

strip_height = pil_image.height // num_splits
total_split_pixels = 0
split_pages = []

for i in range(num_splits):
top = i * strip_height
bottom = min((i + 1) * strip_height, pil_image.height)

cropped_image = pil_image.crop((0, top, pil_image.width, bottom))
strip_pixels = cropped_image.width * cropped_image.height
total_split_pixels += strip_pixels

new_page = self._create_page_with_image(cropped_image, page)
if new_page:
split_pages.append(new_page)
else:
return None

if split_pages and len(split_pages) == num_splits:
return split_pages

return None

def _extract_image_data(self, image_obj):
"""Extract raw image data from a PDF image object."""
try:
if "/Filter" in image_obj:
filter_type = image_obj["/Filter"]

if filter_type in ["/DCTDecode", "/JPXDecode"]:
# JPEG or JPEG2000 - data is already compressed
data = image_obj._data
return data
elif filter_type == "/FlateDecode":
# PNG-like compression
import zlib
compressed_data = image_obj._data
data = zlib.decompress(compressed_data)
return data

# Fallback to raw data
data = image_obj._data
return data

except Exception:
return None

def _create_page_with_image(self, pil_image, original_page):
"""Create a new PDF page containing the given PIL image."""
try:
img_buffer = io.BytesIO()

# Convert to RGB if necessary
if pil_image.mode != 'RGB':
pil_image = pil_image.convert('RGB')

# Save the image as PDF
pil_image.save(img_buffer, format='PDF')
img_buffer.seek(0)

# Create a new PDF reader from the image
img_pdf = PdfReader(img_buffer)
if not img_pdf.pages:
return None

new_page = img_pdf.pages[0]
return new_page

except Exception:
return None

def _add_media_box_split_pages(self, writer, page, num_splits, page_height):
"""Fallback method to add pages with media box splitting (original approach)."""
split_height = page_height / num_splits

for i in range(num_splits):
# Create a deep copy to modify the media box independently
new_page = copy.deepcopy(page)

# Calculate new coordinates for the crop
top_coord = page.mediabox.top - (i * split_height)
bottom_coord = page.mediabox.top - ((i + 1) * split_height)

# Set the new media box to crop the page
new_page.mediabox.lower_left = (page.mediabox.left, bottom_coord)
new_page.mediabox.lower_right = (page.mediabox.right, bottom_coord)
new_page.mediabox.upper_left = (page.mediabox.left, top_coord)
new_page.mediabox.upper_right = (page.mediabox.right, top_coord)

writer.add_page(new_page)

def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfReader:
if form_data['strategy'] != HI_RES_STRATEGY:
return pdf
Expand Down
63 changes: 63 additions & 0 deletions tryout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import os
import json
import logging
from datetime import datetime
import unstructured_client
from unstructured_client.models import shared, operations

# Set up verbose logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("unstructured-client")
logger.setLevel(logging.INFO)

def run_partition_with_hook(pdf_path: str):
api_url = os.getenv("URL", None)
api_key = os.getenv("KEY", None)

client = unstructured_client.UnstructuredClient(
server_url=api_url,
api_key_auth=api_key,
)

with open(pdf_path, "rb") as f:
files = shared.Files(
content=f.read(),
file_name=pdf_path,
)

partition_parameters = shared.PartitionParameters(
files=files,
strategy="hi_res",
# This is the crucial parameter that activates the hook's logic
split_pdf_page=True,
)

request = operations.PartitionRequest(
partition_parameters=partition_parameters
)

resp = client.general.partition(request=request)

print("\n--- Partitioning Response ---")
if resp.elements:
print(f"Successfully partitioned document into {len(resp.elements)} elements.")

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"response_{timestamp}.json"
elements_data = [element for element in resp.elements]

with open(filename, 'w') as f:
json.dump(elements_data, f, indent=2)
print(f"✅ Saved to: {filename}")

else:
print("Partitioning call completed, but no elements were returned.")

if __name__ == "__main__":
test_filename = "input.pdf"

if os.path.exists(test_filename):
# Run the test
run_partition_with_hook(test_filename)
else:
print(f"Error: The file '{test_filename}' was not found.")
Loading