Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit f114624

Browse files
holtskinner and galz10 authored
feat: Add PDF Splitter (#51)
* feat: Add PDF Splitter * fix: Updated setup.py syntax * fix: Fixed initializer error * Updated Test to include a multi-page split * formatting fix * Add Pdf Split Example * Adjusted mkdir in tests * Added pikepdf to test dependencies --------- Co-authored-by: Gal Zahavi <[email protected]>
1 parent b02670f commit f114624

File tree

14 files changed

+609
-5
lines changed

14 files changed

+609
-5
lines changed

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
from google.cloud.documentai_toolbox.wrappers.page import FormField
3131
from google.cloud.documentai_toolbox.wrappers.entity import Entity
3232

33+
from pikepdf import Pdf
34+
3335

3436
def _entities_from_shards(
3537
shards: List[documentai.Document],
@@ -365,3 +367,44 @@ def get_entity_by_type(self, target_type: str) -> List[Entity]:
365367
366368
"""
367369
return [entity for entity in self.entities if entity.type_ == target_type]
370+
371+
def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
    r"""Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.

    One PDF is written per entity that carries a page range. Each output
    file is named ``<input name>_pg<range>_<entity type><input extension>``
    using 1-based page numbers.

    Entities without a page anchor never get ``start_page``/``end_page``
    assigned, so they are skipped rather than aborting the whole split
    with an ``AttributeError``.

    Args:
        pdf_path (str):
            Required. The path to the PDF file.
        output_path (str):
            Required. The path to the output directory.
    Returns:
        List[str]:
            A list of output pdf files. These are file names only; they
            are written inside ``output_path``.
    """
    output_files: List[str] = []
    input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
    with Pdf.open(pdf_path) as f:
        for entity in self.entities:
            # Page bounds are only populated when the entity has page refs.
            start_page = getattr(entity, "start_page", None)
            end_page = getattr(entity, "end_page", None)
            if start_page is None or end_page is None:
                continue

            subdoc_type = entity.type_ or "subdoc"

            # Stored page indices are 0-based; file names use 1-based numbers.
            if start_page == end_page:
                page_range = f"pg{start_page + 1}"
            else:
                page_range = f"pg{start_page + 1}-{end_page + 1}"

            output_filename = (
                f"{input_filename}_{page_range}_{subdoc_type}{input_extension}"
            )

            subdoc = Pdf.new()
            for page_num in range(start_page, end_page + 1):
                subdoc.pages.append(f.pages[page_num])

            subdoc.save(
                os.path.join(
                    output_path,
                    output_filename,
                ),
                # Do not write a PDF version lower than the source file's.
                min_version=f.pdf_version,
            )
            output_files.append(output_filename)
    return output_files

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/entity.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,13 @@ class Entity:
3737
documentai_entity: documentai.Document.Entity = dataclasses.field(repr=False)
3838
type_: str = dataclasses.field(init=False)
3939
mention_text: str = dataclasses.field(init=False, default="")
40+
# Only Populated for Splitter/Classifier Output
41+
start_page: int = dataclasses.field(init=False)
42+
end_page: int = dataclasses.field(init=False)
4043

4144
def __post_init__(self):
    """Copy the commonly used values off the wrapped Document AI entity.

    ``type_`` and ``mention_text`` are always copied. ``start_page`` and
    ``end_page`` are assigned only when the entity's page anchor has at
    least one page ref; otherwise they are left unset.
    """
    source_entity = self.documentai_entity
    self.type_ = source_entity.type_
    self.mention_text = source_entity.mention_text

    page_refs = source_entity.page_anchor.page_refs
    if page_refs:
        # The first and last page refs bound the entity's page range.
        first_ref, last_ref = page_refs[0], page_refs[-1]
        self.start_page = int(first_ref.page)
        self.end_page = int(last_ref.page)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
17+
# [START documentai_toolbox_split_pdf]
18+
19+
from google.cloud.documentai_toolbox import document
20+
21+
# TODO(developer): Uncomment these variables before running the sample.
22+
# Given a local document.proto or sharded document.proto from a splitter/classifier in path
23+
# document_path = "path/to/local/document.json"
24+
# pdf_path = "path/to/local/document.pdf"
25+
# output_path = "resources/output/"
26+
27+
28+
def split_pdf_sample(document_path: str, pdf_path: str, output_path: str) -> None:
    """Split a local PDF using a wrapped splitter/classifier document.

    Args:
        document_path (str):
            Path passed to ``Document.from_document_path``.
        pdf_path (str):
            Path to the local PDF file to split.
        output_path (str):
            Directory passed to ``split_pdf`` as the output location.
    """
    toolbox_document = document.Document.from_document_path(
        document_path=document_path
    )
    split_file_names = toolbox_document.split_pdf(
        pdf_path=pdf_path,
        output_path=output_path,
    )

    print("Document Successfully Split")
    for split_file_name in split_file_names:
        print(split_file_name)
38+
39+
40+
# [END documentai_toolbox_split_pdf]
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
16+
import os
17+
import shutil
18+
19+
import pytest
20+
from samples.snippets import split_pdf_sample
21+
22+
document_path = "../../tests/unit/resources/splitter/procurement_splitter_output.json"
23+
pdf_path = "../../tests/unit/resources/procurement_multi_document.pdf"
24+
output_path = "resources/output/"
25+
26+
27+
def test_split_pdf_sample(capsys: pytest.CaptureFixture) -> None:
    """Run the split-PDF sample and check its printed output.

    The output directory is created before the sample runs and removed
    afterwards, even when an assertion fails, so one failed run cannot
    break the next.
    """
    # exist_ok: tolerate a directory left behind by an earlier aborted run.
    os.makedirs(output_path, exist_ok=True)
    try:
        # NOTE(review): the inputs are re-expressed relative to this file's
        # directory, which presumably assumes the test runs from there - confirm.
        current_directory = os.path.dirname(__file__)
        rel_document_path = os.path.relpath(document_path, current_directory)
        rel_pdf_path = os.path.relpath(pdf_path, current_directory)

        split_pdf_sample.split_pdf_sample(
            document_path=rel_document_path,
            pdf_path=rel_pdf_path,
            output_path=output_path,
        )
        out, _ = capsys.readouterr()

        assert "Document Successfully Split" in out
        assert "procurement_multi_document_pg1_invoice_statement.pdf" in out

        assert os.path.exists(output_path)
    finally:
        # Best-effort cleanup so it never masks the real test failure.
        shutil.rmtree(output_path, ignore_errors=True)

packages/google-cloud-documentai-toolbox/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
"google-cloud-documentai >= 1.2.1, < 3.0.0dev",
5353
"google-cloud-storage >= 1.31.0, < 3.0.0dev",
5454
"numpy >= 1.18.1",
55+
"pikepdf >= 6.2.9, < 8.0.0",
5556
),
5657
python_requires=">=3.7",
5758
classifiers=[

packages/google-cloud-documentai-toolbox/testing/constraints-3.10.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ pandas
77
proto-plus
88
grpc-google-iam-v1
99
google-cloud-documentai
10-
google-cloud-storage
10+
google-cloud-storage
11+
pikepdf

packages/google-cloud-documentai-toolbox/testing/constraints-3.11.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ pandas
77
proto-plus
88
grpc-google-iam-v1
99
google-cloud-documentai
10-
google-cloud-storage
10+
google-cloud-storage
11+
pikepdf

packages/google-cloud-documentai-toolbox/testing/constraints-3.7.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ proto-plus== 1.22.0
1111
grpc-google-iam-v1==0.12.4
1212
google-cloud-documentai==1.2.1
1313
google-cloud-storage== 1.31.0
14-
numpy==1.18.1
14+
numpy==1.18.1
15+
pikepdf==6.2.9

packages/google-cloud-documentai-toolbox/testing/constraints-3.8.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ pandas
77
proto-plus
88
grpc-google-iam-v1
99
google-cloud-documentai
10-
google-cloud-storage
10+
google-cloud-storage
11+
pikepdf

packages/google-cloud-documentai-toolbox/testing/constraints-3.9.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ pandas
77
proto-plus
88
grpc-google-iam-v1
99
google-cloud-documentai
10-
google-cloud-storage
10+
google-cloud-storage
11+
pikepdf

0 commit comments

Comments
 (0)