Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 29 additions & 5 deletions mmda/predictors/hf_predictors/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,31 @@
from mmda.types.names import *


def normalize_bbox(
    bbox,
    page_width: float,
    page_height: float,
    target_width: float,
    target_height: float,
) -> Tuple[float, float, float, float]:
    """
    Rescale a bounding box from page coordinates to the target size.

    The box is rescaled only when the page exceeds the target size along
    at least one axis; otherwise the coordinates are returned unchanged
    (and keep whatever numeric type they came in with).

    Args:
        bbox: Iterable of exactly four values ``(x1, y1, x2, y2)``.
        page_width: Width of the page the box was measured on.
        page_height: Height of the page the box was measured on.
        target_width: Width the page is mapped onto when rescaling.
        target_height: Height the page is mapped onto when rescaling.

    Returns:
        A 4-tuple ``(x1, y1, x2, y2)``. The values are floats when the box
        was rescaled, and the original values otherwise.

    Raises:
        ValueError: If ``bbox`` does not unpack into exactly four values.
        ZeroDivisionError: If rescaling is triggered and one of the page
            dimensions is zero.
    """
    x1, y1, x2, y2 = bbox

    # Right now this is executed only for "large" PDFs, i.e. pages that
    # exceed the target size along at least one axis.
    # TODO: Change it for all PDFs
    if page_width > target_width or page_height > target_height:
        # Both axes are rescaled as soon as either one is too large, so the
        # whole page is mapped onto (target_width, target_height).
        x1 = float(x1) / page_width * target_width
        x2 = float(x2) / page_width * target_width
        y1 = float(y1) / page_height * target_height
        y2 = float(y2) / page_height * target_height

    return (x1, y1, x2, y2)


def shift_index_sequence_to_zero_start(sequence):
"""
Shift a sequence to start at 0.
Expand Down Expand Up @@ -57,9 +82,9 @@ def convert_document_page_to_pdf_dict(
# TODO: Right now we assume the token could only have a single span.

bbox = [
token.spans[0].box.get_absolute(
page_width=page_width, page_height=page_height
).coordinates
token.spans[0]
.box.get_absolute(page_width=page_width, page_height=page_height)
.coordinates
for token in document.tokens
]

Expand All @@ -83,7 +108,6 @@ def convert_document_page_to_pdf_dict(
}



def convert_sequence_tagging_to_spans(
token_prediction_sequence: List,
) -> List[Tuple[int, int, int]]:
Expand All @@ -103,4 +127,4 @@ def convert_sequence_tagging_to_spans(
cur_len = len(list(seq))
spans.append((prev_len, prev_len + cur_len, gp))
prev_len = prev_len + cur_len
return spans
return spans
30 changes: 26 additions & 4 deletions mmda/predictors/hf_predictors/vila_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,16 @@
from mmda.predictors.hf_predictors.utils import (
convert_document_page_to_pdf_dict,
convert_sequence_tagging_to_spans,
normalize_bbox,
)
from mmda.predictors.hf_predictors.base_hf_predictor import BaseHFPredictor

# Target page size, passed to normalize_bbox as target_width and
# target_height when rescaling bounding boxes for inputs to the model.
# TODO: Move this to somewhere else.
MAX_PAGE_WIDTH = 1000
MAX_PAGE_HEIGHT = 1000


def columns_used_in_model_inputs(model):
signature = inspect.signature(model.forward)
Expand Down Expand Up @@ -98,7 +105,9 @@ def initialize_preprocessor(tokenizer, config):
# vila module.
pass

def preprocess(self, pdf_dict: Dict[str, List[Any]]):
def preprocess(
self, pdf_dict: Dict[str, List[Any]], page_width: int, page_height: int
) -> Dict[str, List[Any]]:
_labels = pdf_dict.get("labels")
pdf_dict["labels"] = [0] * len(pdf_dict["words"])
# because the preprocess_sample requires the labels to be
Expand All @@ -107,6 +116,19 @@ def preprocess(self, pdf_dict: Dict[str, List[Any]]):
# and we will change them back to the original labels later.

model_inputs = self.preprocessor.preprocess_sample(pdf_dict)
model_inputs["bbox"] = [
[
normalize_bbox(
bbox,
page_width,
page_height,
target_width=MAX_PAGE_WIDTH,
target_height=MAX_PAGE_HEIGHT,
)
for bbox in batch
]
for batch in model_inputs["bbox"]
]
pdf_dict["labels"] = _labels
return model_inputs

Expand Down Expand Up @@ -152,12 +174,12 @@ def predict(self, document: Document) -> List[Annotation]:
page_width, page_height = document.images[page_id].size

pdf_dict = convert_document_page_to_pdf_dict(
page, page_width=page_width,page_height=page_height
page, page_width=page_width, page_height=page_height
)
# VILA models trained based on absolute page width rather than the
# size (1000, 1000) in vanilla LayoutLM models

model_inputs = self.preprocess(pdf_dict)
model_inputs = self.preprocess(pdf_dict, page_width, page_height)
model_outputs = self.model(**self.model_input_collator(model_inputs))
model_predictions = self.get_category_prediction(model_outputs)
page_prediction_results.extend(
Expand Down Expand Up @@ -278,4 +300,4 @@ def get_true_token_level_category_prediction(

preds = [ele[0] for ele in flatten_predictions]

return preds
return preds
7 changes: 5 additions & 2 deletions pipeline/run_local.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
import pathlib
import sys

from mmda.parsers.pdfplumber_parser import PDFPlumberParser
from mmda.predictors.heuristic_predictors.dictionary_word_predictor import DictionaryWordPredictor
from mmda.predictors.lp_predictors import LayoutParserPredictor
from mmda.predictors.hf_predictors.vila_predictor import IVILAPredictor
from mmda.rasterizers.rasterizer import PDF2ImageRasterizer

pdf_file = "/home/ubuntu/git/VILA/2306c568f2d3dfec6762ccb9fb16e63e173a.pdf"
pdf_file = pathlib.Path(sys.argv[1]).resolve()
print(f"reading pdf from from {pdf_file}")

pdf_plumber = PDFPlumberParser()
rasterizer = PDF2ImageRasterizer()

doc = pdf_plumber.parse(pdf_file)
doc.annotate_images(rasterizer.rasterize(pdf_file, dpi=72))

lp_predictor1 = LayoutParserPredictor.from_pretrained("lp://efficientdet/PubLayNet")
lp_predictor2 = LayoutParserPredictor.from_pretrained("lp://efficientdet/MFD")
blocks = lp_predictor1.predict(doc) + lp_predictor2.predict(doc)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"api": ["Flask", "gevent"],
"pipeline": ["requests"],
"lp_predictors": ["layoutparser", "torch", "torchvision", "effdet"],
"vila_predictors": ["vila", "transformers"],
"vila_predictors": ["vila >= 0.3.0", "transformers"],
},
include_package_data=True,
)