allenai · yoganandc · Apr 22, 2022 · Apr 22, 2022 · Apr 22, 2022 · Apr 22, 2022
diff --git a/mmda/predictors/hf_predictors/utils.py b/mmda/predictors/hf_predictors/utils.py
@@ -6,6 +6,31 @@
 from mmda.types.names import *
 
 
+def normalize_bbox(
+    bbox,
+    page_width,
+    page_height,
+    target_width,
+    target_height,
+):
+    """
+    Rescale an (x1, y1, x2, y2) box from the page size to the target size.
+
+    Boxes on pages that already fit within the target are returned unscaled.
+    """
+    left, top, right, bottom = bbox
+
+    # For now, only pages exceeding the target in either dimension are rescaled
+    # TODO: Change it for all PDFs
+    if not (page_width > target_width or page_height > target_height):
+        return (left, top, right, bottom)
+
+    # Horizontal coordinates scale with the width, vertical ones with the height
+    xs = [float(v) / page_width * target_width for v in (left, right)]
+    ys = [float(v) / page_height * target_height for v in (top, bottom)]
+    return (xs[0], ys[0], xs[1], ys[1])
+
+
 def shift_index_sequence_to_zero_start(sequence):
     """
     Shift a sequence to start at 0.
@@ -57,9 +82,9 @@ def convert_document_page_to_pdf_dict(
     # TODO: Right now we assume the token could only have a single span.
 
     bbox = [
-        token.spans[0].box.get_absolute(
-            page_width=page_width, page_height=page_height
-        ).coordinates
+        token.spans[0]
+        .box.get_absolute(page_width=page_width, page_height=page_height)
+        .coordinates
         for token in document.tokens
     ]
 
@@ -83,7 +108,6 @@ def convert_document_page_to_pdf_dict(
     }
 
 
-
 def convert_sequence_tagging_to_spans(
     token_prediction_sequence: List,
 ) -> List[Tuple[int, int, int]]:
@@ -103,4 +127,4 @@ def convert_sequence_tagging_to_spans(
         cur_len = len(list(seq))
         spans.append((prev_len, prev_len + cur_len, gp))
         prev_len = prev_len + cur_len
-    return spans
+    return spans
diff --git a/mmda/predictors/hf_predictors/vila_predictor.py b/mmda/predictors/hf_predictors/vila_predictor.py
@@ -23,9 +23,16 @@
 from mmda.predictors.hf_predictors.utils import (
     convert_document_page_to_pdf_dict,
     convert_sequence_tagging_to_spans,
+    normalize_bbox,
 )
 from mmda.predictors.hf_predictors.base_hf_predictor import BaseHFPredictor
 
+# Two constants constraining the size of the page for
+# inputs to the model.
+# TODO: Move this to somewhere else.
+MAX_PAGE_WIDTH = 1000
+MAX_PAGE_HEIGHT = 1000
+
 
 def columns_used_in_model_inputs(model):
     signature = inspect.signature(model.forward)
@@ -98,7 +105,9 @@ def initialize_preprocessor(tokenizer, config):
         # vila module.
         pass
 
-    def preprocess(self, pdf_dict: Dict[str, List[Any]]):
+    def preprocess(
+        self, pdf_dict: Dict[str, List[Any]], page_width: int, page_height: int
+    ) -> Dict[str, List[Any]]:
         _labels = pdf_dict.get("labels")
         pdf_dict["labels"] = [0] * len(pdf_dict["words"])
         # because the preprocess_sample requires the labels to be
@@ -107,6 +116,19 @@ def preprocess(self, pdf_dict: Dict[str, List[Any]]):
         # and we will change them back to the original labels later.
 
         model_inputs = self.preprocessor.preprocess_sample(pdf_dict)
+        model_inputs["bbox"] = [
+            [
+                normalize_bbox(
+                    bbox,
+                    page_width,
+                    page_height,
+                    target_width=MAX_PAGE_WIDTH,
+                    target_height=MAX_PAGE_HEIGHT,
+                )
+                for bbox in batch
+            ]
+            for batch in model_inputs["bbox"]
+        ]
         pdf_dict["labels"] = _labels
         return model_inputs
 
@@ -152,12 +174,12 @@ def predict(self, document: Document) -> List[Annotation]:
                 page_width, page_height = document.images[page_id].size
 
                 pdf_dict = convert_document_page_to_pdf_dict(
-                    page, page_width=page_width,page_height=page_height
+                    page, page_width=page_width, page_height=page_height
                 )
                 # VILA models trained based on absolute page width rather than the
                 # size (1000, 1000) in vanilla LayoutLM models
 
-                model_inputs = self.preprocess(pdf_dict)
+                model_inputs = self.preprocess(pdf_dict, page_width, page_height)
                 model_outputs = self.model(**self.model_input_collator(model_inputs))
                 model_predictions = self.get_category_prediction(model_outputs)
                 page_prediction_results.extend(
@@ -278,4 +300,4 @@ def get_true_token_level_category_prediction(
 
         preds = [ele[0] for ele in flatten_predictions]
 
-        return preds
+        return preds
diff --git a/pipeline/run_local.py b/pipeline/run_local.py
@@ -1,18 +1,21 @@
+import pathlib
+import sys
+
 from mmda.parsers.pdfplumber_parser import PDFPlumberParser
 from mmda.predictors.heuristic_predictors.dictionary_word_predictor import DictionaryWordPredictor
 from mmda.predictors.lp_predictors import LayoutParserPredictor
 from mmda.predictors.hf_predictors.vila_predictor import IVILAPredictor
 from mmda.rasterizers.rasterizer import PDF2ImageRasterizer
 
-pdf_file = "/home/ubuntu/git/VILA/2306c568f2d3dfec6762ccb9fb16e63e173a.pdf"
+pdf_file = pathlib.Path(sys.argv[1]).resolve()
 print(f"reading pdf from from {pdf_file}")
 
 pdf_plumber = PDFPlumberParser()
 rasterizer = PDF2ImageRasterizer()
 
 doc = pdf_plumber.parse(pdf_file)
 doc.annotate_images(rasterizer.rasterize(pdf_file, dpi=72))
-    
+
 lp_predictor1 = LayoutParserPredictor.from_pretrained("lp://efficientdet/PubLayNet")
 lp_predictor2 = LayoutParserPredictor.from_pretrained("lp://efficientdet/MFD")
 blocks = lp_predictor1.predict(doc) + lp_predictor2.predict(doc)

diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
         "api": ["Flask", "gevent"],
         "pipeline": ["requests"],
         "lp_predictors": ["layoutparser", "torch", "torchvision", "effdet"],
-        "vila_predictors": ["vila", "transformers"],
+        "vila_predictors": ["vila >= 0.3.0", "transformers"],
     },
     include_package_data=True,
 )