allenai · kyleclo · Jul 14, 2021 · Jul 13, 2021 · Jul 14, 2021 · Jul 14, 2021
diff --git a/mmda/parsers/parser.py b/mmda/parsers/parser.py
@@ -6,13 +6,27 @@
 
 """
 
-from typing import Optional, Union
+from typing import Optional, Union, List
+from abc import abstractmethod
+
+from pdf2image import convert_from_path
 
 from mmda.types.document import Document
+from mmda.types.image import Image
+
+
+class BaseParser:
+    """Common interface for PDF parsers, plus shared page-image rendering.
+
+    NOTE(review): this class does not use ``abc.ABCMeta``, so the
+    ``@abstractmethod`` marker below is not enforced and ``BaseParser()`` can
+    still be instantiated directly.
+    """
+
+    @abstractmethod
+    def parse(
+        self, infile: str, outdir: Optional[str] = None, outfname: Optional[str] = None, load_images=True
+    ) -> Union[str, Document]:
+        """This is the main entry point for using the PDF parsers. For a
+        given PDF file, this method will return a Document object.
+
+        Subclasses must override this method. The base implementation raises
+        rather than silently returning None.
+
+        Raises:
+            NotImplementedError: always, when called on the base class.
+        """
+        raise NotImplementedError
 
-class Parser:
-    def parse(self, infile: str, outdir: Optional[str] = None, outfname: Optional[str] = None) -> Union[str, Document]:
-        raise NotImplementedError
+    def load_images(self, infile: str, dpi: int = 72) -> List["PIL.Image"]:
+        """Render the PDF at `infile` to images using pdf2image.
+
+        Args:
+            infile: path of the PDF file, forwarded to `convert_from_path`.
+            dpi: rendering resolution, forwarded to `convert_from_path`.
+
+        Returns:
+            The images returned by `convert_from_path`.
+        """
 
-    def load(self, infile: str) -> Document:
-        raise NotImplementedError
+        # Though 72 is not the default dpi for pdf2image, it's commonly used
+        # by other PDF parsing systems, hence the default above.
+        return convert_from_path(infile, dpi=dpi)
diff --git a/mmda/parsers/symbol_scraper_parser.py b/mmda/parsers/symbol_scraper_parser.py
@@ -11,38 +11,43 @@
 import os
 import json
 import subprocess
+import tempfile
 
 import re
 from collections import defaultdict
 
 from mmda.types.span import Span
-from mmda.types.document import Document, Page, Token, Row, Sent, Block, Text
+from mmda.types.document import Document, Page, Token, Row, Sent, Block, Text, DocImage
 from mmda.types.boundingbox import BoundingBox
-from mmda.parsers.parser import Parser
+from mmda.parsers.parser import BaseParser
 
 
-class SymbolScraperParser(Parser):
+class SymbolScraperParser(BaseParser):
     def __init__(self, sscraper_bin_path: str):
         self.sscraper_bin_path = sscraper_bin_path
 
-    def parse(self, infile: str, outdir: Optional[str] = None, outfname: Optional[str] = None) -> Document:
-        if outdir:
-            if not outfname:
-                raise ValueError(f'Specifying `outdir` requires also specifying `outfname`')
+    def parse(self, infile: str, outdir: Optional[str] = None, outfname: Optional[str] = None, load_images=True) -> Document:
+        """Parse a PDF into a Document via the SymbolScraper binary.
+
+        Args:
+            infile: path to the input PDF.
+            outdir: directory that receives SymbolScraper's output and the JSON
+                dump of the parsed Document. When None, a temporary directory
+                is used for SymbolScraper and no JSON is written.
+            outfname: file name of the JSON dump, joined onto `outdir`.
+                Defaults to the base name of `infile` with its extension
+                replaced by `.json`.
+            load_images: when true, also attach the page images produced by
+                `self.load_images(infile)` to the Document.
+
+        Returns:
+            The parsed Document.
+        """
+        if outdir is None:
+            # The XML is only an intermediate artifact here, so it lives in a
+            # temporary directory that is cleaned up when the `with` exits.
+            with tempfile.TemporaryDirectory() as tmpdir:
+                xmlfile = self._run_sscraper(infile=infile, outdir=tmpdir)
+                doc: Document = self._parse_xml_to_doc(xmlfile=xmlfile)
+        else:
             xmlfile = self._run_sscraper(infile=infile, outdir=outdir)
             doc: Document = self._parse_xml_to_doc(xmlfile=xmlfile)
+
+        if load_images:
+            doc.load(images=self.load_images(infile))
+
+        if outdir is not None:
+            if outfname is None:
+                # Keep this a bare file name: it is joined onto `outdir` just
+                # below, so prefixing `outdir` here would duplicate it for
+                # relative paths.
+                outfname = os.path.splitext(os.path.basename(infile))[0] + '.json'
+
             outfile = os.path.join(outdir, outfname)
             with open(outfile, 'w') as f_out:
                 json.dump(doc.to_json(), f_out, indent=4)
-            return doc
-        else:
-            raise NotImplementedError(f'Sscraper needs somewhere to output temp XML files')
-
-    def load(self, infile: str) -> Document:
-        with open(infile) as f_in:
-            doc_json = json.load(f_in)
-            doc = Document.from_json(doc_json=doc_json)
-            return doc
+        return doc
 
     #
     #   methods for interacting with SymbolScraper binary

diff --git a/mmda/types/document.py b/mmda/types/document.py
@@ -12,6 +12,7 @@
 
 from mmda.types.boundingbox import BoundingBox
 from mmda.types.annotations import Annotation, SpanAnnotation, BoundingBoxAnnotation
+from mmda.types.image import Image
 from mmda.types.span import Span
 
 
@@ -21,13 +22,15 @@
 Row = 'row'
 Sent = 'sent'
 Block = 'block'
+DocImage = 'image'  # Named DocImage rather than Image to avoid conflicting with the PIL Image name
 
 
 class Document:
 
     valid_types = [Page, Token, Row, Sent, Block]
 
     def __init__(self, text: str):
+
         self.text = text
 
         # TODO: if have span_type Map, do still need these?
@@ -36,6 +39,7 @@ def __init__(self, text: str):
         self._rows: List[Span] = []
         self._sents: List[Span] = []
         self._blocks: List[Span] = []
+        self._images: List["PIL.Image"] = []
 
         self._span_type_to_spans: Dict[Type, List[Span]] = {
             Page: self._pages,
@@ -67,6 +71,7 @@ def from_json(cls, doc_json: Dict) -> 'Document':
         rows = []
         sents = []
         blocks = []
+
         for span_type in cls.valid_types:
             if span_type in doc_json:
                 doc_spans = [DocSpan.from_span(span=Span.from_json(span_json=span_json), doc=doc, span_type=span_type)
@@ -83,7 +88,10 @@ def from_json(cls, doc_json: Dict) -> 'Document':
                     blocks = doc_spans
                 else:
                     raise Exception(f'Should never reach here')
-        doc.load(pages=pages, tokens=tokens, rows=rows, sents=sents, blocks=blocks)
+
+        images = [Image.frombase64(image_str) for image_str in doc_json.get(DocImage,[])]
+
+        doc.load(pages=pages, tokens=tokens, rows=rows, sents=sents, blocks=blocks, images=images)
         return doc
 
     # TODO: consider simpler more efficient method (e.g. JSONL; text)
@@ -94,7 +102,8 @@ def to_json(self) -> Dict:
             Token: [token.to_json(exclude=['text', 'type']) for token in self.tokens],
             Row: [row.to_json(exclude=['text', 'type']) for row in self.rows],
             Sent: [sent.to_json(exclude=['text', 'type']) for sent in self.sents],
-            Block: [block.to_json(exclude=['text', 'type']) for block in self.blocks]
+            Block: [block.to_json(exclude=['text', 'type']) for block in self.blocks],
+            DocImage: [image.tobase64() for image in self.images]
         }
 
     #
@@ -134,7 +143,9 @@ def load(self, pages: Optional[List[Span]] = None,
              tokens: Optional[List[Span]] = None,
              rows: Optional[List[Span]] = None,
              sents: Optional[List[Span]] = None,
-             blocks: Optional[List[Span]] = None):
+             blocks: Optional[List[Span]] = None,
+             images: Optional[List["PIL.Image"]] = None):
+
         if pages:
             self._pages = pages
             self._page_index = self._build_span_index(spans=pages)
@@ -150,6 +161,9 @@ def load(self, pages: Optional[List[Span]] = None,
         if blocks:
             self._blocks = blocks
             self._block_index = self._build_span_index(spans=blocks)
+        if images:
+            self._images = images
+
         self._build_span_type_to_spans()
         self._build_span_type_to_index()
 
@@ -176,6 +190,10 @@ def sents(self) -> List[Span]:
     def blocks(self) -> List[Span]:
         return self._blocks
 
+    @property
+    def images(self) -> List["PIL.Image"]:
+        """Images stored on this document via `load(images=...)`."""
+        return self._images
+
     #
     #   methods for using Document
     #

diff --git a/mmda/types/image.py b/mmda/types/image.py
@@ -0,0 +1,32 @@
+"""
+
+Helpers for base64 (de)serialization of PIL images of the pages of a document
+
+@kylel, @shannons
+
+"""
+
+import base64
+from io import BytesIO
+
+from PIL import Image
+
+# Monkey patch the PIL.Image methods to add base64 conversion
+
+def tobase64(self):
+    """Serialize this image as a base64-encoded PNG.
+
+    Returns:
+        str: ASCII base64 text. A `str` (rather than `bytes`) is returned so
+        the value can be written by the `json` module; `base64.b64decode`
+        accepts it unchanged when decoding.
+    """
+    # Ref: https://stackoverflow.com/a/31826470
+    buffered = BytesIO()
+    self.save(buffered, format="PNG")
+    # b64encode yields bytes, which json cannot serialize, so decode to text.
+    return base64.b64encode(buffered.getvalue()).decode("ascii")
+
+def frombase64(img_str):
+    """Decode base64 image data (as produced by `tobase64`) and open it with PIL.
+
+    The name follows the naming style of the original `Image` methods.
+    """
+    raw_bytes = base64.b64decode(img_str)
+    return Image.open(BytesIO(raw_bytes))
+
+Image.Image.tobase64 = tobase64  # Attached to the Image class, so it is a method on individual image objects
+Image.frombase64 = frombase64  # Bound to the PIL Image module; used for loading images from base64
diff --git a/mmda/types/imgs.py b/mmda/types/imgs.py
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
-intervaltree=3.1.0
+intervaltree==3.1.0
+pdf2image==0.1.12
diff --git a/tests/fixtures/1903.10676.pdf b/tests/fixtures/1903.10676.pdf
diff --git a/tests/test_parsers/test_load_PDF_images.py b/tests/test_parsers/test_load_PDF_images.py
@@ -0,0 +1,32 @@
+"""
+
+Tests for PDF Image parser
+
+@kylel
+
+
+"""
+
+import unittest
+
+from PIL import ImageChops
+
+from mmda.parsers.parser import BaseParser
+from mmda.types.image import Image
+
+class TestLoadPDFImages(unittest.TestCase):
+    """Round-trip check for page images rendered from a PDF fixture."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Share one fixture path and one parser across the tests."""
+        cls.pdf_path = "tests/fixtures/1903.10676.pdf"
+        cls.parser = BaseParser()
+
+    def test_load_image(self):
+        """The first page image survives a base64 encode/decode unchanged."""
+        first_page = self.parser.load_images(self.pdf_path)[0]
+
+        assert hasattr(first_page, "tobase64")
+
+        encoded = first_page.tobase64()
+        round_tripped = Image.frombase64(encoded)
+
+        # getbbox() is None when the difference image has no non-zero region.
+        assert ImageChops.difference(round_tripped, first_page).getbbox() is None