Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions mmda/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,27 @@

"""

from typing import Optional, Union
from typing import Optional, Union, List
from abc import abstractmethod

from pdf2image import convert_from_path

from mmda.types.document import Document
from mmda.types.image import Image


class BaseParser:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why the rename?


@abstractmethod
def parse(
self, infile: str, outdir: Optional[str] = None, outfname: Optional[str] = None, load_images=True
) -> Union[str, Document]:
"""This is the main entrance point for using the PDF parsers. For a
given PDF file, this method will return a Document object.
"""

class Parser:
def parse(self, infile: str, outdir: Optional[str] = None, outfname: Optional[str] = None) -> Union[str, Document]:
raise NotImplementedError
def load_images(self, infile: str) -> List["PIL.Image"]:

def load(self, infile: str) -> Document:
raise NotImplementedError
images = convert_from_path(infile, dpi=72)
# Although 72 is not the default dpi for pdf2image, it is commonly used by other PDF parsing systems.
return images
37 changes: 21 additions & 16 deletions mmda/parsers/symbol_scraper_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,38 +11,43 @@
import os
import json
import subprocess
import tempfile

import re
from collections import defaultdict

from mmda.types.span import Span
from mmda.types.document import Document, Page, Token, Row, Sent, Block, Text
from mmda.types.document import Document, Page, Token, Row, Sent, Block, Text, DocImage
from mmda.types.boundingbox import BoundingBox
from mmda.parsers.parser import Parser
from mmda.parsers.parser import BaseParser


class SymbolScraperParser(Parser):
class SymbolScraperParser(BaseParser):
def __init__(self, sscraper_bin_path: str):
self.sscraper_bin_path = sscraper_bin_path

def parse(self, infile: str, outdir: Optional[str] = None, outfname: Optional[str] = None) -> Document:
if outdir:
if not outfname:
raise ValueError(f'Specifying `outdir` requires also specifying `outfname`')
def parse(self, infile: str, outdir: Optional[str] = None, outfname: Optional[str] = None, load_images=True) -> Document:

if outdir is None:
with tempfile.TemporaryDirectory() as outdir:
xmlfile = self._run_sscraper(infile=infile, outdir=outdir)
doc: Document = self._parse_xml_to_doc(xmlfile=xmlfile)
outdir = None
else:
xmlfile = self._run_sscraper(infile=infile, outdir=outdir)
doc: Document = self._parse_xml_to_doc(xmlfile=xmlfile)

if load_images:
doc.load(images=self.load_images(infile))

if outdir is not None:
if outfname is None:
outfname = os.path.join(outdir, os.path.basename(infile).replace('.pdf', '.json'))

outfile = os.path.join(outdir, outfname)
with open(outfile, 'w') as f_out:
json.dump(doc.to_json(), f_out, indent=4)
return doc
else:
raise NotImplementedError(f'Sscraper needs somewhere to output temp XML files')

def load(self, infile: str) -> Document:
with open(infile) as f_in:
doc_json = json.load(f_in)
doc = Document.from_json(doc_json=doc_json)
return doc
return doc

#
# methods for interacting with SymbolScraper binary
Expand Down
24 changes: 21 additions & 3 deletions mmda/types/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from mmda.types.boundingbox import BoundingBox
from mmda.types.annotations import Annotation, SpanAnnotation, BoundingBoxAnnotation
from mmda.types.image import Image
from mmda.types.span import Span


Expand All @@ -21,13 +22,15 @@
Row = 'row'
Sent = 'sent'
Block = 'block'
DocImage = 'image' # Named DocImage (not Image) to avoid conflicting with the PIL Image naming


class Document:

valid_types = [Page, Token, Row, Sent, Block]

def __init__(self, text: str):

self.text = text

# TODO: if have span_type Map, do still need these?
Expand All @@ -36,6 +39,7 @@ def __init__(self, text: str):
self._rows: List[Span] = []
self._sents: List[Span] = []
self._blocks: List[Span] = []
self._images: List["PIL.Image"] = []

self._span_type_to_spans: Dict[Type, List[Span]] = {
Page: self._pages,
Expand Down Expand Up @@ -67,6 +71,7 @@ def from_json(cls, doc_json: Dict) -> 'Document':
rows = []
sents = []
blocks = []

for span_type in cls.valid_types:
if span_type in doc_json:
doc_spans = [DocSpan.from_span(span=Span.from_json(span_json=span_json), doc=doc, span_type=span_type)
Expand All @@ -83,7 +88,10 @@ def from_json(cls, doc_json: Dict) -> 'Document':
blocks = doc_spans
else:
raise Exception(f'Should never reach here')
doc.load(pages=pages, tokens=tokens, rows=rows, sents=sents, blocks=blocks)

images = [Image.frombase64(image_str) for image_str in doc_json.get(DocImage,[])]

doc.load(pages=pages, tokens=tokens, rows=rows, sents=sents, blocks=blocks, images=images)
return doc

# TODO: consider simpler more efficient method (e.g. JSONL; text)
Expand All @@ -94,7 +102,8 @@ def to_json(self) -> Dict:
Token: [token.to_json(exclude=['text', 'type']) for token in self.tokens],
Row: [row.to_json(exclude=['text', 'type']) for row in self.rows],
Sent: [sent.to_json(exclude=['text', 'type']) for sent in self.sents],
Block: [block.to_json(exclude=['text', 'type']) for block in self.blocks]
Block: [block.to_json(exclude=['text', 'type']) for block in self.blocks],
DocImage: [image.tobase64() for image in self.images]
}

#
Expand Down Expand Up @@ -134,7 +143,9 @@ def load(self, pages: Optional[List[Span]] = None,
tokens: Optional[List[Span]] = None,
rows: Optional[List[Span]] = None,
sents: Optional[List[Span]] = None,
blocks: Optional[List[Span]] = None):
blocks: Optional[List[Span]] = None,
images: Optional[List["PIL.Image"]] = None):

if pages:
self._pages = pages
self._page_index = self._build_span_index(spans=pages)
Expand All @@ -150,6 +161,9 @@ def load(self, pages: Optional[List[Span]] = None,
if blocks:
self._blocks = blocks
self._block_index = self._build_span_index(spans=blocks)
if images:
self._images = images

self._build_span_type_to_spans()
self._build_span_type_to_index()

Expand All @@ -176,6 +190,10 @@ def sents(self) -> List[Span]:
def blocks(self) -> List[Span]:
return self._blocks

@property
def images(self) -> List["PIL.Image"]:
return self._images

#
# methods for using Document
#
Expand Down
32 changes: 32 additions & 0 deletions mmda/types/image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""

Dataclass for doing stuff on images of pages of a document

@kylel, @shannons

"""

import base64
from io import BytesIO

from PIL import Image

# Monkey patch the PIL.Image methods to add base64 conversion

def tobase64(self):
    """Serialize this image to a base64-encoded PNG string.

    The image is saved as PNG into an in-memory buffer and the resulting
    bytes are base64-encoded. ``base64.b64encode`` returns ``bytes``, which
    ``json.dump`` cannot serialize, so the value is decoded to ``str``
    before being returned. ``base64.b64decode`` accepts both ``str`` and
    ``bytes``, so decoding the returned value works unchanged.

    NOTE(review): a reviewer asked whether two forms should be supported --
    base64 and a proper image file that one can download & view. Only the
    base64 form is implemented here.

    Returns:
        str: the base64 text of the PNG-encoded image.
    """
    # Ref: https://stackoverflow.com/a/31826470
    buffered = BytesIO()
    self.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

def frombase64(img_str):
    """Build a PIL image from base64-encoded image data.

    The name follows the same style as the original ``Image`` methods.

    Args:
        img_str: base64-encoded image data, as passed to ``base64.b64decode``.

    Returns:
        The image object returned by ``Image.open`` on the decoded bytes.
    """
    raw_bytes = base64.b64decode(img_str)
    return Image.open(BytesIO(raw_bytes))

Image.Image.tobase64 = tobase64 # This method is attached to the Image.Image class, so it is available on individual image instances
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prefer we define our own Image class, inherit from PIL's Image class, and override the methods.

Image.frombase64 = frombase64 # This is bound to the module and is used for loading images
8 changes: 0 additions & 8 deletions mmda/types/imgs.py

This file was deleted.

3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
intervaltree==3.1.0
intervaltree==3.1.0
pdf2image==0.1.12
Binary file added tests/fixtures/1903.10676.pdf
Binary file not shown.
32 changes: 32 additions & 0 deletions tests/test_parsers/test_load_PDF_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""

Tests for PDF Image parser

@kylel


"""

import unittest

from PIL import ImageChops

from mmda.parsers.parser import BaseParser
from mmda.types.image import Image

class TestLoadPDFImages(unittest.TestCase):
    """Check that page images loaded from a PDF survive a base64 round trip."""

    @classmethod
    def setUpClass(cls):
        # Fixture path is relative to the repository root.
        cls.pdf_path = "tests/fixtures/1903.10676.pdf"
        cls.parser = BaseParser()

    def test_load_image(self):
        """The first page image must be identical after tobase64 -> frombase64."""
        images = self.parser.load_images(self.pdf_path)

        # Fail with a clear message instead of an IndexError on images[0].
        self.assertTrue(images, "no page images were loaded from the PDF")
        first_page = images[0]

        # TestCase assertions are used instead of bare `assert` so the checks
        # still run under `python -O` and report a useful message on failure.
        self.assertTrue(
            hasattr(first_page, "tobase64"),
            "loaded image is missing the tobase64 method",
        )

        recovered_image = Image.frombase64(first_page.tobase64())
        diff = ImageChops.difference(recovered_image, first_page)

        # getbbox() returns None when the difference image has no non-zero pixels.
        self.assertIsNone(diff.getbbox())