diff --git a/CHANGELOG b/CHANGELOG index 829d1041..d40440c9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,10 @@ Changelog ========= +* 0.3.13 + * Significant performance gains for documents with a large number of table + cells. + * Significant performance gains for large documents. * 0.3.12 * Added command line support to convert from docx to either html or markdown. diff --git a/README.rst b/README.rst index fe21f717..6c41ad8c 100644 --- a/README.rst +++ b/README.rst @@ -185,16 +185,16 @@ When creating your own Parser (as described above) you can now add in your own c :: class Docx2Foo(DocxParser): - pre_processor_class = FooPrePorcessor + pre_processor_class = FooPreProcessor -The `FooPrePorcessor` will need a few things to get you going: +The `FooPreProcessor` will need a few things to get you going: :: - class FooPrePorcessor(PydocxPrePorcessor): + class FooPreProcessor(PydocxPreProcessor): def perform_pre_processing(self, root, *args, **kwargs): - super(FooPrePorcessor, self).perform_pre_processing(root, *args, **kwargs) + super(FooPreProcessor, self).perform_pre_processing(root, *args, **kwargs) self._set_foo(root) def _set_foo(self, root): diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py index 5d618ecd..3e24f98f 100644 --- a/pydocx/DocxParser.py +++ b/pydocx/DocxParser.py @@ -6,13 +6,14 @@ from contextlib import contextmanager from pydocx.utils import ( - PydocxPrePorcessor, - get_list_style, - parse_xml_from_string, - find_first, + MulitMemoizeMixin, + PydocxPreProcessor, find_all, find_ancestor_with_tag, + find_first, + get_list_style, has_descendant_with_tag, + parse_xml_from_string, ) from pydocx.exceptions import MalformedDocxException @@ -46,9 +47,9 @@ def ZipFile(path): # This is not needed in python 3.2+ f.close() -class DocxParser: +class DocxParser(MulitMemoizeMixin): __metaclass__ = ABCMeta - pre_processor_class = PydocxPrePorcessor + pre_processor_class = PydocxPreProcessor def _extract_xml(self, f, xml_path): try: @@ -161,13 +162,19 
@@ def __init__( #all blank when we init self.comment_store = None - self.visited = [] + self.visited = set() self.list_depth = 0 self.rels_dict = self._parse_rels_root() self.styles_dict = self._parse_styles() self.parse_begin(self.root) # begin to parse def parse_begin(self, el): + self.populate_memoization({ + 'find_all': find_all, + 'find_first': find_first, + 'has_descendant_with_tag': has_descendant_with_tag, + '_get_tcs_in_column': self._get_tcs_in_column, + }) self.pre_processor = self.pre_processor_class( convert_root_level_upper_roman=self.convert_root_level_upper_roman, styles_dict=self.styles_dict, @@ -179,7 +186,7 @@ def parse_begin(self, el): def parse(self, el): if el in self.visited: return '' - self.visited.append(el) + self.visited.add(el) parsed = '' for child in el: # recursive. So you can get all the way to the bottom @@ -417,7 +424,7 @@ def _should_append_break_tag(self, next_el): if self.pre_processor.previous(next_el) is None: return False tag_is_inline_like = any( - has_descendant_with_tag(next_el, tag) for + self.memod_tree_op('has_descendant_with_tag', next_el, tag) for tag in inline_like_tags ) if tag_is_inline_like: @@ -478,7 +485,20 @@ def _should_parse_next_as_content(el): # Create the actual li element return self.list_element(parsed) + def _get_tcs_in_column(self, tbl, column_index): + return [ + tc for tc in self.memod_tree_op('find_all', tbl, 'tc') + if self.pre_processor.column_index(tc) == column_index + ] + def _get_rowspan(self, el, v_merge): + restart_in_v_merge = False + if v_merge is not None and 'val' in v_merge.attrib: + restart_in_v_merge = 'restart' in v_merge.attrib['val'] + + if not restart_in_v_merge: + return '' + current_row = self.pre_processor.row_index(el) current_col = self.pre_processor.column_index(el) rowspan = 1 @@ -488,24 +508,20 @@ def _get_rowspan(self, el, v_merge): # than the current_row and that are on the current_col if tbl is None: return '' + tcs = [ - tc for tc in find_all(tbl, 'tc') - if 
self.pre_processor.row_index(tc) >= current_row and - self.pre_processor.column_index(tc) == current_col + tc for tc in self.memod_tree_op( + '_get_tcs_in_column', tbl, current_col, + ) if self.pre_processor.row_index(tc) >= current_row ] - restart_in_v_merge = False - if v_merge is not None and 'val' in v_merge.attrib: - restart_in_v_merge = 'restart' in v_merge.attrib['val'] - def increment_rowspan(tc): - if not restart_in_v_merge: - return False + def should_increment_rowspan(tc): if not self.pre_processor.vmerge_continue(tc): return False return True for tc in tcs: - if increment_rowspan(tc): + if should_increment_rowspan(tc): rowspan += 1 else: rowspan = 1 @@ -517,7 +533,7 @@ def get_colspan(self, el): grid_span = find_first(el, 'gridSpan') if grid_span is None: return '' - return find_first(el, 'gridSpan').attrib['val'] + return grid_span.attrib['val'] def parse_table_cell_contents(self, el, text): parsed = text @@ -640,7 +656,7 @@ def parse_r(self, el, parsed): # Get the rPr for the current style, they are the defaults. p = find_ancestor_with_tag(self.pre_processor, el, 'p') - paragraph_style = find_first(p, 'pStyle') + paragraph_style = self.memod_tree_op('find_first', p, 'pStyle') if paragraph_style is not None: style = paragraph_style.get('val') style_defaults = self.styles_dict.get(style, {}) diff --git a/pydocx/utils.py b/pydocx/utils.py index fabe7863..1323302b 100644 --- a/pydocx/utils.py +++ b/pydocx/utils.py @@ -1,4 +1,5 @@ import re +import collections from collections import defaultdict from xml.etree import cElementTree @@ -21,6 +22,43 @@ ) +class MulitMemoize(object): + ''' + Adapted from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize + func_names = { + 'find_all': find_all, + ... + } + ''' + def __init__(self, func_names): + self.cache = dict((func_name, {}) for func_name in func_names) + self.func_names = func_names + + def __call__(self, func_name, *args): + try:
+ return self.cache[func_name][args] + except KeyError: + pass # cache miss: the value is computed and stored below. + except TypeError: + # uncacheable (e.g. args holds a list); call through uncached. + return self.func_names[func_name](*args) + value = self.func_names[func_name](*args) + self.cache[func_name][args] = value + return value + + +class MulitMemoizeMixin(object): + def __init__(self, *args, **kwargs): + super(MulitMemoizeMixin, self).__init__(*args, **kwargs) + self._memoization = None + + def memod_tree_op(self, func_name, *args): + return self._memoization(func_name, *args) + + def populate_memoization(self, func_names): + self._memoization = MulitMemoize(func_names) + + def el_iter(el): """ Go through all elements @@ -61,7 +99,7 @@ def has_descendant_with_tag(el, tag): Determine if there is a child ahead in the element tree. """ # Get child. stop at first child. - return True if el.find('.//' + tag) is not None else False + return find_first(el, tag) is not None def _filter_children(element, tags): @@ -154,7 +192,7 @@ def num_id(self): return self._num_id -class PydocxPrePorcessor(object): +class PydocxPreProcessor(MulitMemoizeMixin): def __init__( self, convert_root_level_upper_roman=False, @@ -167,6 +205,9 @@ def __init__( self.numbering_root = numbering_root def perform_pre_processing(self, root, *args, **kwargs): + self.populate_memoization({ + 'find_first': find_first, + }) self._add_parent(root) # If we don't have a numbering root there cannot be any lists. if self.numbering_root is not None: @@ -251,14 +292,12 @@ def _set_list_attributes(self, el): # Deleted text in a list will have a numId but no ilvl.
if parent is None: continue - if find_first(parent, 'ilvl') is None: + parent_ilvl = self.memod_tree_op('find_first', parent, 'ilvl') + if parent_ilvl is None: continue self.meta_data[parent]['is_list_item'] = True self.meta_data[parent]['num_id'] = self._generate_num_id(parent) - self.meta_data[parent]['ilvl'] = find_first( - parent, - 'ilvl', - ).attrib['val'] + self.meta_data[parent]['ilvl'] = parent_ilvl.attrib['val'] def _generate_num_id(self, el): ''' @@ -364,9 +403,10 @@ def _set_headers(self, elements): for element in elements: # This element is using the default style which is not a heading. - if find_first(element, 'pStyle') is None: + p_style = find_first(element, 'pStyle') + if p_style is None: continue - style = find_first(element, 'pStyle').attrib.get('val', '') + style = p_style.attrib.get('val', '') metadata = self.styles_dict.get(style, {}) style_name = metadata.get('style_name') @@ -389,6 +429,7 @@ def _convert_upper_roman(self, body): if self.is_first_list_item(el) ] visited_num_ids = [] + all_p_tags_in_body = find_all(body, 'p') for root_list_item in first_root_list_items: if self.num_id(root_list_item) in visited_num_ids: continue @@ -401,11 +442,11 @@ def _convert_upper_roman(self, body): if lst_style != 'upperRoman': continue ilvl = min( - self.ilvl(el) for el in find_all(body, 'p') + self.ilvl(el) for el in all_p_tags_in_body if self.num_id(el) == self.num_id(root_list_item) ) root_upper_roman_list_items = [ - el for el in find_all(body, 'p') + el for el in all_p_tags_in_body if self.num_id(el) == self.num_id(root_list_item) and self.ilvl(el) == ilvl ]