diff --git a/.gitignore b/.gitignore index ce7a7cef..40efeefd 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,6 @@ pip-log.txt nosetests.xml *.mo .idea + +test.html +testxml.html diff --git a/.travis.yml b/.travis.yml index 6a5babb4..4251ba15 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,13 @@ language: python python: - "2.6" - "2.7" -script: python main.py +script: ./run_tests.sh install: + - python setup.py -q install - pip install -r requirements.txt +env: + - TRAVIS_EXECUTE_PERFORMANCE=1 notifications: email: - jason.louard.ward@gmail.com + - samson91787@gmail.com diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000..33954f41 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,2 @@ +Sam Portnow +Jason Ward diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 00000000..a3c57d6f --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,29 @@ + +Changelog +========= +* 0.3.2 + * We were not taking into account that vertical merges should have a + continue attribute, but sometimes they do not, and in those cases word + assumes the continue attribute. We updated the parser to handle the + cases in which the continue attribute is not there. +* 0.3.1 + * Added support for several more OOXML tags including: + * caps + * smallCaps + * strike + * dstrike + * vanish + * webHidden + More details in the README. +* 0.3.0 + * We switched from using stock *xml.etree.ElementTree* to using + *xml.etree.cElementTree*. This has resulted in a fairly significant speed + increase for python 2.6 + * It is now possible to create your own pre processor to do additional pre + processing. + * Superscripts and subscripts are now extracted correctly. +* 0.2.1 + * Added a changelog + * Added the version in pydocx.__init__ + * Fixed an issue with duplicating content if there was indentation or + justification on a p element that had multiple t tags. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..88fbbf67 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,7 @@ +include AUTHORS +include CHANGELOG +include LICENSE +include MANIFEST.in +include README.rst +include pydocx/fixtures/* +include pydocx/tests/templates/* diff --git a/README.md b/README.md deleted file mode 100644 index e3773551..00000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -pydocx -====== \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 00000000..2f750299 --- /dev/null +++ b/README.rst @@ -0,0 +1,228 @@ +====== +pydocx +====== +.. image:: https://travis-ci.org/OpenScienceFramework/pydocx.png?branch=master + :align: left + :target: https://travis-ci.org/OpenScienceFramework/pydocx + +pydocx is a parser that breaks down the elements of a docx file and converts them +into different markup languages. Right now, HTML is supported. Markdown and LaTeX +will be available soon. You can extend any of the available parsers to customize it +to your needs. You can also create your own class that inherits from DocxParser +to create your own methods for a markup language not yet supported. + +Currently Supported +################### + +* tables + * nested tables + * rowspans + * colspans + * lists in tables +* lists + * list styles + * nested lists + * list of tables + * list of paragraphs +* justification +* images +* styles + * bold + * italics + * underline + * hyperlinks +* headings + +Usage +##### + +DocxParser includes abstract methods that each parser overrides to satisfy its own needs. 
The abstract methods are as follows: + +:: + + class DocxParser: + + @property + def parsed(self): + return self._parsed + + @property + def escape(self, text): + return text + + @abstractmethod + def linebreak(self): + return '' + + @abstractmethod + def paragraph(self, text): + return text + + @abstractmethod + def heading(self, text, heading_level): + return text + + @abstractmethod + def insertion(self, text, author, date): + return text + + @abstractmethod + def hyperlink(self, text, href): + return text + + @abstractmethod + def image_handler(self, path): + return path + + @abstractmethod + def image(self, path, x, y): + return self.image_handler(path) + + @abstractmethod + def deletion(self, text, author, date): + return text + + @abstractmethod + def bold(self, text): + return text + + @abstractmethod + def italics(self, text): + return text + + @abstractmethod + def underline(self, text): + return text + + @abstractmethod + def superscript(self, text): + return text + + @abstractmethod + def subscript(self, text): + return text + + @abstractmethod + def tab(self): + return True + + @abstractmethod + def ordered_list(self, text): + return text + + @abstractmethod + def unordered_list(self, text): + return text + + @abstractmethod + def list_element(self, text): + return text + + @abstractmethod + def table(self, text): + return text + @abstractmethod + def table_row(self, text): + return text + + @abstractmethod + def table_cell(self, text): + return text + + @abstractmethod + def page_break(self): + return True + + @abstractmethod + def indent(self, text, left='', right='', firstLine=''): + return text + +Docx2Html inherits DocxParser and implements basic HTML handling. Ex. + +:: + + class Docx2Html(DocxParser): + + # Escape '&', '<', and '>' so we render the HTML correctly + def escape(self, text): + return xml.sax.saxutils.quoteattr(text)[1:-1] + + # return a line break + def linebreak(self, pre=None): + return '
<br />' + + # add paragraph tags + def paragraph(self, text, pre=None): + return '<p>' + text + '</p>' + + +However, let's say you want to add a specific style to your HTML document. In order to do this, you want to give each paragraph the class `my_implementation`. Simply extend Docx2Html and add what you need. + +:: + + class My_Implementation_of_Docx2Html(Docx2Html): + + def paragraph(self, text, pre = None): + return '<p class="my_implementation">' + text + '</p>' + + + +OR, let's say FOO is your new favorite markup language. Simply customize your own new parser, overriding the abstract methods of DocxParser + +:: + + class Docx2Foo(DocxParser): + + # because linebreaks are denoted by '!!!!!!!!!!!!' in the FOO markup language :) + def linebreak(self): + return '!!!!!!!!!!!!' + +Custom Pre-Processor +#################### + +When creating your own Parser (as described above) you can now add in your own custom Pre Processor. To do so you will need to set the `pre_processor_class` field on the custom parser, like so: + +:: + + class Docx2Foo(DocxParser): + pre_processor_class = FooPrePorcessor + + +The `FooPrePorcessor` will need a few things to get you going: + +:: + + class FooPrePorcessor(PydocxPrePorcessor): + def perform_pre_processing(self, root, *args, **kwargs): + super(FooPrePorcessor, self).perform_pre_processing(root, *args, **kwargs) + self._set_foo(root) + + def _set_foo(self, root): + pass + +If you want `_set_foo` to be called you must add it to `perform_pre_processing` which is called in the base parser for pydocx. + +Everything done during pre-processing is executed prior to `parse` being called for the first time. + + +Styles +###### + +The base parser `Docx2Html` relies on certain CSS classes being set for certain behaviour to occur. Currently these include: + +* class `pydocx-insert` -> Turns the text green. +* class `pydocx-delete` -> Turns the text red and draws a line through the text. +* class `pydocx-center` -> Aligns the text to the center. +* class `pydocx-right` -> Aligns the text to the right. +* class `pydocx-left` -> Aligns the text to the left. +* class `pydocx-comment` -> Turns the text blue. +* class `pydocx-underline` -> Underlines the text. +* class `pydocx-caps` -> Makes all text uppercase. +* class `pydocx-small-caps` -> Makes all text uppercase, however truly lowercase letters will be smaller than their uppercase counterparts. +* class `pydocx-strike` -> Strike a line through. 
+* class `pydocx-hidden` -> Hide the text. + +Optional Arguments +################## + +You can pass in `convert_root_level_upper_roman=True` to the parser and it will convert all root level upper roman lists to headings instead. diff --git a/main.py b/main.py deleted file mode 100644 index c9e8e1d4..00000000 --- a/main.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydocx import * -from bs4 import BeautifulSoup -import xml.etree.ElementTree as ElementTree -#import lxml.etree as etree - -with open('test.html', 'w') as f: - f.write(docx2html('helloworld.docx')) -with open('testxml.html','w') as f: - f.write(BeautifulSoup(ElementTree.tostring(Docx2Html('helloworld.docx').root)).prettify()) - -#print docx2html('helloworld.docx') -#print docx2markdown('helloworld.docx') \ No newline at end of file diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py index b3006ef0..092248f0 100644 --- a/pydocx/DocxParser.py +++ b/pydocx/DocxParser.py @@ -1,323 +1,634 @@ -from abc import abstractmethod, ABCMeta -import zipfile import logging -import xml.etree.ElementTree as ElementTree -from xml.etree.ElementTree import _ElementInterface +import os +import zipfile + +from abc import abstractmethod, ABCMeta +from contextlib import contextmanager + +from pydocx.utils import ( + PydocxPrePorcessor, + get_list_style, + parse_xml_from_string, + find_first, + find_all, + find_ancestor_with_tag, + has_descendant_with_tag, +) logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("NewParser") -def remove_namespaces(document): - root = ElementTree.fromstring(document) - for child in el_iter(root): - child.tag = child.tag.split("}")[1] - child.attrib = dict( - (k.split("}")[1], v) - for k, v in child.attrib.items() - ) - return ElementTree.tostring(root) - -# Add some helper functions to Element to make it slightly more readable - - -def has_child(self, tag): - return True if self.find(tag) is not None else False - - -def has_child_all(self, tag): - return True if self.find('.//' + tag) 
is not None else False - - -def find_all(self, tag): - return self.find('.//' + tag) - - -def findall_all(self, tag): - return self.findall('.//' + tag) +# http://openxmldeveloper.org/discussions/formats/f/15/p/396/933.aspx +EMUS_PER_PIXEL = 9525 +USE_ALIGNMENTS = True +JUSTIFY_CENTER = 'center' +JUSTIFY_LEFT = 'left' +JUSTIFY_RIGHT = 'right' -def el_iter(el): - try: - return el.iter() - except AttributeError: - return el.findall('.//*') +INDENTATION_RIGHT = 'right' +INDENTATION_LEFT = 'left' +INDENTATION_FIRST_LINE = 'firstLine' +# Add some helper functions to Element to make it slightly more readable -setattr(_ElementInterface, 'has_child', has_child) -setattr(_ElementInterface, 'has_child_all', has_child_all) -setattr(_ElementInterface, 'find_all', find_all) -setattr(_ElementInterface, 'findall_all', findall_all) -setattr(_ElementInterface, 'parent', None) -setattr(_ElementInterface, 'parent_list', []) -# End helpers +@contextmanager +def ZipFile(path): # This is not needed in python 3.2+ + f = zipfile.ZipFile(path) + yield f + f.close() class DocxParser: __metaclass__ = ABCMeta + pre_processor_class = PydocxPrePorcessor - def __init__(self, path): - self._parsed = '' - self.in_list = False - - f = zipfile.ZipFile(path) - try: + def _build_data(self, path, *args, **kwargs): + with ZipFile(path) as f: self.document_text = f.read('word/document.xml') + self.styles_text = f.read('word/styles.xml') try: + self.fonts = f.read('/word/fontTable.xml') + except KeyError: + self.fonts = None + try: # Only present if there are lists self.numbering_text = f.read('word/numbering.xml') - except zipfile.BadZipfile: - pass - try: + except KeyError: + self.numbering_text = None + try: # Only present if there are comments self.comment_text = f.read('word/comments.xml') - except zipfile.BadZipfile: - pass - finally: - f.close() - - self.root = ElementTree.fromstring( - remove_namespaces(self.document_text), - ) - - def add_parent(el): - for child in el.getchildren(): - 
setattr(child, 'parent', el) - add_parent(child) - add_parent(self.root) - - def create_parent_list(el, tmp=None): - if tmp is None: - tmp = [] - for child in el: - tmp.append(el) - tmp = create_parent_list(child, tmp) - el.parent_list = tmp[:] - try: - tmp.pop() - except: - tmp = [] - return tmp - - create_parent_list(self.root) - + except KeyError: + self.comment_text = None + self.relationship_text = f.read('word/_rels/document.xml.rels') + zipped_image_files = [ + e for e in f.infolist() + if e.filename.startswith('word/media/') + ] + for e in zipped_image_files: + self._image_data[e.filename] = f.read(e.filename) + + self.root = parse_xml_from_string(self.document_text) + self.numbering_root = None + if self.numbering_text: + self.numbering_root = parse_xml_from_string(self.numbering_text) + self.comment_root = None + if self.comment_text: + self.comment_root = parse_xml_from_string(self.comment_text) + + def _parse_styles(self): + tree = parse_xml_from_string(self.styles_text) + result = {} + for style in find_all(tree, 'style'): + style_val = find_first(style, 'name').attrib['val'] + result[style.attrib['styleId']] = style_val + return result + + def _parse_rels_root(self): + tree = parse_xml_from_string(self.relationship_text) + rels_dict = {} + for el in tree: + rId = el.get('Id') + target = el.get('Target') + rels_dict[rId] = target + return rels_dict + + def __init__( + self, + path, + convert_root_level_upper_roman=False, + *args, + **kwargs): + self._parsed = '' + self.block_text = '' + self.page_width = 0 + self.convert_root_level_upper_roman = convert_root_level_upper_roman + self._image_data = {} + self._build_data(path, *args, **kwargs) + self.pre_processor = None + + #divide by 20 to get to pt (Office works in 20th's of a point) + """ + see http://msdn.microsoft.com/en-us/library/documentformat + .openxml.wordprocessing.indentation.aspx + """ + if find_first(self.root, 'pgSz') is not None: + self.page_width = int( + find_first(self.root, 
'pgSz').attrib['w'] + ) / 20 + + #all blank when we init self.comment_store = None - self.numbering_store = None - self.ignore_current = False - self.elements = [] - self.tables_seen = [] self.visited = [] - try: - self.numbering_root = ElementTree.fromstring( - remove_namespaces(self.numbering_text), - ) - except: - pass - self.parse_begin(self.root) + self.list_depth = 0 + self.rels_dict = self._parse_rels_root() + self.styles_dict = self._parse_styles() + self.parse_begin(self.root) # begin to parse def parse_begin(self, el): - self._parsed += self.parse_lists(el) - -### parse table function and is_table flag - def parse_lists(self, el): - parsed = '' - first_p = el.find_all('p') - children = [] - for child in first_p.parent: - if child.tag == 'p' or child.tag == 'tbl': - children.append(child) - p_list = children - list_started = False - list_type = '' - list_chunks = [] - index_start = 0 - index_end = 1 - for i, el in enumerate(p_list): - if not list_started and el.has_child_all('ilvl'): - list_started = True - list_type = self.get_list_style( - el.find_all('numId').attrib['val'], - ) - list_chunks.append(p_list[index_start:index_end]) - index_start = i - index_end = i+1 - elif ( - list_started and - el.has_child_all('ilvl') and - not list_type == self.get_list_style( - el.find_all('numId').attrib['val'] - )): - list_type = self.get_list_style( - el.find_all('numId').attrib['val'], - ) - list_started = True - list_chunks.append(p_list[index_start:index_end]) - index_start = i - index_end = i+1 - elif list_started and not el.has_child_all('ilvl'): - list_started = False - list_chunks.append(p_list[index_start:index_end]) - index_start = i - index_end = i+1 - else: - index_end = i+1 - list_chunks.append(p_list[index_start:index_end]) - for chunk in list_chunks: - chunk_parsed = '' - for el in chunk: - chunk_parsed += self.parse(el) - if chunk[0].has_child_all('ilvl'): - lst_style = self.get_list_style( - chunk[0].find_all('numId').attrib['val'], - ) - if 
lst_style['val'] == 'bullet': - parsed += self.unordered_list(chunk_parsed) - else: - parsed += self.ordered_list(chunk_parsed) - elif chunk[0].has_child_all('br'): - parsed += self.page_break() - else: - parsed += chunk_parsed - - return parsed + self.pre_processor = self.pre_processor_class( + convert_root_level_upper_roman=self.convert_root_level_upper_roman, + styles_dict=self.styles_dict, + numbering_root=self.numbering_root, + ) + self.pre_processor.perform_pre_processing(el) + self._parsed += self.parse(el) def parse(self, el): + if el in self.visited: + return '' + self.visited.append(el) parsed = '' - if not self.ignore_current: - tmp_d = dict( - (tmpel.tag, i) - for i, tmpel in enumerate(el.parent_list) - ) - if ( - 'tbl' in tmp_d and - el.parent_list[tmp_d['tbl']] not in self.tables_seen): - self.ignore_current = True - self.tables_seen.append(el.parent_list[tmp_d['tbl']]) - tmpout = self.table(self.parse(el.parent_list[tmp_d['tbl']])) - self.ignore_current = False - return tmpout - for child in el: + # recursive. So you can get all the way to the bottom parsed += self.parse(child) - - if el.tag == 'br' and el.attrib['type'] == 'page': - #TODO figure out what parsed is getting overwritten - return self.page_break() - # add it to the list so we don't repeat! 
- if el.tag == 'ilvl' and el not in self.visited: - self.in_list = True - self.visited.append(el) - ## This starts the returns + if el.tag == 'br' and el.attrib.get('type') == 'page': + return self.parse_page_break(el, parsed) + elif el.tag == 'tbl': + return self.parse_table(el, parsed) elif el.tag == 'tr': - return self.table_row(parsed) + return self.parse_table_row(el, parsed) elif el.tag == 'tc': - self.elements.append(el) - return self.table_cell(parsed) - if el.tag == 'r' and el not in self.elements: - self.elements.append(el) - return self.parse_r(el) + return self.parse_table_cell(el, parsed) + elif el.tag == 'r': + return self.parse_r(el, parsed) + elif el.tag == 't': + return self.parse_t(el, parsed) + elif el.tag == 'br': + return self.parse_break_tag(el, parsed) + elif el.tag == 'delText': + return self.parse_deletion(el, parsed) elif el.tag == 'p': return self.parse_p(el, parsed) elif el.tag == 'ins': - return self.insertion(parsed, '', '') + return self.parse_insertion(el, parsed) + elif el.tag == 'hyperlink': + return self.parse_hyperlink(el, parsed) + elif el.tag in ('pict', 'drawing'): + return self.parse_image(el) else: return parsed + def parse_page_break(self, el, text): + #TODO figure out what parsed is getting overwritten + return self.page_break() + + def parse_table(self, el, text): + return self.table(text) + + def parse_table_row(self, el, text): + return self.table_row(text) + + def parse_table_cell(self, el, text): + v_merge = find_first(el, 'vMerge') + if v_merge is not None and ( + 'restart' != v_merge.get('val', '')): + return self.empty_cell() + colspan = self.get_colspan(el) + rowspan = self._get_rowspan(el, v_merge) + if rowspan > 1: + rowspan = str(rowspan) + else: + rowspan = '' + return self.table_cell( + text, colspan, rowspan, self.pre_processor.is_last_row_item(el), + has_descendant_with_tag(el, 'ilvl')) + + def parse_list(self, el, text): + """ + All the meat of building the list is done in _parse_list, however we + call 
this method for two reasons: It is the naming convention we are + following. And we need a reliable way to raise and lower the list_depth + (which is used to determine if we are in a list). I could have done + this in _parse_list, however it seemed cleaner to do it here. + """ + self.list_depth += 1 + parsed = self._parse_list(el, text) + self.list_depth -= 1 + if self.pre_processor.is_in_table(el): + return self.parse_table_cell_contents(el, parsed) + return parsed + + def get_list_style(self, num_id, ilvl): + return get_list_style(self.numbering_root, num_id, ilvl) + + def _build_list(self, el, text): + # Get the list style for the pending list. + lst_style = self.get_list_style( + self.pre_processor.num_id(el).num_id, + self.pre_processor.ilvl(el), + ) + + parsed = text + # Create the actual list and return it. + if lst_style == 'bullet': + return self.unordered_list(parsed) + else: + return self.ordered_list( + parsed, + lst_style, + ) + + def _parse_list(self, el, text): + parsed = self.parse_list_item(el, text) + num_id = self.pre_processor.num_id(el) + ilvl = self.pre_processor.ilvl(el) + # Everything after this point assumes the first element is not also the + # last. If the first element is also the last then early return by + # building and returning the completed list. + if self.pre_processor.is_last_list_item_in_root(el): + return self._build_list(el, parsed) + next_el = self.pre_processor.next(el) + + def is_same_list(next_el, num_id, ilvl): + # Bail if next_el is not an element + if next_el is None: + return False + if self.pre_processor.is_last_list_item_in_root(next_el): + return False + # If next_el is not a list item then roll it into the list by + # returning True. 
+ if not self.pre_processor.is_list_item(next_el): + return True + if self.pre_processor.num_id(next_el) != num_id: + # The next element is a new list entirely + return False + if self.pre_processor.ilvl(next_el) < ilvl: + # The next element is de-indented, so this is really the last + # element in the list + return False + return True + + while is_same_list(next_el, num_id, ilvl): + if next_el in self.visited: + # Early continue for elements we have already visited. + next_el = self.pre_processor.next(next_el) + continue + + if self.pre_processor.is_list_item(next_el): + # Reset the ilvl + ilvl = self.pre_processor.ilvl(next_el) + + parsed += self.parse(next_el) + next_el = self.pre_processor.next(next_el) + + def should_parse_last_el(last_el, first_el): + if last_el is None: + return False + # Different list + if ( + self.pre_processor.num_id(last_el) != + self.pre_processor.num_id(first_el)): + return False + # Will be handled when the ilvls do match (nesting issue) + if ( + self.pre_processor.ilvl(last_el) != + self.pre_processor.ilvl(first_el)): + return False + # We only care about last items that have not been + # parsed before (first list items are + # always parsed at the beginning of this method.) + return ( + not self.pre_processor.is_first_list_item(last_el) and + self.pre_processor.is_last_list_item_in_root(last_el) + ) + if should_parse_last_el(next_el, el): + parsed += self.parse(next_el) + + # If the list has no content, then we don't need to worry about the + # list styling, because it will be stripped out. 
+ if parsed == '': + return parsed + + return self._build_list(el, parsed) + + def justification(self, el, text): + paragraph_tag_property = el.find('pPr') + if paragraph_tag_property is None: + return text + + _justification = paragraph_tag_property.find('jc') + indentation = paragraph_tag_property.find('ind') + if _justification is None and indentation is None: + return text + alignment = None + right = None + left = None + firstLine = None + if _justification is not None: # text alignments + value = _justification.attrib['val'] + if value in [JUSTIFY_LEFT, JUSTIFY_CENTER, JUSTIFY_RIGHT]: + alignment = value + if indentation is not None: + if INDENTATION_RIGHT in indentation.attrib: + right = indentation.attrib[INDENTATION_RIGHT] + # divide by 20 to get to pt. multiply by (4/3) to get to px + right = (int(right) / 20) * float(4) / float(3) + right = str(right) + if INDENTATION_LEFT in indentation.attrib: + left = indentation.attrib[INDENTATION_LEFT] + left = (int(left) / 20) * float(4) / float(3) + left = str(left) + if INDENTATION_FIRST_LINE in indentation.attrib: + firstLine = indentation.attrib[INDENTATION_FIRST_LINE] + firstLine = (int(firstLine) / 20) * float(4) / float(3) + firstLine = str(firstLine) + if any([alignment, firstLine, left, right]): + return self.indent( + text, alignment, firstLine, + left, right, self.pre_processor.is_in_table(el)) + return text + def parse_p(self, el, text): + if text == '': + return '' + # TODO This is still not correct, however it fixes the bug. We need to + # apply the classes/styles on p, td, li and h tags instead of inline, + # but that is for another ticket. 
+ text = self.justification(el, text) + if self.pre_processor.is_first_list_item(el): + return self.parse_list(el, text) + if self.pre_processor.heading_level(el): + return self.parse_heading(el, text) + if self.pre_processor.is_list_item(el): + return self.parse_list_item(el, text) + if self.pre_processor.is_in_table(el): + return self.parse_table_cell_contents(el, text) parsed = text - if self.in_list: - self.in_list = False - parsed = self.list_element(parsed) - elif ( - not el.has_child_all('t') and - 'tbl' not in [i.tag for i in el.parent_list]): - parsed = self.linebreak() - elif el.parent not in self.elements: + # No p tags in li tags + if self.list_depth == 0: parsed = self.paragraph(parsed) return parsed - def parse_r(self, el): - is_deleted = False - text = None - if el.has_child('t'): - text = self.escape(el.find('t').text) - elif el.has_child('delText'): - text = self.escape(el.find('delText').text) - is_deleted = True - if text: - rpr = el.find('rPr') - if rpr is not None: - fns = [] - if rpr.has_child('b'): - fns.append(self.bold) - if rpr.has_child('i'): - fns.append(self.italics) - if rpr.has_child('u'): - fns.append(self.underline) - for fn in fns: - text = fn(text) - ppr = el.parent.find('pPr') - if ppr is not None: - jc = ppr.find('jc') - if jc is not None: - if jc.attrib['val'] == 'right': - text = self.right_justify(text) - if jc.attrib['val'] == 'center': - text = self.center_justify(text) - ind = ppr.find('ind') - if ind is not None: - right = None - left = None - firstLine = None - if 'right' in ind.attrib: - right = ind.attrib['right'] - right = int(right)/20 - right = str(right) - if 'left' in ind.attrib: - left = ind.attrib['left'] - left = int(left)/20 - left = str(left) - if 'firstLine' in ind.attrib: - firstLine = ind.attrib['firstLine'] - firstLine = int(firstLine)/20 - firstLine = str(firstLine) - text = self.indent(text, right, left, firstLine) - if is_deleted: - text = self.deletion(text, '', '') - return text - else: + def 
_should_append_break_tag(self, next_el): + paragraph_like_tags = [ + 'p', + ] + inline_like_tags = [ + 'smartTag', + 'ins', + 'delText', + ] + if self.pre_processor.is_list_item(next_el): + return False + if self.pre_processor.previous(next_el) is None: + return False + tag_is_inline_like = any( + has_descendant_with_tag(next_el, tag) for + tag in inline_like_tags + ) + if tag_is_inline_like: + return False + if ( + self.pre_processor.is_last_list_item_in_root( + self.pre_processor.previous(next_el))): + return False + if self.pre_processor.previous(next_el).tag not in paragraph_like_tags: + return False + if next_el.tag not in paragraph_like_tags: + return False + return True + + def parse_heading(self, el, parsed): + return self.heading(parsed, self.pre_processor.heading_level(el)) + + def parse_list_item(self, el, text): + # If for whatever reason we are not currently in a list, then start + # a list here. This will only happen if the num_id/ilvl combinations + # between lists is not well formed. + parsed = text + if self.list_depth == 0: + return self.parse_list(el, parsed) + + def _should_parse_next_as_content(el): + """ + Get the contents of the next el and append it to the + contents of the current el (that way things like tables + are actually in the li tag instead of in the ol/ul tag). 
+ """ + next_el = self.pre_processor.next(el) + if next_el is None: + return False + if ( + not self.pre_processor.is_list_item(next_el) and + not self.pre_processor.is_last_list_item_in_root(el) + ): + return True + if self.pre_processor.is_first_list_item(next_el): + if ( + self.pre_processor.num_id(next_el) == + self.pre_processor.num_id(el)): + return True + return False + + while el is not None: + if _should_parse_next_as_content(el): + el = self.pre_processor.next(el) + next_elements_content = self.parse(el) + if not next_elements_content: + continue + if self._should_append_break_tag(el): + parsed += self.break_tag( + self.pre_processor.is_in_table(el)) + parsed += next_elements_content + else: + break + # Create the actual li element + return self.list_element(parsed) + + def _get_rowspan(self, el, v_merge): + current_row = self.pre_processor.row_index(el) + current_col = self.pre_processor.column_index(el) + rowspan = 1 + result = '' + tbl = find_ancestor_with_tag(self.pre_processor, el, 'tbl') + # We only want table cells that have a higher row_index that is greater + # than the current_row and that are on the current_col + if tbl is None: + return '' + tcs = [ + tc for tc in find_all(tbl, 'tc') + if self.pre_processor.row_index(tc) >= current_row and + self.pre_processor.column_index(tc) == current_col + ] + restart_in_v_merge = False + if v_merge is not None and 'val' in v_merge.attrib: + restart_in_v_merge = 'restart' in v_merge.attrib['val'] + + def increment_rowspan(tc): + if not restart_in_v_merge: + return False + if not self.pre_processor.vmerge_continue(tc): + return False + return True + + for tc in tcs: + if increment_rowspan(tc): + rowspan += 1 + else: + rowspan = 1 + if rowspan > 1: + result = rowspan + return str(result) + + def get_colspan(self, el): + grid_span = find_first(el, 'gridSpan') + if grid_span is None: return '' + return find_first(el, 'gridSpan').attrib['val'] + + def parse_table_cell_contents(self, el, text): + parsed = text + 
+ def _should_parse_next_as_content(el): + next_el = self.pre_processor.next(el) + if next_el is None: + return False + if self.pre_processor.is_in_table(next_el): + return True + while el is not None: + if _should_parse_next_as_content(el): + el = self.pre_processor.next(el) + next_elements_content = self.parse(el) + if not next_elements_content: + continue + if self._should_append_break_tag(el): + parsed += self.break_tag( + self.pre_processor.is_in_table(el)) + parsed += next_elements_content + else: + break + return parsed - def get_list_style(self, numval): - ids = self.numbering_root.findall_all('num') - for _id in ids: - if _id.attrib['numId'] == numval: - abstractid = _id.find('abstractNumId') - abstractid = abstractid.attrib['val'] - style_information = self.numbering_root.findall_all( - 'abstractNum', - ) - for info in style_information: - if info.attrib['abstractNumId'] == abstractid: - for i in el_iter(info): - if i.find('numFmt') is not None: - return i.find('numFmt').attrib - - def get_comments(self, doc_id): - if self.comment_store is None: - # TODO throw appropriate error - comment_root = ElementTree.fromstring( - remove_namespaces(self.comment_text), + def parse_hyperlink(self, el, text): + rId = el.get('id') + href = self.rels_dict.get(rId) + if not href: + return text + href = self.escape(href) + return self.hyperlink(text, href) + + def _get_image_id(self, el): + # Drawings + blip = find_first(el, 'blip') + if blip is not None: + # On drawing tags the id is actually whatever is returned from the + # embed attribute on the blip tag. Thanks a lot Microsoft. + return blip.get('embed') + # Picts + imagedata = find_first(el, 'imagedata') + if imagedata is not None: + return imagedata.get('id') + + def _convert_image_size(self, size): + return size / EMUS_PER_PIXEL + + def _get_image_size(self, el): + """ + If we can't find a height or width, return 0 for whichever is not + found, then rely on the `image` handler to strip those attributes. 
This + functionality can change once we integrate PIL. + """ + sizes = find_first(el, 'ext') + if sizes is not None and sizes.get('cx'): + if sizes.get('cx'): + x = self._convert_image_size(int(sizes.get('cx'))) + if sizes.get('cy'): + y = self._convert_image_size(int(sizes.get('cy'))) + return ( + '%dpx' % x, + '%dpx' % y, ) - ids_and_info = {} - ids = comment_root.findall_all('comment') - for _id in ids: - ids_and_info[_id.attrib['id']] = { - "author": _id.attrib['author'], - "date": _id.attrib['date'], - "text": _id.findall_all('t')[0].text, - } - self.comment_store = ids_and_info - return self.comment_store[doc_id] + shape = find_first(el, 'shape') + if shape is not None and shape.get('style') is not None: + # If either of these are not set, rely on the method `image` to not + # use either of them. + x = 0 + y = 0 + styles = shape.get('style').split(';') + + for s in styles: + if s.startswith('height:'): + y = s.split(':')[1] + if s.startswith('width:'): + x = s.split(':')[1] + return x, y + return 0, 0 + + def parse_image(self, el): + x, y = self._get_image_size(el) + rId = self._get_image_id(el) + src = self.rels_dict.get(rId) + if not src: + return '' + src = os.path.join( + 'word', + src, + ) + if src in self._image_data: + filename = os.path.split(src)[-1] + return self.image(self._image_data[src], filename, x, y) + return '' + + def _is_style_on(self, el): + """ + For b, i, u (bold, italics, and underline) merely having the tag is not + sufficient. You need to check to make sure it is not set to "false" as + well. + """ + return el.get('val') != 'false' + + def parse_t(self, el, parsed): + return self.escape(el.text) + + def parse_break_tag(self, el, parsed): + return self.break_tag(self.pre_processor.is_in_table(el)) + + def parse_deletion(self, el, parsed): + return self.deletion(el.text, '', '') + + def parse_insertion(self, el, parsed): + return self.insertion(parsed, '', '') + + def parse_r(self, el, parsed): + """ + Parse the running text. 
+ """ + text = parsed + if not text: + return '' + run_tag_property = el.find('rPr') + + def _has_style_on(run_tag_property, tag): + el = run_tag_property.find(tag) + if el is not None: + return self._is_style_on(el) + inline_tags = { + 'b': self.bold, + 'i': self.italics, + 'u': self.underline, + 'caps': self.caps, + 'smallCaps': self.small_caps, + 'strike': self.strike, + 'dstrike': self.strike, + 'vanish': self.hide, + 'webHidden': self.hide, + } + if run_tag_property is not None: + for child in run_tag_property: + # These tags are a little different, handle them separately + # from the rest. + # This could be a superscript or a subscript + if child.tag == 'vertAlign': + if child.attrib['val'] == 'superscript': + text = self.superscript(text) + elif child.attrib['val'] == 'subscript': + text = self.subscript(text) + elif child.tag in inline_tags and self._is_style_on(child): + text = inline_tags[child.tag](text) + + return text @property def parsed(self): @@ -335,10 +646,26 @@ def linebreak(self): def paragraph(self, text): return text + @abstractmethod + def heading(self, text, heading_level): + return text + @abstractmethod def insertion(self, text, author, date): return text + @abstractmethod + def hyperlink(self, text, href): + return text + + @abstractmethod + def image_handler(self, path): + return path + + @abstractmethod + def image(self, data, filename, x, y): + return self.image_handler(data) + @abstractmethod def deletion(self, text, author, date): return text @@ -355,6 +682,30 @@ def italics(self, text): def underline(self, text): return text + @abstractmethod + def caps(self, text): + return text + + @abstractmethod + def small_caps(self, text): + return text + + @abstractmethod + def strike(self, text): + return text + + @abstractmethod + def hide(self, text): + return text + + @abstractmethod + def superscript(self, text): + return text + + @abstractmethod + def subscript(self, text): + return text + @abstractmethod def tab(self): return True @@ 
-388,15 +739,9 @@ def page_break(self): return True @abstractmethod - def right_justify(self, text): - return text - - @abstractmethod - def center_justify(self, text): + def indent(self, text, left='', right='', firstLine=''): return text @abstractmethod - def indent(self, text, left=None, right=None, firstLine=None): - return text - - #TODO JUSTIFIED JUSTIFIED TEXT + def empty_cell(self): + return '' diff --git a/pydocx/HtmlConversion.py b/pydocx/HtmlConversion.py new file mode 100644 index 00000000..cab112f1 --- /dev/null +++ b/pydocx/HtmlConversion.py @@ -0,0 +1,394 @@ +import xml.etree.ElementTree as ElementTree +from xml.etree.ElementTree import _ElementInterface +from pydocx.py_docx.docx import * +import py_docx.docx as docx + + +def find_first(self, tag): + """ + Find the first occurrence of a tag beneath the current element. + """ + return self.find('.//' + tag) + + +def find_all(self, tag): + """ + Find all occurrences of a tag + """ + return self.findall('.//' + tag) + + +def has_descendant_with_tag(el, tag): + """ + Determine if there is a child ahead in the element tree. + """ + # Get child. stop at first child. 
+ return True if el.find('.//' + tag) is not None else False + + +setattr(_ElementInterface, 'find_first', find_first) +setattr(_ElementInterface, 'find_all', find_all) +setattr(_ElementInterface, 'is_first_list_item', False) +setattr(_ElementInterface, 'is_last_list_item', False) +setattr(_ElementInterface, 'in_table', False) +setattr(_ElementInterface, 'has_descendant_with_tag', has_descendant_with_tag) +setattr(_ElementInterface, 'new_list', False) +setattr(_ElementInterface, 'new_ilvl', False) +setattr(_ElementInterface, 'is_first_list', False) +setattr(_ElementInterface, 'is_last_item_in_list', False) + + +class Html2Docx(): + + def __init__(self, html): + # set up what is parsed + self.parsed = '' + with open(html, 'r') as f: + html = f.read() + # need to keep track of elements + # that have been visited + self.visited = [] + self.stored_numId = 0 + # need to keep track of the + # ilvl in the document + self.stored_ilvl = 0 + #abstractId info for the numbering documents + self.abstractIdInfo = [] + #numIds for the numbering document. 
+ #these correspond to the abstractIdInfo + self.numIds = [] + #for the numbering document + self.abstract = None + # set up the html + self.html = ElementTree.fromstring(html) + # get the relationship list + self.relationships = relationshiplist() + # make a new document + self.document = newdocument() + #get the body + self.body = self.document.xpath( + '/w:document/w:body', namespaces=nsprefixes)[0] + #make a new numbering document + self.numbering = new_numbering() + #start bulding the document + self.build() + + def build(self): + #first step is to add parent attribute + #for the whole document + def add_parent(el): + for child in el.getchildren(): + setattr(child, 'parent', el) + add_parent(child) + add_parent(self.html) + #now set the list attributes + self.set_list_attributes() + #and begin parsing + self.parse(self.html.find_first('body')) + + def find_all_by_tags(self, html, *args): + #helper function to find all the elements + #with mutiple tags + list_elements = [] + for el in html.iter(): + if el.tag in args: + list_elements.append(el) + return list_elements + + def check_for_lst_parent(self, el): + #helper function to see if a list + #has an li as a parent. 
+ #meaning that its parent is itself + #a list and therefore, it is nested + lst_parent = False + if el.parent.tag != 'body': + if el.parent.tag == 'li': + lst_parent = True + #return true if you find a list parent + return lst_parent + self.check_for_lst_parent(el.parent) + else: + return lst_parent + + def set_list_attributes(self): + #now we set the list attributes + ilvl = 0 + numId = 0 + lsts = self.find_all_by_tags(self.html, 'ol', 'ul') + for lst in lsts: + lst.getchildren()[0].is_first_list_item = True + lst.getchildren()[-1].is_last_list_item = True + for item in lst.getchildren(): + #if the element does not have a parent and it is + #the last list item, we know it is safe to + #increment the numId, meaning there is a new + #list + if not self.check_for_lst_parent(item.parent): + if item.is_last_list_item: + numId += 1 + #has to be true because a new list will + # automatically have a new ilvl + item.new_ilvl = True + item.new_list = True + #also have to set the ilvl back to 0 + ilvl = 0 + elif item.is_first_list_item and self.check_for_lst_parent( + item.parent): + #if a list if item has a parent that is a list + #and its the first item, we must increment the + #indentation level (ilvl) + item.new_ilvl = True + ilvl += 1 + item.ilvl = ilvl + item.num_id = numId + item.is_list_item = True + + def parse(self, el): + for child in el.getchildren(): + if child.tag == 'br': + #if we find a break tag, look for text after it + text_and_style = self.parse_r(child)[0] + just = self.parse_r(child)[1] + self.body.append(paragraph(text_and_style, jc=just)) + if child.tag == 'p': + #if we find a p tag, look for text after it + text_and_style = self.parse_r(child)[0] + just = self.parse_r(child)[1] + self.body.append(paragraph(text_and_style, jc=just)) + if child.tag == 'ul' or child.tag == 'ol': + #if we find a list, look for text after it + lst_type = child.tag + self.parse_list(child, lst_type) + if child.tag == 'table': + #separate function for parsing tables + 
#because in word, the table tags are the parent + #of the p tag, so we have to handle + #them a bit differently + self.body.append(self.parse_table(child)) + self.parse(child) + self.save() + + def parse_r(self, el): + # we have to the whole block of + # text that will go in a paragraph + par_block = [] + # we have to get the breaks that + # will go in the paragraph + breaks = [] + #we need this to creating a string of the styles + #i.e., bold, italic, underline + style = '' + just = 'left' + for child in el.iter(): + text = '' + if child.tag == 'div': + #look for what the justification is + if 'center' in child.attrib['class']: + just = 'center' + elif 'right' in child.attrib['class']: + just = 'right' + if child.tag == 'em': + #if there's an em tag, + #add italic to style + style += 'i' + if child.tag == 'strong': + #if there's a strong tag, + #add bold to style + style += 'b' + if child.tag == 'underline': + #if there's an underline tag, + #add underline to style + style += 'u' + if child.text: + #get the text + text = child.text + if child.tag == 'br' and child not in self.visited: + #dont want to hit breaks twice + #text of break comes at the tail + text = child.tail + breaks.append('br') + self.visited.append(child) + if text: + #if text, add everything to the parblock + #set the style back to blank + par_block.append([text, style, breaks]) + style = '' + if child.parent and child.parent.tag == 'li': + #if it has a list parent, return early + return par_block, just + return par_block, just + + def parse_list(self, lst, lst_type=''): + tentatives = None + """ + parsing lists, we need to keep track of both + the list itself, and as we go through build up + the numbering document. 
for some reason, + there are two sections of a word numbering document: + an abstract numbering section that contains all of the + relevant list info, as well as a num section that contains + references to the abstract numbers defined earlier in the + numbering file + """ + for child in lst.getchildren(): + if child not in self.visited: + #first append the elements to + #the visisted elements + self.visited.append(child) + #get the text and style of this child + text_and_style = self.parse_r(child)[0] + #get the justication of the style + just = self.parse_r(child)[1] + #if its an ol, then its a decimal list + if lst_type == 'ol': + type_lst = 'decimal' + #if its a ul, then its a bulleted list + if lst_type == 'ul': + type_lst = 'bullet' + if child.new_ilvl: + #if theres a new ilvl, increase + #the indentation + ind = 720 * (child.ilvl + 1) + #create a numId attribute for the list, this + #is for the numbering document, + num = create_list_attributes( + ilvl=str(child.ilvl), + type=type_lst, just=just, left=str(ind)) + #append that numId to the lists of + #all the numIds + #we will later append this info to the + #abstract id section of the numbering document + self.numIds.append(num) + self.stored_ilvl += 1 + if not child.find('ol') and not child.find('ul'): + tentatives = fill_tentative( + self.stored_ilvl, type_lst=type_lst) + #if we cant find another list, we know its the + #last item and it's ok to fill out the rest of the + #abstract num info + + #abstractnumid gets increased + # for every list, starts out at 0. numIds themselves + self.abstract = create_list(child.num_id - 1) + self.numbering.append(self.abstract) + #here is where we append to the abstract num section + for num in self.numIds: + self.abstract.append(num) + #now we have to create tentative lists. the way that + #word is able to nicely do indent to create new lists + #is by creating tentative lists that start past the + #last indent. 
it goes all the way up to 8, because that's + #all that will fit in the width of the file. + for tentative in tentatives: + self.abstract.append(tentative) + #now we have our abstract id info, and we have to append to + #it the current num_id + self.abstractIdInfo.append( + create_abstract_IdInfo(str(child.num_id))) + #we're done here, so we can set our stored_ilvl back to 0 + self.stored_ilvl = 0 + #and we can set our num ideas to zero + self.numIds = [] + #now we append to hte body the relavent list info + self.body.append( + paragraph( + text_and_style, is_list=True, + ilvl=str(child.ilvl), numId=str(child.num_id), + style=lst_type, jc=just)) + #if, from the current list element, we find another list, + # we have to parse that lists BEFORE we parse the next list + # item in the current list + if child.find('ul'): + lst = child.find('ul') + self.parse_list(lst, lst.tag) + if child.find('ol'): + lst = child.find('ol') + self.parse_list(lst, lst.tag) + + def table_look_ahead(self, tbl): + #table look ahead function, + #we need to do this to account for vertical merges. in html + #all you need to do is include the rowspan and not include any + #extra table elements. word, on the other hand, expects an + #empty tale with a vmerge attribute inside it. so we're + #going to go thru and create these elements and insert them + #into the html document + trs = tbl.find_all('tr') + for i in range(len(trs)): + tcs = trs[i].find_all('td') + for j in range(len(tcs)): + if 'rowspan' in tcs[j].attrib: + for x in range(1, int(tcs[j].attrib['rowspan'])): + tc = ElementTree.Element('td') + setattr(tc, 'parent', trs[i+x]) + tc.set('vmerge_continue', True) + trs[i + x].insert(j, tc) + return tbl + + def get_columns(self, tbl): + #have to get the total number of columns + #for the table. 
just go by the first row + #but if there is a colspan, add that to the + #column count + columns = 0 + trs = tbl.find_all('tr') + tcs = trs[0].find_all('td') + for tc in tcs: + tc.in_table = True + if 'colspan' in tc.attrib: + columns += int(tc.attrib['colspan']) + else: + columns += 1 + return columns + + def parse_table(self, el): + #get the number of columns + columns = self.get_columns(el) + #set up the table properties + tbl = createtblproperties(columns) + #going to have to do a look ahead and + #create those extra table rows + for tr in self.table_look_ahead(el).getchildren(): + table_row = createtablerow() + tcs = tr.find_all('td') + for tc in tcs: + colspan = '' + vmerge = {} + #now look for colspans + #and rowspans (referenced by + #total number of vmerge starting from + #a vmerge:restart + if 'colspan' in tc.attrib: + colspan = tc.attrib['colspan'] + if 'rowspan' in tc.attrib: + vmerge = {'val': 'restart'} + if 'vmerge_continue' in tc.attrib: + vmerge = {'val': 'continue'} + cell = createtablecell(gridspan=colspan, vmerge=vmerge) + text_and_style = self.parse_r(tc)[0] + just = self.parse_r(tc)[1] + par_run = paragraph(text_and_style, jc=just) + cell.append(par_run) + table_row.append(cell) + tbl.append(table_row) + return tbl + + def save(self): + title = 'Python docx demo' + subject = 'A practical example of making docx from Python' + creator = 'Mike MacCana' + keywords = ['python', 'Office Open XML', 'Word'] + for abstract in self.abstractIdInfo: + self.numbering.append(abstract) + coreprops = coreproperties( + title=title, subject=subject, + creator=creator, keywords=keywords) + appprops = appproperties() + contenttypes = docx.contenttypes() + websettings = docx.websettings() + wordrelationships = docx.wordrelationships(self.relationships) + # Save our document + savedocx( + self.document, coreprops, + appprops, contenttypes, websettings, + wordrelationships, 'Testing.docx', self.numbering) diff --git a/pydocx/__init__.py b/pydocx/__init__.py index 
9b42e00f..07833131 100644 --- a/pydocx/__init__.py +++ b/pydocx/__init__.py @@ -1,8 +1,19 @@ -from .parsers import * +from .parsers import Docx2LaTex, Docx2Html, Docx2Markdown +from HtmlConversion import Html2Docx + def docx2html(path): return Docx2Html(path).parsed + def docx2markdown(path): return Docx2Markdown(path).parsed + +def docx2latex(path): + return Docx2LaTex(path).parsed + +def html2docx(path): + return Html2Docx(path).parsed + +VERSION = '0.3.1' diff --git a/pydocx/fixtures/all_configured_styles.docx b/pydocx/fixtures/all_configured_styles.docx new file mode 100644 index 00000000..8f514372 Binary files /dev/null and b/pydocx/fixtures/all_configured_styles.docx differ diff --git a/pydocx/fixtures/attachment_is_tiff.docx b/pydocx/fixtures/attachment_is_tiff.docx new file mode 100644 index 00000000..774362ca Binary files /dev/null and b/pydocx/fixtures/attachment_is_tiff.docx differ diff --git a/pydocx/fixtures/bigger_font_size_to_header.docx b/pydocx/fixtures/bigger_font_size_to_header.docx new file mode 100644 index 00000000..c722888b Binary files /dev/null and b/pydocx/fixtures/bigger_font_size_to_header.docx differ diff --git a/pydocx/fixtures/convert_p_to_h.docx b/pydocx/fixtures/convert_p_to_h.docx new file mode 100644 index 00000000..53769e15 Binary files /dev/null and b/pydocx/fixtures/convert_p_to_h.docx differ diff --git a/pydocx/fixtures/fake_headings_by_length.docx b/pydocx/fixtures/fake_headings_by_length.docx new file mode 100644 index 00000000..a130f5ba Binary files /dev/null and b/pydocx/fixtures/fake_headings_by_length.docx differ diff --git a/pydocx/fixtures/greek_alphabet.docx b/pydocx/fixtures/greek_alphabet.docx new file mode 100644 index 00000000..46ab5429 Binary files /dev/null and b/pydocx/fixtures/greek_alphabet.docx differ diff --git a/pydocx/fixtures/has_image.docx b/pydocx/fixtures/has_image.docx new file mode 100644 index 00000000..2ebd0bd0 Binary files /dev/null and b/pydocx/fixtures/has_image.docx differ diff --git 
a/pydocx/fixtures/has_missing_image.docx b/pydocx/fixtures/has_missing_image.docx new file mode 100644 index 00000000..996e6671 Binary files /dev/null and b/pydocx/fixtures/has_missing_image.docx differ diff --git a/pydocx/fixtures/has_title.docx b/pydocx/fixtures/has_title.docx new file mode 100644 index 00000000..a87d88ed Binary files /dev/null and b/pydocx/fixtures/has_title.docx differ diff --git a/pydocx/fixtures/header_footer_problem.docx b/pydocx/fixtures/header_footer_problem.docx new file mode 100644 index 00000000..6bc49a7a Binary files /dev/null and b/pydocx/fixtures/header_footer_problem.docx differ diff --git a/pydocx/fixtures/headers.docx b/pydocx/fixtures/headers.docx new file mode 100644 index 00000000..890104c7 Binary files /dev/null and b/pydocx/fixtures/headers.docx differ diff --git a/pydocx/fixtures/headers_with_full_line_styles.docx b/pydocx/fixtures/headers_with_full_line_styles.docx new file mode 100644 index 00000000..38d6f6a8 Binary files /dev/null and b/pydocx/fixtures/headers_with_full_line_styles.docx differ diff --git a/pydocx/fixtures/inline_tags.docx b/pydocx/fixtures/inline_tags.docx new file mode 100644 index 00000000..4aba2347 Binary files /dev/null and b/pydocx/fixtures/inline_tags.docx differ diff --git a/pydocx/fixtures/justification.docx b/pydocx/fixtures/justification.docx new file mode 100644 index 00000000..7f8a3bf1 Binary files /dev/null and b/pydocx/fixtures/justification.docx differ diff --git a/pydocx/fixtures/list_in_table.docx b/pydocx/fixtures/list_in_table.docx new file mode 100644 index 00000000..d1a87388 Binary files /dev/null and b/pydocx/fixtures/list_in_table.docx differ diff --git a/pydocx/fixtures/list_to_header.docx b/pydocx/fixtures/list_to_header.docx new file mode 100644 index 00000000..f9b3946e Binary files /dev/null and b/pydocx/fixtures/list_to_header.docx differ diff --git a/pydocx/fixtures/lists_with_styles.docx b/pydocx/fixtures/lists_with_styles.docx new file mode 100644 index 00000000..c1c7ecf8 
Binary files /dev/null and b/pydocx/fixtures/lists_with_styles.docx differ diff --git a/pydocx/fixtures/localDpi.docx b/pydocx/fixtures/localDpi.docx new file mode 100644 index 00000000..0f6d7f77 Binary files /dev/null and b/pydocx/fixtures/localDpi.docx differ diff --git a/pydocx/fixtures/missing_content.docx b/pydocx/fixtures/missing_content.docx new file mode 100644 index 00000000..21bed964 Binary files /dev/null and b/pydocx/fixtures/missing_content.docx differ diff --git a/pydocx/fixtures/nested_lists.docx b/pydocx/fixtures/nested_lists.docx new file mode 100644 index 00000000..f4000dfa Binary files /dev/null and b/pydocx/fixtures/nested_lists.docx differ diff --git a/pydocx/fixtures/nested_table_rowspan.docx b/pydocx/fixtures/nested_table_rowspan.docx new file mode 100644 index 00000000..b43b8a0d Binary files /dev/null and b/pydocx/fixtures/nested_table_rowspan.docx differ diff --git a/pydocx/fixtures/nested_tables.docx b/pydocx/fixtures/nested_tables.docx new file mode 100644 index 00000000..af704d4d Binary files /dev/null and b/pydocx/fixtures/nested_tables.docx differ diff --git a/pydocx/fixtures/resized_image.docx b/pydocx/fixtures/resized_image.docx new file mode 100644 index 00000000..913099c4 Binary files /dev/null and b/pydocx/fixtures/resized_image.docx differ diff --git a/pydocx/fixtures/shift_enter.docx b/pydocx/fixtures/shift_enter.docx new file mode 100644 index 00000000..4128c0a2 Binary files /dev/null and b/pydocx/fixtures/shift_enter.docx differ diff --git a/pydocx/fixtures/simple.docx b/pydocx/fixtures/simple.docx new file mode 100644 index 00000000..1d2a1c23 Binary files /dev/null and b/pydocx/fixtures/simple.docx differ diff --git a/pydocx/fixtures/simple_lists.docx b/pydocx/fixtures/simple_lists.docx new file mode 100644 index 00000000..c09ad744 Binary files /dev/null and b/pydocx/fixtures/simple_lists.docx differ diff --git a/pydocx/fixtures/simple_table.docx b/pydocx/fixtures/simple_table.docx new file mode 100644 index 
00000000..26de483c Binary files /dev/null and b/pydocx/fixtures/simple_table.docx differ diff --git a/pydocx/fixtures/special_chars.docx b/pydocx/fixtures/special_chars.docx new file mode 100644 index 00000000..b4b9287f Binary files /dev/null and b/pydocx/fixtures/special_chars.docx differ diff --git a/pydocx/fixtures/split_header.docx b/pydocx/fixtures/split_header.docx new file mode 100644 index 00000000..cc4bd5cf Binary files /dev/null and b/pydocx/fixtures/split_header.docx differ diff --git a/pydocx/fixtures/super_and_subscript.docx b/pydocx/fixtures/super_and_subscript.docx new file mode 100644 index 00000000..06ea2d7a Binary files /dev/null and b/pydocx/fixtures/super_and_subscript.docx differ diff --git a/pydocx/fixtures/table_col_row_span.docx b/pydocx/fixtures/table_col_row_span.docx new file mode 100644 index 00000000..856abfdf Binary files /dev/null and b/pydocx/fixtures/table_col_row_span.docx differ diff --git a/pydocx/fixtures/tables_in_lists.docx b/pydocx/fixtures/tables_in_lists.docx new file mode 100644 index 00000000..11859541 Binary files /dev/null and b/pydocx/fixtures/tables_in_lists.docx differ diff --git a/pydocx/fixtures/track_changes_on.docx b/pydocx/fixtures/track_changes_on.docx new file mode 100644 index 00000000..dcb7ba1c Binary files /dev/null and b/pydocx/fixtures/track_changes_on.docx differ diff --git a/pydocx/fixtures/upper_alpha_all_bold.docx b/pydocx/fixtures/upper_alpha_all_bold.docx new file mode 100644 index 00000000..d518b2c5 Binary files /dev/null and b/pydocx/fixtures/upper_alpha_all_bold.docx differ diff --git a/pydocx/lxmlparser.py b/pydocx/lxmlparser.py deleted file mode 100644 index 94b130d3..00000000 --- a/pydocx/lxmlparser.py +++ /dev/null @@ -1,111 +0,0 @@ -import zipfile -from lxml import etree -from StringIO import StringIO -__author__ = 'samportnow' - -#for el in tree.iter(): - # The way lists are handled could double visit certain elements; keep - # track of which elements have been visited and skip any that 
have been - # visited already. - #if el in visited_nodes: - #continue -with zipfile.ZipFile('/Users/samportnow/Documents/pydocx/helloworld.docx') as f: - document = f.read('word/document.xml') - numbering= f.read('word/numbering.xml') -parser=etree.XMLParser(ns_clean=True) -document=StringIO(document) -numbering=StringIO(numbering) -numbering_tree=etree.parse(numbering,parser) -numbering_namespace=numbering_tree.getroot().nsmap['w'] -visited_els=[] - -def get_parsed(): - parser=etree.XMLParser(ns_clean=True) - tree=etree.parse(document,parser) - namespace=tree.getroot().nsmap['w'] - #rpr is run properties for the paragraph mark - paragraph='' - run_text='' - running_text='' - for el in tree.iter(): - if el.tag=='{%s}p' %namespace: - for wp in el.iter(): - if wp.tag =='{%s}ins' %namespace: - for text in wp.iterchildren(): - if text not in visited_els: - run_text +='
'+get_text(text,namespace,visited_els)+'
' - visited_els.append(text) - if wp.tag=='{%s}r' %namespace and wp not in visited_els: - run_text+=get_text(wp,namespace,visited_els) - visited_els.append(wp) - if not el.getchildren(): - run_text+='
' - if wp.tag == '{%s}ilvl' %namespace: - for lst in el.iter(): - if lst.find('{%s}numId' %namespace) is not None and el not in visited_els: - numval = lst.find('{%s}numId' %namespace).attrib['{%s}val' %namespace] - lst_type=get_list_style(numval) - if get_text(lst,namespace,visited_els) and el not in visited_els and lst_type['{%s}val' %namespace] != 'bullet': - if lst.getnext() is not None: - if lst not in visited_els: - while lst.getnext() is not None: - if lst not in visited_els: - text = get_text(lst,namespace,visited_els) - next_txt = get_text(lst.getnext(),namespace,visited_els) - running_text += text + next_txt - visited_els.append(lst) - visited_els.append(lst.getnext()) - lst=lst.getnext() - else: - run_text += '
  • ' + running_text + '
  • ' - break - else: - run_text +='
  • ' + get_text(lst, namespace, visited_els) + '
  • ' - visited_els.append(lst) - print running_text - return run_text - - -def get_text(wp,namespace,visited_els): - run_text= '' - decorator = '' - closing = '' - if wp.find('{%s}tab' %namespace) is not None: - run_text+='%nbsp' - if wp.find('{%s}rPr' %namespace) is not None: - for tag in wp.iter(): - if tag.find('{%s}u' %namespace) is not None: - if wp.find('{%s}t' %namespace) is not None: - decorator +='' - closing += '' - visited_els.append(wp.find('{%s}t' %namespace)) - if tag.find('{%s}i' %namespace) is not None: - if wp.find('{%s}t' %namespace) is not None: - decorator += '' - closing += '' - visited_els.append(wp.find('{%s}t' %namespace)) - if tag.find('{%s}b' %namespace) is not None: - if wp.find('{%s}t' %namespace) is not None: - decorator += '' - closing += '' - visited_els.append(wp.find('{%s}t' %namespace)) - run_text = wp.find('{%s}t' %namespace).text - run_text = decorator + run_text + closing - if wp.find('{%s}t' %namespace) is not None and wp.find('{%s}t' %namespace) not in visited_els: - run_text+=wp.find('{%s}t' %namespace).text - return run_text - -def get_list_style(numval): - ids = numbering_tree.findall('{%s}num' %numbering_namespace) - for id in ids: - if id.attrib['{%s}numId' %numbering_namespace] == numval: - abstractid=id.find('{%s}abstractNumId' %numbering_namespace) - abstractid=abstractid.attrib['{%s}val' %numbering_namespace] - style_information=numbering_tree.findall('{%s}abstractNum' %numbering_namespace) - for info in style_information: - if info.attrib['{%s}abstractNumId' %numbering_namespace] == abstractid: - for i in info.iter(): - if i.find('{%s}numFmt' %numbering_namespace) is not None: - return i.find('{%s}numFmt' %numbering_namespace).attrib - -print get_parsed() diff --git a/pydocx/parsers/Docx2Html.py b/pydocx/parsers/Docx2Html.py index bfaad2a6..c829e33d 100644 --- a/pydocx/parsers/Docx2Html.py +++ b/pydocx/parsers/Docx2Html.py @@ -1,21 +1,46 @@ -from pydocx.DocxParser import DocxParser - +import base64 import 
xml.sax.saxutils +from pydocx.DocxParser import DocxParser + class Docx2Html(DocxParser): @property def parsed(self): - self._parsed = self._parsed.replace('

    ', '
    ') - self._parsed = self._parsed.replace('


    ', '

    ') - self._parsed = self._parsed.replace('