diff --git a/.gitignore b/.gitignore
index ce7a7cef..40efeefd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,6 @@ pip-log.txt
nosetests.xml
*.mo
.idea
+
+test.html
+testxml.html
diff --git a/.travis.yml b/.travis.yml
index 6a5babb4..4251ba15 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,9 +2,13 @@ language: python
python:
- "2.6"
- "2.7"
-script: python main.py
+script: ./run_tests.sh
install:
+ - python setup.py -q install
- pip install -r requirements.txt
+env:
+ - TRAVIS_EXECUTE_PERFORMANCE=1
notifications:
email:
- jason.louard.ward@gmail.com
+ - samson91787@gmail.com
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 00000000..33954f41
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,2 @@
+Sam Portnow
+Jason Ward
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 00000000..a3c57d6f
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,29 @@
+
+Changelog
+=========
+* 0.3.2
+  * We were assuming that vertical merges always carry a continue attribute,
+    but sometimes they do not, and in those cases Word assumes the continue
+    attribute. The parser now handles the cases in which the continue
+    attribute is missing.
+* 0.3.1
+ * Added support for several more OOXML tags including:
+ * caps
+ * smallCaps
+ * strike
+ * dstrike
+ * vanish
+ * webHidden
+ More details in the README.
+* 0.3.0
+  * We switched from using stock *xml.etree.ElementTree* to using
+    *xml.etree.cElementTree*. This has resulted in a fairly significant speed
+    increase for Python 2.6.
+  * It is now possible to create your own pre-processor to do additional
+    pre-processing.
+ * Superscripts and subscripts are now extracted correctly.
+* 0.2.1
+ * Added a changelog
+ * Added the version in pydocx.__init__
+ * Fixed an issue with duplicating content if there was indentation or
+ justification on a p element that had multiple t tags.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..88fbbf67
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,7 @@
+include AUTHORS
+include CHANGELOG
+include LICENSE
+include MANIFEST.in
+include README.rst
+include pydocx/fixtures/*
+include pydocx/tests/templates/*
diff --git a/README.md b/README.md
deleted file mode 100644
index e3773551..00000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-pydocx
-======
\ No newline at end of file
diff --git a/README.rst b/README.rst
new file mode 100644
index 00000000..2f750299
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,228 @@
+======
+pydocx
+======
+.. image:: https://travis-ci.org/OpenScienceFramework/pydocx.png?branch=master
+ :align: left
+ :target: https://travis-ci.org/OpenScienceFramework/pydocx
+
+pydocx is a parser that breaks down the elements of a docx file and converts them
+into different markup languages. Right now, HTML is supported. Markdown and LaTeX
+will be available soon. You can extend any of the available parsers to customize them
+to your needs. You can also create your own class that inherits from DocxParser
+to create your own methods for a markup language that is not yet supported.
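+
+For example, converting a document to HTML is a one-liner. A minimal sketch
+(the file names here are only placeholders):
+
+::
+
+    from pydocx import docx2html
+
+    html = docx2html('sample.docx')
+    with open('sample.html', 'w') as f:
+        f.write(html)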
+
+Currently Supported
+###################
+
+* tables
+ * nested tables
+ * rowspans
+ * colspans
+ * lists in tables
+* lists
+ * list styles
+ * nested lists
+ * list of tables
+  * list of paragraphs
+* justification
+* images
+* styles
+ * bold
+ * italics
+ * underline
+ * hyperlinks
+* headings
+
+Usage
+#####
+
+DocxParser includes abstract methods that each parser overrides to satisfy its own needs. The abstract methods are as follows:
+
+::
+
+ class DocxParser:
+
+ @property
+ def parsed(self):
+ return self._parsed
+
+ @property
+ def escape(self, text):
+ return text
+
+ @abstractmethod
+ def linebreak(self):
+ return ''
+
+ @abstractmethod
+ def paragraph(self, text):
+ return text
+
+ @abstractmethod
+ def heading(self, text, heading_level):
+ return text
+
+ @abstractmethod
+ def insertion(self, text, author, date):
+ return text
+
+ @abstractmethod
+ def hyperlink(self, text, href):
+ return text
+
+ @abstractmethod
+ def image_handler(self, path):
+ return path
+
+ @abstractmethod
+ def image(self, path, x, y):
+ return self.image_handler(path)
+
+ @abstractmethod
+ def deletion(self, text, author, date):
+ return text
+
+ @abstractmethod
+ def bold(self, text):
+ return text
+
+ @abstractmethod
+ def italics(self, text):
+ return text
+
+ @abstractmethod
+ def underline(self, text):
+ return text
+
+ @abstractmethod
+ def superscript(self, text):
+ return text
+
+ @abstractmethod
+ def subscript(self, text):
+ return text
+
+ @abstractmethod
+ def tab(self):
+ return True
+
+ @abstractmethod
+ def ordered_list(self, text):
+ return text
+
+ @abstractmethod
+ def unordered_list(self, text):
+ return text
+
+ @abstractmethod
+ def list_element(self, text):
+ return text
+
+ @abstractmethod
+ def table(self, text):
+            return text
+
+        @abstractmethod
+ def table_row(self, text):
+ return text
+
+ @abstractmethod
+ def table_cell(self, text):
+ return text
+
+ @abstractmethod
+ def page_break(self):
+ return True
+
+ @abstractmethod
+ def indent(self, text, left='', right='', firstLine=''):
+ return text
+
+Docx2Html inherits from DocxParser and implements basic HTML handling. For example:
+
+::
+
+ class Docx2Html(DocxParser):
+
+ # Escape '&', '<', and '>' so we render the HTML correctly
+ def escape(self, text):
+ return xml.sax.saxutils.quoteattr(text)[1:-1]
+
+ # return a line break
+ def linebreak(self, pre=None):
+            return '<br />'
+
+ # add paragraph tags
+ def paragraph(self, text, pre=None):
+            return '<p>' + text + '</p>'
+
+
+However, let's say you want to add a specific style to your HTML document. In order to do this, you want to make each paragraph a class of type `my_implementation`. Simply extend Docx2Html and override what you need.
+
+::
+
+ class My_Implementation_of_Docx2Html(Docx2Html):
+
+ def paragraph(self, text, pre = None):
+            return '<p class="my_implementation">' + text + '</p>'
+
+
+
+Or, let's say FOO is your new favorite markup language. Simply create your own new parser, overriding the abstract methods of DocxParser.
+
+::
+
+ class Docx2Foo(DocxParser):
+
+        # because linebreaks are denoted by '!!!!!!!!!!!!' in the FOO markup language :)
+ def linebreak(self):
+ return '!!!!!!!!!!!!'
+
+Custom Pre-Processor
+####################
+
+When creating your own parser (as described above), you can also add your own custom pre-processor. To do so, set the `pre_processor_class` field on the custom parser, like so:
+
+::
+
+ class Docx2Foo(DocxParser):
+ pre_processor_class = FooPrePorcessor
+
+
+The `FooPrePorcessor` will need a few things to get you going:
+
+::
+
+ class FooPrePorcessor(PydocxPrePorcessor):
+ def perform_pre_processing(self, root, *args, **kwargs):
+ super(FooPrePorcessor, self).perform_pre_processing(root, *args, **kwargs)
+ self._set_foo(root)
+
+ def _set_foo(self, root):
+ pass
+
+If you want `_set_foo` to be called, you must add it to `perform_pre_processing`, which is called by the base parser for pydocx.
+
+Everything done during pre-processing is executed prior to `parse` being called for the first time.
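+
+As a minimal sketch, constructing the (hypothetical) `Docx2Foo` parser from
+above triggers pre-processing and parsing in one step; the file name is only
+a placeholder:
+
+::
+
+    parser = Docx2Foo('sample.docx')  # pre-processing runs during __init__
+    foo_output = parser.parsed        # the fully parsed FOO markup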
+
+
+Styles
+######
+
+The base parser `Docx2Html` relies on certain CSS classes being set for certain behaviour to occur. Currently these include:
+
+* class `pydocx-insert` -> Turns the text green.
+* class `pydocx-delete` -> Turns the text red and draws a line through the text.
+* class `pydocx-center` -> Aligns the text to the center.
+* class `pydocx-right` -> Aligns the text to the right.
+* class `pydocx-left` -> Aligns the text to the left.
+* class `pydocx-comment` -> Turns the text blue.
+* class `pydocx-underline` -> Underlines the text.
+* class `pydocx-caps` -> Makes all text uppercase.
+* class `pydocx-small-caps` -> Makes all text uppercase; letters that were truly lowercase will be smaller than their uppercase counterparts.
+* class `pydocx-strike` -> Strikes a line through the text.
+* class `pydocx-hidden` -> Hides the text.
+
+Optional Arguments
+##################
+
+You can pass `convert_root_level_upper_roman=True` to the parser and it will convert all root-level upper roman lists to headings instead.
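+
+A minimal sketch (again, the file name is only a placeholder):
+
+::
+
+    from pydocx import Docx2Html
+
+    html = Docx2Html(
+        'sample.docx',
+        convert_root_level_upper_roman=True,
+    ).parsed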
diff --git a/main.py b/main.py
deleted file mode 100644
index c9e8e1d4..00000000
--- a/main.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from pydocx import *
-from bs4 import BeautifulSoup
-import xml.etree.ElementTree as ElementTree
-#import lxml.etree as etree
-
-with open('test.html', 'w') as f:
- f.write(docx2html('helloworld.docx'))
-with open('testxml.html','w') as f:
- f.write(BeautifulSoup(ElementTree.tostring(Docx2Html('helloworld.docx').root)).prettify())
-
-#print docx2html('helloworld.docx')
-#print docx2markdown('helloworld.docx')
\ No newline at end of file
diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py
index b3006ef0..092248f0 100644
--- a/pydocx/DocxParser.py
+++ b/pydocx/DocxParser.py
@@ -1,323 +1,634 @@
-from abc import abstractmethod, ABCMeta
-import zipfile
import logging
-import xml.etree.ElementTree as ElementTree
-from xml.etree.ElementTree import _ElementInterface
+import os
+import zipfile
+
+from abc import abstractmethod, ABCMeta
+from contextlib import contextmanager
+
+from pydocx.utils import (
+ PydocxPrePorcessor,
+ get_list_style,
+ parse_xml_from_string,
+ find_first,
+ find_all,
+ find_ancestor_with_tag,
+ has_descendant_with_tag,
+)
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("NewParser")
-def remove_namespaces(document):
- root = ElementTree.fromstring(document)
- for child in el_iter(root):
- child.tag = child.tag.split("}")[1]
- child.attrib = dict(
- (k.split("}")[1], v)
- for k, v in child.attrib.items()
- )
- return ElementTree.tostring(root)
-
-# Add some helper functions to Element to make it slightly more readable
-
-
-def has_child(self, tag):
- return True if self.find(tag) is not None else False
-
-
-def has_child_all(self, tag):
- return True if self.find('.//' + tag) is not None else False
-
-
-def find_all(self, tag):
- return self.find('.//' + tag)
-
-
-def findall_all(self, tag):
- return self.findall('.//' + tag)
+# http://openxmldeveloper.org/discussions/formats/f/15/p/396/933.aspx
+EMUS_PER_PIXEL = 9525
+USE_ALIGNMENTS = True
+JUSTIFY_CENTER = 'center'
+JUSTIFY_LEFT = 'left'
+JUSTIFY_RIGHT = 'right'
-def el_iter(el):
- try:
- return el.iter()
- except AttributeError:
- return el.findall('.//*')
+INDENTATION_RIGHT = 'right'
+INDENTATION_LEFT = 'left'
+INDENTATION_FIRST_LINE = 'firstLine'
+# Add some helper functions to Element to make it slightly more readable
-setattr(_ElementInterface, 'has_child', has_child)
-setattr(_ElementInterface, 'has_child_all', has_child_all)
-setattr(_ElementInterface, 'find_all', find_all)
-setattr(_ElementInterface, 'findall_all', findall_all)
-setattr(_ElementInterface, 'parent', None)
-setattr(_ElementInterface, 'parent_list', [])
-# End helpers
+@contextmanager
+def ZipFile(path): # This is not needed in python 3.2+
+ f = zipfile.ZipFile(path)
+ yield f
+ f.close()
class DocxParser:
__metaclass__ = ABCMeta
+ pre_processor_class = PydocxPrePorcessor
- def __init__(self, path):
- self._parsed = ''
- self.in_list = False
-
- f = zipfile.ZipFile(path)
- try:
+ def _build_data(self, path, *args, **kwargs):
+ with ZipFile(path) as f:
self.document_text = f.read('word/document.xml')
+ self.styles_text = f.read('word/styles.xml')
try:
+ self.fonts = f.read('/word/fontTable.xml')
+ except KeyError:
+ self.fonts = None
+ try: # Only present if there are lists
self.numbering_text = f.read('word/numbering.xml')
- except zipfile.BadZipfile:
- pass
- try:
+ except KeyError:
+ self.numbering_text = None
+ try: # Only present if there are comments
self.comment_text = f.read('word/comments.xml')
- except zipfile.BadZipfile:
- pass
- finally:
- f.close()
-
- self.root = ElementTree.fromstring(
- remove_namespaces(self.document_text),
- )
-
- def add_parent(el):
- for child in el.getchildren():
- setattr(child, 'parent', el)
- add_parent(child)
- add_parent(self.root)
-
- def create_parent_list(el, tmp=None):
- if tmp is None:
- tmp = []
- for child in el:
- tmp.append(el)
- tmp = create_parent_list(child, tmp)
- el.parent_list = tmp[:]
- try:
- tmp.pop()
- except:
- tmp = []
- return tmp
-
- create_parent_list(self.root)
-
+ except KeyError:
+ self.comment_text = None
+ self.relationship_text = f.read('word/_rels/document.xml.rels')
+ zipped_image_files = [
+ e for e in f.infolist()
+ if e.filename.startswith('word/media/')
+ ]
+ for e in zipped_image_files:
+ self._image_data[e.filename] = f.read(e.filename)
+
+ self.root = parse_xml_from_string(self.document_text)
+ self.numbering_root = None
+ if self.numbering_text:
+ self.numbering_root = parse_xml_from_string(self.numbering_text)
+ self.comment_root = None
+ if self.comment_text:
+ self.comment_root = parse_xml_from_string(self.comment_text)
+
+ def _parse_styles(self):
+ tree = parse_xml_from_string(self.styles_text)
+ result = {}
+ for style in find_all(tree, 'style'):
+ style_val = find_first(style, 'name').attrib['val']
+ result[style.attrib['styleId']] = style_val
+ return result
+
+ def _parse_rels_root(self):
+ tree = parse_xml_from_string(self.relationship_text)
+ rels_dict = {}
+ for el in tree:
+ rId = el.get('Id')
+ target = el.get('Target')
+ rels_dict[rId] = target
+ return rels_dict
+
+ def __init__(
+ self,
+ path,
+ convert_root_level_upper_roman=False,
+ *args,
+ **kwargs):
+ self._parsed = ''
+ self.block_text = ''
+ self.page_width = 0
+ self.convert_root_level_upper_roman = convert_root_level_upper_roman
+ self._image_data = {}
+ self._build_data(path, *args, **kwargs)
+ self.pre_processor = None
+
+        #divide by 20 to get to pt (Office works in 20ths of a point)
+ """
+ see http://msdn.microsoft.com/en-us/library/documentformat
+ .openxml.wordprocessing.indentation.aspx
+ """
+ if find_first(self.root, 'pgSz') is not None:
+ self.page_width = int(
+ find_first(self.root, 'pgSz').attrib['w']
+ ) / 20
+
+ #all blank when we init
self.comment_store = None
- self.numbering_store = None
- self.ignore_current = False
- self.elements = []
- self.tables_seen = []
self.visited = []
- try:
- self.numbering_root = ElementTree.fromstring(
- remove_namespaces(self.numbering_text),
- )
- except:
- pass
- self.parse_begin(self.root)
+ self.list_depth = 0
+ self.rels_dict = self._parse_rels_root()
+ self.styles_dict = self._parse_styles()
+ self.parse_begin(self.root) # begin to parse
def parse_begin(self, el):
- self._parsed += self.parse_lists(el)
-
-### parse table function and is_table flag
- def parse_lists(self, el):
- parsed = ''
- first_p = el.find_all('p')
- children = []
- for child in first_p.parent:
- if child.tag == 'p' or child.tag == 'tbl':
- children.append(child)
- p_list = children
- list_started = False
- list_type = ''
- list_chunks = []
- index_start = 0
- index_end = 1
- for i, el in enumerate(p_list):
- if not list_started and el.has_child_all('ilvl'):
- list_started = True
- list_type = self.get_list_style(
- el.find_all('numId').attrib['val'],
- )
- list_chunks.append(p_list[index_start:index_end])
- index_start = i
- index_end = i+1
- elif (
- list_started and
- el.has_child_all('ilvl') and
- not list_type == self.get_list_style(
- el.find_all('numId').attrib['val']
- )):
- list_type = self.get_list_style(
- el.find_all('numId').attrib['val'],
- )
- list_started = True
- list_chunks.append(p_list[index_start:index_end])
- index_start = i
- index_end = i+1
- elif list_started and not el.has_child_all('ilvl'):
- list_started = False
- list_chunks.append(p_list[index_start:index_end])
- index_start = i
- index_end = i+1
- else:
- index_end = i+1
- list_chunks.append(p_list[index_start:index_end])
- for chunk in list_chunks:
- chunk_parsed = ''
- for el in chunk:
- chunk_parsed += self.parse(el)
- if chunk[0].has_child_all('ilvl'):
- lst_style = self.get_list_style(
- chunk[0].find_all('numId').attrib['val'],
- )
- if lst_style['val'] == 'bullet':
- parsed += self.unordered_list(chunk_parsed)
- else:
- parsed += self.ordered_list(chunk_parsed)
- elif chunk[0].has_child_all('br'):
- parsed += self.page_break()
- else:
- parsed += chunk_parsed
-
- return parsed
+ self.pre_processor = self.pre_processor_class(
+ convert_root_level_upper_roman=self.convert_root_level_upper_roman,
+ styles_dict=self.styles_dict,
+ numbering_root=self.numbering_root,
+ )
+ self.pre_processor.perform_pre_processing(el)
+ self._parsed += self.parse(el)
def parse(self, el):
+ if el in self.visited:
+ return ''
+ self.visited.append(el)
parsed = ''
- if not self.ignore_current:
- tmp_d = dict(
- (tmpel.tag, i)
- for i, tmpel in enumerate(el.parent_list)
- )
- if (
- 'tbl' in tmp_d and
- el.parent_list[tmp_d['tbl']] not in self.tables_seen):
- self.ignore_current = True
- self.tables_seen.append(el.parent_list[tmp_d['tbl']])
- tmpout = self.table(self.parse(el.parent_list[tmp_d['tbl']]))
- self.ignore_current = False
- return tmpout
-
for child in el:
+ # recursive. So you can get all the way to the bottom
parsed += self.parse(child)
-
- if el.tag == 'br' and el.attrib['type'] == 'page':
- #TODO figure out what parsed is getting overwritten
- return self.page_break()
- # add it to the list so we don't repeat!
- if el.tag == 'ilvl' and el not in self.visited:
- self.in_list = True
- self.visited.append(el)
- ## This starts the returns
+ if el.tag == 'br' and el.attrib.get('type') == 'page':
+ return self.parse_page_break(el, parsed)
+ elif el.tag == 'tbl':
+ return self.parse_table(el, parsed)
elif el.tag == 'tr':
- return self.table_row(parsed)
+ return self.parse_table_row(el, parsed)
elif el.tag == 'tc':
- self.elements.append(el)
- return self.table_cell(parsed)
- if el.tag == 'r' and el not in self.elements:
- self.elements.append(el)
- return self.parse_r(el)
+ return self.parse_table_cell(el, parsed)
+ elif el.tag == 'r':
+ return self.parse_r(el, parsed)
+ elif el.tag == 't':
+ return self.parse_t(el, parsed)
+ elif el.tag == 'br':
+ return self.parse_break_tag(el, parsed)
+ elif el.tag == 'delText':
+ return self.parse_deletion(el, parsed)
elif el.tag == 'p':
return self.parse_p(el, parsed)
elif el.tag == 'ins':
- return self.insertion(parsed, '', '')
+ return self.parse_insertion(el, parsed)
+ elif el.tag == 'hyperlink':
+ return self.parse_hyperlink(el, parsed)
+ elif el.tag in ('pict', 'drawing'):
+ return self.parse_image(el)
else:
return parsed
+ def parse_page_break(self, el, text):
+ #TODO figure out what parsed is getting overwritten
+ return self.page_break()
+
+ def parse_table(self, el, text):
+ return self.table(text)
+
+ def parse_table_row(self, el, text):
+ return self.table_row(text)
+
+ def parse_table_cell(self, el, text):
+ v_merge = find_first(el, 'vMerge')
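+        # A vMerge whose val is not 'restart' (including a missing val, which
+        # Word treats as a continue) marks the continuation of a vertical
+        # merge, so render it as an empty cell.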
+ if v_merge is not None and (
+ 'restart' != v_merge.get('val', '')):
+ return self.empty_cell()
+ colspan = self.get_colspan(el)
+ rowspan = self._get_rowspan(el, v_merge)
+ if rowspan > 1:
+ rowspan = str(rowspan)
+ else:
+ rowspan = ''
+ return self.table_cell(
+ text, colspan, rowspan, self.pre_processor.is_last_row_item(el),
+ has_descendant_with_tag(el, 'ilvl'))
+
+ def parse_list(self, el, text):
+        """
+        All the meat of building the list is done in _parse_list; however, we
+        call this method for two reasons: it follows the naming convention we
+        are using, and we need a reliable way to raise and lower the
+        list_depth (which is used to determine whether we are in a list). I
+        could have done this in _parse_list, however it seemed cleaner to do
+        it here.
+        """
+ self.list_depth += 1
+ parsed = self._parse_list(el, text)
+ self.list_depth -= 1
+ if self.pre_processor.is_in_table(el):
+ return self.parse_table_cell_contents(el, parsed)
+ return parsed
+
+ def get_list_style(self, num_id, ilvl):
+ return get_list_style(self.numbering_root, num_id, ilvl)
+
+ def _build_list(self, el, text):
+ # Get the list style for the pending list.
+ lst_style = self.get_list_style(
+ self.pre_processor.num_id(el).num_id,
+ self.pre_processor.ilvl(el),
+ )
+
+ parsed = text
+ # Create the actual list and return it.
+ if lst_style == 'bullet':
+ return self.unordered_list(parsed)
+ else:
+ return self.ordered_list(
+ parsed,
+ lst_style,
+ )
+
+ def _parse_list(self, el, text):
+ parsed = self.parse_list_item(el, text)
+ num_id = self.pre_processor.num_id(el)
+ ilvl = self.pre_processor.ilvl(el)
+ # Everything after this point assumes the first element is not also the
+ # last. If the first element is also the last then early return by
+ # building and returning the completed list.
+ if self.pre_processor.is_last_list_item_in_root(el):
+ return self._build_list(el, parsed)
+ next_el = self.pre_processor.next(el)
+
+ def is_same_list(next_el, num_id, ilvl):
+ # Bail if next_el is not an element
+ if next_el is None:
+ return False
+ if self.pre_processor.is_last_list_item_in_root(next_el):
+ return False
+ # If next_el is not a list item then roll it into the list by
+ # returning True.
+ if not self.pre_processor.is_list_item(next_el):
+ return True
+ if self.pre_processor.num_id(next_el) != num_id:
+ # The next element is a new list entirely
+ return False
+ if self.pre_processor.ilvl(next_el) < ilvl:
+ # The next element is de-indented, so this is really the last
+ # element in the list
+ return False
+ return True
+
+ while is_same_list(next_el, num_id, ilvl):
+ if next_el in self.visited:
+ # Early continue for elements we have already visited.
+ next_el = self.pre_processor.next(next_el)
+ continue
+
+ if self.pre_processor.is_list_item(next_el):
+ # Reset the ilvl
+ ilvl = self.pre_processor.ilvl(next_el)
+
+ parsed += self.parse(next_el)
+ next_el = self.pre_processor.next(next_el)
+
+ def should_parse_last_el(last_el, first_el):
+ if last_el is None:
+ return False
+ # Different list
+ if (
+ self.pre_processor.num_id(last_el) !=
+ self.pre_processor.num_id(first_el)):
+ return False
+ # Will be handled when the ilvls do match (nesting issue)
+ if (
+ self.pre_processor.ilvl(last_el) !=
+ self.pre_processor.ilvl(first_el)):
+ return False
+ # We only care about last items that have not been
+ # parsed before (first list items are
+ # always parsed at the beginning of this method.)
+ return (
+ not self.pre_processor.is_first_list_item(last_el) and
+ self.pre_processor.is_last_list_item_in_root(last_el)
+ )
+ if should_parse_last_el(next_el, el):
+ parsed += self.parse(next_el)
+
+ # If the list has no content, then we don't need to worry about the
+ # list styling, because it will be stripped out.
+ if parsed == '':
+ return parsed
+
+ return self._build_list(el, parsed)
+
+ def justification(self, el, text):
+ paragraph_tag_property = el.find('pPr')
+ if paragraph_tag_property is None:
+ return text
+
+ _justification = paragraph_tag_property.find('jc')
+ indentation = paragraph_tag_property.find('ind')
+ if _justification is None and indentation is None:
+ return text
+ alignment = None
+ right = None
+ left = None
+ firstLine = None
+ if _justification is not None: # text alignments
+ value = _justification.attrib['val']
+ if value in [JUSTIFY_LEFT, JUSTIFY_CENTER, JUSTIFY_RIGHT]:
+ alignment = value
+ if indentation is not None:
+ if INDENTATION_RIGHT in indentation.attrib:
+ right = indentation.attrib[INDENTATION_RIGHT]
+ # divide by 20 to get to pt. multiply by (4/3) to get to px
+ right = (int(right) / 20) * float(4) / float(3)
+ right = str(right)
+ if INDENTATION_LEFT in indentation.attrib:
+ left = indentation.attrib[INDENTATION_LEFT]
+ left = (int(left) / 20) * float(4) / float(3)
+ left = str(left)
+ if INDENTATION_FIRST_LINE in indentation.attrib:
+ firstLine = indentation.attrib[INDENTATION_FIRST_LINE]
+ firstLine = (int(firstLine) / 20) * float(4) / float(3)
+ firstLine = str(firstLine)
+ if any([alignment, firstLine, left, right]):
+ return self.indent(
+ text, alignment, firstLine,
+ left, right, self.pre_processor.is_in_table(el))
+ return text
+
def parse_p(self, el, text):
+ if text == '':
+ return ''
+ # TODO This is still not correct, however it fixes the bug. We need to
+ # apply the classes/styles on p, td, li and h tags instead of inline,
+ # but that is for another ticket.
+ text = self.justification(el, text)
+ if self.pre_processor.is_first_list_item(el):
+ return self.parse_list(el, text)
+ if self.pre_processor.heading_level(el):
+ return self.parse_heading(el, text)
+ if self.pre_processor.is_list_item(el):
+ return self.parse_list_item(el, text)
+ if self.pre_processor.is_in_table(el):
+ return self.parse_table_cell_contents(el, text)
parsed = text
- if self.in_list:
- self.in_list = False
- parsed = self.list_element(parsed)
- elif (
- not el.has_child_all('t') and
- 'tbl' not in [i.tag for i in el.parent_list]):
- parsed = self.linebreak()
- elif el.parent not in self.elements:
+ # No p tags in li tags
+ if self.list_depth == 0:
parsed = self.paragraph(parsed)
return parsed
- def parse_r(self, el):
- is_deleted = False
- text = None
- if el.has_child('t'):
- text = self.escape(el.find('t').text)
- elif el.has_child('delText'):
- text = self.escape(el.find('delText').text)
- is_deleted = True
- if text:
- rpr = el.find('rPr')
- if rpr is not None:
- fns = []
- if rpr.has_child('b'):
- fns.append(self.bold)
- if rpr.has_child('i'):
- fns.append(self.italics)
- if rpr.has_child('u'):
- fns.append(self.underline)
- for fn in fns:
- text = fn(text)
- ppr = el.parent.find('pPr')
- if ppr is not None:
- jc = ppr.find('jc')
- if jc is not None:
- if jc.attrib['val'] == 'right':
- text = self.right_justify(text)
- if jc.attrib['val'] == 'center':
- text = self.center_justify(text)
- ind = ppr.find('ind')
- if ind is not None:
- right = None
- left = None
- firstLine = None
- if 'right' in ind.attrib:
- right = ind.attrib['right']
- right = int(right)/20
- right = str(right)
- if 'left' in ind.attrib:
- left = ind.attrib['left']
- left = int(left)/20
- left = str(left)
- if 'firstLine' in ind.attrib:
- firstLine = ind.attrib['firstLine']
- firstLine = int(firstLine)/20
- firstLine = str(firstLine)
- text = self.indent(text, right, left, firstLine)
- if is_deleted:
- text = self.deletion(text, '', '')
- return text
- else:
+ def _should_append_break_tag(self, next_el):
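+        # Decide whether a break tag should be inserted between the previous
+        # element and next_el when consecutive paragraph-like elements are
+        # merged into a single list item or table cell.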
+ paragraph_like_tags = [
+ 'p',
+ ]
+ inline_like_tags = [
+ 'smartTag',
+ 'ins',
+ 'delText',
+ ]
+ if self.pre_processor.is_list_item(next_el):
+ return False
+ if self.pre_processor.previous(next_el) is None:
+ return False
+ tag_is_inline_like = any(
+ has_descendant_with_tag(next_el, tag) for
+ tag in inline_like_tags
+ )
+ if tag_is_inline_like:
+ return False
+ if (
+ self.pre_processor.is_last_list_item_in_root(
+ self.pre_processor.previous(next_el))):
+ return False
+ if self.pre_processor.previous(next_el).tag not in paragraph_like_tags:
+ return False
+ if next_el.tag not in paragraph_like_tags:
+ return False
+ return True
+
+ def parse_heading(self, el, parsed):
+ return self.heading(parsed, self.pre_processor.heading_level(el))
+
+ def parse_list_item(self, el, text):
+ # If for whatever reason we are not currently in a list, then start
+ # a list here. This will only happen if the num_id/ilvl combinations
+ # between lists is not well formed.
+ parsed = text
+ if self.list_depth == 0:
+ return self.parse_list(el, parsed)
+
+ def _should_parse_next_as_content(el):
+ """
+ Get the contents of the next el and append it to the
+ contents of the current el (that way things like tables
+ are actually in the li tag instead of in the ol/ul tag).
+ """
+ next_el = self.pre_processor.next(el)
+ if next_el is None:
+ return False
+ if (
+ not self.pre_processor.is_list_item(next_el) and
+ not self.pre_processor.is_last_list_item_in_root(el)
+ ):
+ return True
+ if self.pre_processor.is_first_list_item(next_el):
+ if (
+ self.pre_processor.num_id(next_el) ==
+ self.pre_processor.num_id(el)):
+ return True
+ return False
+
+ while el is not None:
+ if _should_parse_next_as_content(el):
+ el = self.pre_processor.next(el)
+ next_elements_content = self.parse(el)
+ if not next_elements_content:
+ continue
+ if self._should_append_break_tag(el):
+ parsed += self.break_tag(
+ self.pre_processor.is_in_table(el))
+ parsed += next_elements_content
+ else:
+ break
+ # Create the actual li element
+ return self.list_element(parsed)
+
+ def _get_rowspan(self, el, v_merge):
+ current_row = self.pre_processor.row_index(el)
+ current_col = self.pre_processor.column_index(el)
+ rowspan = 1
+ result = ''
+ tbl = find_ancestor_with_tag(self.pre_processor, el, 'tbl')
+        # We only want table cells whose row_index is greater than or equal
+        # to the current_row and that are in the current_col
+ if tbl is None:
+ return ''
+ tcs = [
+ tc for tc in find_all(tbl, 'tc')
+ if self.pre_processor.row_index(tc) >= current_row and
+ self.pre_processor.column_index(tc) == current_col
+ ]
+ restart_in_v_merge = False
+ if v_merge is not None and 'val' in v_merge.attrib:
+ restart_in_v_merge = 'restart' in v_merge.attrib['val']
+
+ def increment_rowspan(tc):
+ if not restart_in_v_merge:
+ return False
+ if not self.pre_processor.vmerge_continue(tc):
+ return False
+ return True
+
+ for tc in tcs:
+ if increment_rowspan(tc):
+ rowspan += 1
+ else:
+ rowspan = 1
+ if rowspan > 1:
+ result = rowspan
+ return str(result)
+
+ def get_colspan(self, el):
+ grid_span = find_first(el, 'gridSpan')
+ if grid_span is None:
return ''
+ return find_first(el, 'gridSpan').attrib['val']
+
+ def parse_table_cell_contents(self, el, text):
+ parsed = text
+
+ def _should_parse_next_as_content(el):
+ next_el = self.pre_processor.next(el)
+ if next_el is None:
+ return False
+ if self.pre_processor.is_in_table(next_el):
+ return True
+ while el is not None:
+ if _should_parse_next_as_content(el):
+ el = self.pre_processor.next(el)
+ next_elements_content = self.parse(el)
+ if not next_elements_content:
+ continue
+ if self._should_append_break_tag(el):
+ parsed += self.break_tag(
+ self.pre_processor.is_in_table(el))
+ parsed += next_elements_content
+ else:
+ break
+ return parsed
- def get_list_style(self, numval):
- ids = self.numbering_root.findall_all('num')
- for _id in ids:
- if _id.attrib['numId'] == numval:
- abstractid = _id.find('abstractNumId')
- abstractid = abstractid.attrib['val']
- style_information = self.numbering_root.findall_all(
- 'abstractNum',
- )
- for info in style_information:
- if info.attrib['abstractNumId'] == abstractid:
- for i in el_iter(info):
- if i.find('numFmt') is not None:
- return i.find('numFmt').attrib
-
- def get_comments(self, doc_id):
- if self.comment_store is None:
- # TODO throw appropriate error
- comment_root = ElementTree.fromstring(
- remove_namespaces(self.comment_text),
+ def parse_hyperlink(self, el, text):
+ rId = el.get('id')
+ href = self.rels_dict.get(rId)
+ if not href:
+ return text
+ href = self.escape(href)
+ return self.hyperlink(text, href)
+
+ def _get_image_id(self, el):
+ # Drawings
+ blip = find_first(el, 'blip')
+ if blip is not None:
+ # On drawing tags the id is actually whatever is returned from the
+ # embed attribute on the blip tag. Thanks a lot Microsoft.
+ return blip.get('embed')
+ # Picts
+ imagedata = find_first(el, 'imagedata')
+ if imagedata is not None:
+ return imagedata.get('id')
+
+ def _convert_image_size(self, size):
+ return size / EMUS_PER_PIXEL
+
+ def _get_image_size(self, el):
+ """
+ If we can't find a height or width, return 0 for whichever is not
+ found, then rely on the `image` handler to strip those attributes. This
+ functionality can change once we integrate PIL.
+ """
+ sizes = find_first(el, 'ext')
+ if sizes is not None and sizes.get('cx'):
+ if sizes.get('cx'):
+ x = self._convert_image_size(int(sizes.get('cx')))
+ if sizes.get('cy'):
+ y = self._convert_image_size(int(sizes.get('cy')))
+ return (
+ '%dpx' % x,
+ '%dpx' % y,
)
- ids_and_info = {}
- ids = comment_root.findall_all('comment')
- for _id in ids:
- ids_and_info[_id.attrib['id']] = {
- "author": _id.attrib['author'],
- "date": _id.attrib['date'],
- "text": _id.findall_all('t')[0].text,
- }
- self.comment_store = ids_and_info
- return self.comment_store[doc_id]
+ shape = find_first(el, 'shape')
+ if shape is not None and shape.get('style') is not None:
+ # If either of these are not set, rely on the method `image` to not
+ # use either of them.
+ x = 0
+ y = 0
+ styles = shape.get('style').split(';')
+
+ for s in styles:
+ if s.startswith('height:'):
+ y = s.split(':')[1]
+ if s.startswith('width:'):
+ x = s.split(':')[1]
+ return x, y
+ return 0, 0
+
+ def parse_image(self, el):
+ x, y = self._get_image_size(el)
+ rId = self._get_image_id(el)
+ src = self.rels_dict.get(rId)
+ if not src:
+ return ''
+ src = os.path.join(
+ 'word',
+ src,
+ )
+ if src in self._image_data:
+ filename = os.path.split(src)[-1]
+ return self.image(self._image_data[src], filename, x, y)
+ return ''
+
+ def _is_style_on(self, el):
+ """
+ For b, i, u (bold, italics, and underline) merely having the tag is not
+ sufficient. You need to check to make sure it is not set to "false" as
+ well.
+ """
+ return el.get('val') != 'false'
+
+ def parse_t(self, el, parsed):
+ return self.escape(el.text)
+
+ def parse_break_tag(self, el, parsed):
+ return self.break_tag(self.pre_processor.is_in_table(el))
+
+ def parse_deletion(self, el, parsed):
+ return self.deletion(el.text, '', '')
+
+ def parse_insertion(self, el, parsed):
+ return self.insertion(parsed, '', '')
+
+ def parse_r(self, el, parsed):
+ """
+ Parse the running text.
+ """
+ text = parsed
+ if not text:
+ return ''
+ run_tag_property = el.find('rPr')
+
+ def _has_style_on(run_tag_property, tag):
+ el = run_tag_property.find(tag)
+ if el is not None:
+ return self._is_style_on(el)
+ inline_tags = {
+ 'b': self.bold,
+ 'i': self.italics,
+ 'u': self.underline,
+ 'caps': self.caps,
+ 'smallCaps': self.small_caps,
+ 'strike': self.strike,
+ 'dstrike': self.strike,
+ 'vanish': self.hide,
+ 'webHidden': self.hide,
+ }
+ if run_tag_property is not None:
+ for child in run_tag_property:
+ # These tags are a little different, handle them separately
+ # from the rest.
+ # This could be a superscript or a subscript
+ if child.tag == 'vertAlign':
+ if child.attrib['val'] == 'superscript':
+ text = self.superscript(text)
+ elif child.attrib['val'] == 'subscript':
+ text = self.subscript(text)
+ elif child.tag in inline_tags and self._is_style_on(child):
+ text = inline_tags[child.tag](text)
+
+ return text
@property
def parsed(self):
@@ -335,10 +646,26 @@ def linebreak(self):
def paragraph(self, text):
return text
+ @abstractmethod
+ def heading(self, text, heading_level):
+ return text
+
@abstractmethod
def insertion(self, text, author, date):
return text
+ @abstractmethod
+ def hyperlink(self, text, href):
+ return text
+
+ @abstractmethod
+ def image_handler(self, path):
+ return path
+
+ @abstractmethod
+ def image(self, data, filename, x, y):
+ return self.image_handler(data)
+
@abstractmethod
def deletion(self, text, author, date):
return text
@@ -355,6 +682,30 @@ def italics(self, text):
def underline(self, text):
return text
+ @abstractmethod
+ def caps(self, text):
+ return text
+
+ @abstractmethod
+ def small_caps(self, text):
+ return text
+
+ @abstractmethod
+ def strike(self, text):
+ return text
+
+ @abstractmethod
+ def hide(self, text):
+ return text
+
+ @abstractmethod
+ def superscript(self, text):
+ return text
+
+ @abstractmethod
+ def subscript(self, text):
+ return text
+
@abstractmethod
def tab(self):
return True
@@ -388,15 +739,9 @@ def page_break(self):
return True
@abstractmethod
- def right_justify(self, text):
- return text
-
- @abstractmethod
- def center_justify(self, text):
+ def indent(self, text, left='', right='', firstLine=''):
return text
@abstractmethod
- def indent(self, text, left=None, right=None, firstLine=None):
- return text
-
- #TODO JUSTIFIED JUSTIFIED TEXT
+ def empty_cell(self):
+ return ''
diff --git a/pydocx/HtmlConversion.py b/pydocx/HtmlConversion.py
new file mode 100644
index 00000000..cab112f1
--- /dev/null
+++ b/pydocx/HtmlConversion.py
@@ -0,0 +1,394 @@
+import xml.etree.ElementTree as ElementTree
+from xml.etree.ElementTree import _ElementInterface
+from pydocx.py_docx.docx import *
+import py_docx.docx as docx
+
+
+def find_first(self, tag):
+ """
+ Find the first occurrence of a tag beneath the current element.
+ """
+ return self.find('.//' + tag)
+
+
+def find_all(self, tag):
+ """
+ Find all occurrences of a tag
+ """
+ return self.findall('.//' + tag)
+
+
+def has_descendant_with_tag(el, tag):
+    """
+    Determine whether the element has a descendant with the given tag.
+    """
+    # Stop at the first matching descendant.
+ return True if el.find('.//' + tag) is not None else False
+
+
+setattr(_ElementInterface, 'find_first', find_first)
+setattr(_ElementInterface, 'find_all', find_all)
+setattr(_ElementInterface, 'is_first_list_item', False)
+setattr(_ElementInterface, 'is_last_list_item', False)
+setattr(_ElementInterface, 'in_table', False)
+setattr(_ElementInterface, 'has_descendant_with_tag', has_descendant_with_tag)
+setattr(_ElementInterface, 'new_list', False)
+setattr(_ElementInterface, 'new_ilvl', False)
+setattr(_ElementInterface, 'is_first_list', False)
+setattr(_ElementInterface, 'is_last_item_in_list', False)
+
+
+class Html2Docx():
+
+ def __init__(self, html):
+ # set up what is parsed
+ self.parsed = ''
+ with open(html, 'r') as f:
+ html = f.read()
+ # need to keep track of elements
+ # that have been visited
+ self.visited = []
+ self.stored_numId = 0
+ # need to keep track of the
+ # ilvl in the document
+ self.stored_ilvl = 0
+ #abstractId info for the numbering documents
+ self.abstractIdInfo = []
+ #numIds for the numbering document.
+ #these correspond to the abstractIdInfo
+ self.numIds = []
+ #for the numbering document
+ self.abstract = None
+ # set up the html
+ self.html = ElementTree.fromstring(html)
+ # get the relationship list
+ self.relationships = relationshiplist()
+ # make a new document
+ self.document = newdocument()
+ #get the body
+ self.body = self.document.xpath(
+ '/w:document/w:body', namespaces=nsprefixes)[0]
+ #make a new numbering document
+ self.numbering = new_numbering()
+ #start bulding the document
+ self.build()
+
+ def build(self):
+ #first step is to add parent attribute
+ #for the whole document
+ def add_parent(el):
+ for child in el.getchildren():
+ setattr(child, 'parent', el)
+ add_parent(child)
+ add_parent(self.html)
+ #now set the list attributes
+ self.set_list_attributes()
+ #and begin parsing
+ self.parse(self.html.find_first('body'))
+
+ def find_all_by_tags(self, html, *args):
+        #helper function to find all the elements
+        #matching any of the given tags
+ list_elements = []
+ for el in html.iter():
+ if el.tag in args:
+ list_elements.append(el)
+ return list_elements
+
+ def check_for_lst_parent(self, el):
+        #helper function to see if a list element
+        #has an li as an ancestor,
+        #meaning that it is nested inside
+        #another list
+ lst_parent = False
+ if el.parent.tag != 'body':
+ if el.parent.tag == 'li':
+ lst_parent = True
+ #return true if you find a list parent
+ return lst_parent
+            return self.check_for_lst_parent(el.parent)
+ else:
+ return lst_parent
+
+ def set_list_attributes(self):
+ #now we set the list attributes
+ ilvl = 0
+ numId = 0
+ lsts = self.find_all_by_tags(self.html, 'ol', 'ul')
+ for lst in lsts:
+ lst.getchildren()[0].is_first_list_item = True
+ lst.getchildren()[-1].is_last_list_item = True
+ for item in lst.getchildren():
+ #if the element does not have a parent and it is
+ #the last list item, we know it is safe to
+ #increment the numId, meaning there is a new
+ #list
+ if not self.check_for_lst_parent(item.parent):
+ if item.is_last_list_item:
+ numId += 1
+ #has to be true because a new list will
+ # automatically have a new ilvl
+ item.new_ilvl = True
+ item.new_list = True
+ #also have to set the ilvl back to 0
+ ilvl = 0
+ elif item.is_first_list_item and self.check_for_lst_parent(
+ item.parent):
+                    #if a list item has a parent that is a list
+                    #and it's the first item, we must increment the
+                    #indentation level (ilvl)
+ item.new_ilvl = True
+ ilvl += 1
+ item.ilvl = ilvl
+ item.num_id = numId
+ item.is_list_item = True
+
+ def parse(self, el):
+ for child in el.getchildren():
+ if child.tag == 'br':
+ #if we find a break tag, look for text after it
+ text_and_style = self.parse_r(child)[0]
+ just = self.parse_r(child)[1]
+ self.body.append(paragraph(text_and_style, jc=just))
+ if child.tag == 'p':
+ #if we find a p tag, look for text after it
+ text_and_style = self.parse_r(child)[0]
+ just = self.parse_r(child)[1]
+ self.body.append(paragraph(text_and_style, jc=just))
+ if child.tag == 'ul' or child.tag == 'ol':
+ #if we find a list, look for text after it
+ lst_type = child.tag
+ self.parse_list(child, lst_type)
+ if child.tag == 'table':
+ #separate function for parsing tables
+ #because in word, the table tags are the parent
+ #of the p tag, so we have to handle
+ #them a bit differently
+ self.body.append(self.parse_table(child))
+ self.parse(child)
+ self.save()
+
+ def parse_r(self, el):
+        # we have to get the whole block of
+        # text that will go in a paragraph
+ par_block = []
+ # we have to get the breaks that
+ # will go in the paragraph
+ breaks = []
+        #we need this to create a string of the styles
+ #i.e., bold, italic, underline
+ style = ''
+ just = 'left'
+ for child in el.iter():
+ text = ''
+ if child.tag == 'div':
+ #look for what the justification is
+ if 'center' in child.attrib['class']:
+ just = 'center'
+ elif 'right' in child.attrib['class']:
+ just = 'right'
+ if child.tag == 'em':
+ #if there's an em tag,
+ #add italic to style
+ style += 'i'
+ if child.tag == 'strong':
+ #if there's a strong tag,
+ #add bold to style
+ style += 'b'
+ if child.tag == 'underline':
+ #if there's an underline tag,
+ #add underline to style
+ style += 'u'
+ if child.text:
+ #get the text
+ text = child.text
+ if child.tag == 'br' and child not in self.visited:
+ #dont want to hit breaks twice
+ #text of break comes at the tail
+ text = child.tail
+ breaks.append('br')
+ self.visited.append(child)
+ if text:
+ #if text, add everything to the parblock
+ #set the style back to blank
+ par_block.append([text, style, breaks])
+ style = ''
+ if child.parent and child.parent.tag == 'li':
+ #if it has a list parent, return early
+ return par_block, just
+ return par_block, just
+
+ def parse_list(self, lst, lst_type=''):
+ tentatives = None
+        """
+        When parsing lists, we need to keep track of both the list itself
+        and, as we go, build up the numbering document. For some reason,
+        there are two sections in a Word numbering document: an abstract
+        numbering section that contains all of the relevant list info, as
+        well as a num section that contains references to the abstract
+        numbers defined earlier in the numbering file.
+        """
+ for child in lst.getchildren():
+ if child not in self.visited:
+                #first append the element to
+                #the visited elements
+ self.visited.append(child)
+ #get the text and style of this child
+ text_and_style = self.parse_r(child)[0]
+                #get the justification of the element
+ just = self.parse_r(child)[1]
+ #if its an ol, then its a decimal list
+ if lst_type == 'ol':
+ type_lst = 'decimal'
+ #if its a ul, then its a bulleted list
+ if lst_type == 'ul':
+ type_lst = 'bullet'
+ if child.new_ilvl:
+ #if theres a new ilvl, increase
+ #the indentation
+ ind = 720 * (child.ilvl + 1)
+ #create a numId attribute for the list, this
+ #is for the numbering document,
+ num = create_list_attributes(
+ ilvl=str(child.ilvl),
+ type=type_lst, just=just, left=str(ind))
+ #append that numId to the lists of
+ #all the numIds
+ #we will later append this info to the
+ #abstract id section of the numbering document
+ self.numIds.append(num)
+ self.stored_ilvl += 1
+ if not child.find('ol') and not child.find('ul'):
+ tentatives = fill_tentative(
+ self.stored_ilvl, type_lst=type_lst)
+ #if we cant find another list, we know its the
+ #last item and it's ok to fill out the rest of the
+ #abstract num info
+
+                    #the abstractnumid gets increased
+                    #for every list; it starts out at 0
+ self.abstract = create_list(child.num_id - 1)
+ self.numbering.append(self.abstract)
+ #here is where we append to the abstract num section
+ for num in self.numIds:
+ self.abstract.append(num)
+ #now we have to create tentative lists. the way that
+ #word is able to nicely do indent to create new lists
+ #is by creating tentative lists that start past the
+ #last indent. it goes all the way up to 8, because that's
+ #all that will fit in the width of the file.
+ for tentative in tentatives:
+ self.abstract.append(tentative)
+ #now we have our abstract id info, and we have to append to
+ #it the current num_id
+ self.abstractIdInfo.append(
+ create_abstract_IdInfo(str(child.num_id)))
+ #we're done here, so we can set our stored_ilvl back to 0
+ self.stored_ilvl = 0
+                    #and we can reset our numIds
+ self.numIds = []
+                #now we append the relevant list info to the body
+ self.body.append(
+ paragraph(
+ text_and_style, is_list=True,
+ ilvl=str(child.ilvl), numId=str(child.num_id),
+ style=lst_type, jc=just))
+                #if, from the current list element, we find another list,
+                # we have to parse that list BEFORE we parse the next list
+                # item in the current list
+ if child.find('ul'):
+ lst = child.find('ul')
+ self.parse_list(lst, lst.tag)
+ if child.find('ol'):
+ lst = child.find('ol')
+ self.parse_list(lst, lst.tag)
+
+ def table_look_ahead(self, tbl):
+        #table look ahead function,
+        #we need to do this to account for vertical merges. in html
+        #all you need to do is include the rowspan and not include any
+        #extra table elements. word, on the other hand, expects an
+        #empty table cell with a vmerge attribute inside it. so we're
+        #going to go through and create these elements and insert them
+        #into the html document
+ trs = tbl.find_all('tr')
+ for i in range(len(trs)):
+ tcs = trs[i].find_all('td')
+ for j in range(len(tcs)):
+ if 'rowspan' in tcs[j].attrib:
+ for x in range(1, int(tcs[j].attrib['rowspan'])):
+ tc = ElementTree.Element('td')
+ setattr(tc, 'parent', trs[i+x])
+ tc.set('vmerge_continue', True)
+ trs[i + x].insert(j, tc)
+ return tbl
+
+ def get_columns(self, tbl):
+ #have to get the total number of columns
+ #for the table. just go by the first row
+ #but if there is a colspan, add that to the
+ #column count
+ columns = 0
+ trs = tbl.find_all('tr')
+ tcs = trs[0].find_all('td')
+ for tc in tcs:
+ tc.in_table = True
+ if 'colspan' in tc.attrib:
+ columns += int(tc.attrib['colspan'])
+ else:
+ columns += 1
+ return columns
+
+ def parse_table(self, el):
+ #get the number of columns
+ columns = self.get_columns(el)
+ #set up the table properties
+ tbl = createtblproperties(columns)
+ #going to have to do a look ahead and
+ #create those extra table rows
+ for tr in self.table_look_ahead(el).getchildren():
+ table_row = createtablerow()
+ tcs = tr.find_all('td')
+ for tc in tcs:
+ colspan = ''
+ vmerge = {}
+                #now look for colspans
+                #and rowspans (a rowspan is referenced by the
+                #total number of vmerge cells starting from
+                #a vmerge:restart)
+ if 'colspan' in tc.attrib:
+ colspan = tc.attrib['colspan']
+ if 'rowspan' in tc.attrib:
+ vmerge = {'val': 'restart'}
+ if 'vmerge_continue' in tc.attrib:
+ vmerge = {'val': 'continue'}
+ cell = createtablecell(gridspan=colspan, vmerge=vmerge)
+ text_and_style = self.parse_r(tc)[0]
+ just = self.parse_r(tc)[1]
+ par_run = paragraph(text_and_style, jc=just)
+ cell.append(par_run)
+ table_row.append(cell)
+ tbl.append(table_row)
+ return tbl
+
+ def save(self):
+ title = 'Python docx demo'
+ subject = 'A practical example of making docx from Python'
+ creator = 'Mike MacCana'
+ keywords = ['python', 'Office Open XML', 'Word']
+ for abstract in self.abstractIdInfo:
+ self.numbering.append(abstract)
+ coreprops = coreproperties(
+ title=title, subject=subject,
+ creator=creator, keywords=keywords)
+ appprops = appproperties()
+ contenttypes = docx.contenttypes()
+ websettings = docx.websettings()
+ wordrelationships = docx.wordrelationships(self.relationships)
+ # Save our document
+ savedocx(
+ self.document, coreprops,
+ appprops, contenttypes, websettings,
+ wordrelationships, 'Testing.docx', self.numbering)
diff --git a/pydocx/__init__.py b/pydocx/__init__.py
index 9b42e00f..07833131 100644
--- a/pydocx/__init__.py
+++ b/pydocx/__init__.py
@@ -1,8 +1,19 @@
-from .parsers import *
+from .parsers import Docx2LaTex, Docx2Html, Docx2Markdown
+from .HtmlConversion import Html2Docx
+
def docx2html(path):
return Docx2Html(path).parsed
+
def docx2markdown(path):
return Docx2Markdown(path).parsed
+
+def docx2latex(path):
+ return Docx2LaTex(path).parsed
+
+def html2docx(path):
+ return Html2Docx(path).parsed
+
+VERSION = '0.3.1'
diff --git a/pydocx/fixtures/all_configured_styles.docx b/pydocx/fixtures/all_configured_styles.docx
new file mode 100644
index 00000000..8f514372
Binary files /dev/null and b/pydocx/fixtures/all_configured_styles.docx differ
diff --git a/pydocx/fixtures/attachment_is_tiff.docx b/pydocx/fixtures/attachment_is_tiff.docx
new file mode 100644
index 00000000..774362ca
Binary files /dev/null and b/pydocx/fixtures/attachment_is_tiff.docx differ
diff --git a/pydocx/fixtures/bigger_font_size_to_header.docx b/pydocx/fixtures/bigger_font_size_to_header.docx
new file mode 100644
index 00000000..c722888b
Binary files /dev/null and b/pydocx/fixtures/bigger_font_size_to_header.docx differ
diff --git a/pydocx/fixtures/convert_p_to_h.docx b/pydocx/fixtures/convert_p_to_h.docx
new file mode 100644
index 00000000..53769e15
Binary files /dev/null and b/pydocx/fixtures/convert_p_to_h.docx differ
diff --git a/pydocx/fixtures/fake_headings_by_length.docx b/pydocx/fixtures/fake_headings_by_length.docx
new file mode 100644
index 00000000..a130f5ba
Binary files /dev/null and b/pydocx/fixtures/fake_headings_by_length.docx differ
diff --git a/pydocx/fixtures/greek_alphabet.docx b/pydocx/fixtures/greek_alphabet.docx
new file mode 100644
index 00000000..46ab5429
Binary files /dev/null and b/pydocx/fixtures/greek_alphabet.docx differ
diff --git a/pydocx/fixtures/has_image.docx b/pydocx/fixtures/has_image.docx
new file mode 100644
index 00000000..2ebd0bd0
Binary files /dev/null and b/pydocx/fixtures/has_image.docx differ
diff --git a/pydocx/fixtures/has_missing_image.docx b/pydocx/fixtures/has_missing_image.docx
new file mode 100644
index 00000000..996e6671
Binary files /dev/null and b/pydocx/fixtures/has_missing_image.docx differ
diff --git a/pydocx/fixtures/has_title.docx b/pydocx/fixtures/has_title.docx
new file mode 100644
index 00000000..a87d88ed
Binary files /dev/null and b/pydocx/fixtures/has_title.docx differ
diff --git a/pydocx/fixtures/header_footer_problem.docx b/pydocx/fixtures/header_footer_problem.docx
new file mode 100644
index 00000000..6bc49a7a
Binary files /dev/null and b/pydocx/fixtures/header_footer_problem.docx differ
diff --git a/pydocx/fixtures/headers.docx b/pydocx/fixtures/headers.docx
new file mode 100644
index 00000000..890104c7
Binary files /dev/null and b/pydocx/fixtures/headers.docx differ
diff --git a/pydocx/fixtures/headers_with_full_line_styles.docx b/pydocx/fixtures/headers_with_full_line_styles.docx
new file mode 100644
index 00000000..38d6f6a8
Binary files /dev/null and b/pydocx/fixtures/headers_with_full_line_styles.docx differ
diff --git a/pydocx/fixtures/inline_tags.docx b/pydocx/fixtures/inline_tags.docx
new file mode 100644
index 00000000..4aba2347
Binary files /dev/null and b/pydocx/fixtures/inline_tags.docx differ
diff --git a/pydocx/fixtures/justification.docx b/pydocx/fixtures/justification.docx
new file mode 100644
index 00000000..7f8a3bf1
Binary files /dev/null and b/pydocx/fixtures/justification.docx differ
diff --git a/pydocx/fixtures/list_in_table.docx b/pydocx/fixtures/list_in_table.docx
new file mode 100644
index 00000000..d1a87388
Binary files /dev/null and b/pydocx/fixtures/list_in_table.docx differ
diff --git a/pydocx/fixtures/list_to_header.docx b/pydocx/fixtures/list_to_header.docx
new file mode 100644
index 00000000..f9b3946e
Binary files /dev/null and b/pydocx/fixtures/list_to_header.docx differ
diff --git a/pydocx/fixtures/lists_with_styles.docx b/pydocx/fixtures/lists_with_styles.docx
new file mode 100644
index 00000000..c1c7ecf8
Binary files /dev/null and b/pydocx/fixtures/lists_with_styles.docx differ
diff --git a/pydocx/fixtures/localDpi.docx b/pydocx/fixtures/localDpi.docx
new file mode 100644
index 00000000..0f6d7f77
Binary files /dev/null and b/pydocx/fixtures/localDpi.docx differ
diff --git a/pydocx/fixtures/missing_content.docx b/pydocx/fixtures/missing_content.docx
new file mode 100644
index 00000000..21bed964
Binary files /dev/null and b/pydocx/fixtures/missing_content.docx differ
diff --git a/pydocx/fixtures/nested_lists.docx b/pydocx/fixtures/nested_lists.docx
new file mode 100644
index 00000000..f4000dfa
Binary files /dev/null and b/pydocx/fixtures/nested_lists.docx differ
diff --git a/pydocx/fixtures/nested_table_rowspan.docx b/pydocx/fixtures/nested_table_rowspan.docx
new file mode 100644
index 00000000..b43b8a0d
Binary files /dev/null and b/pydocx/fixtures/nested_table_rowspan.docx differ
diff --git a/pydocx/fixtures/nested_tables.docx b/pydocx/fixtures/nested_tables.docx
new file mode 100644
index 00000000..af704d4d
Binary files /dev/null and b/pydocx/fixtures/nested_tables.docx differ
diff --git a/pydocx/fixtures/resized_image.docx b/pydocx/fixtures/resized_image.docx
new file mode 100644
index 00000000..913099c4
Binary files /dev/null and b/pydocx/fixtures/resized_image.docx differ
diff --git a/pydocx/fixtures/shift_enter.docx b/pydocx/fixtures/shift_enter.docx
new file mode 100644
index 00000000..4128c0a2
Binary files /dev/null and b/pydocx/fixtures/shift_enter.docx differ
diff --git a/pydocx/fixtures/simple.docx b/pydocx/fixtures/simple.docx
new file mode 100644
index 00000000..1d2a1c23
Binary files /dev/null and b/pydocx/fixtures/simple.docx differ
diff --git a/pydocx/fixtures/simple_lists.docx b/pydocx/fixtures/simple_lists.docx
new file mode 100644
index 00000000..c09ad744
Binary files /dev/null and b/pydocx/fixtures/simple_lists.docx differ
diff --git a/pydocx/fixtures/simple_table.docx b/pydocx/fixtures/simple_table.docx
new file mode 100644
index 00000000..26de483c
Binary files /dev/null and b/pydocx/fixtures/simple_table.docx differ
diff --git a/pydocx/fixtures/special_chars.docx b/pydocx/fixtures/special_chars.docx
new file mode 100644
index 00000000..b4b9287f
Binary files /dev/null and b/pydocx/fixtures/special_chars.docx differ
diff --git a/pydocx/fixtures/split_header.docx b/pydocx/fixtures/split_header.docx
new file mode 100644
index 00000000..cc4bd5cf
Binary files /dev/null and b/pydocx/fixtures/split_header.docx differ
diff --git a/pydocx/fixtures/super_and_subscript.docx b/pydocx/fixtures/super_and_subscript.docx
new file mode 100644
index 00000000..06ea2d7a
Binary files /dev/null and b/pydocx/fixtures/super_and_subscript.docx differ
diff --git a/pydocx/fixtures/table_col_row_span.docx b/pydocx/fixtures/table_col_row_span.docx
new file mode 100644
index 00000000..856abfdf
Binary files /dev/null and b/pydocx/fixtures/table_col_row_span.docx differ
diff --git a/pydocx/fixtures/tables_in_lists.docx b/pydocx/fixtures/tables_in_lists.docx
new file mode 100644
index 00000000..11859541
Binary files /dev/null and b/pydocx/fixtures/tables_in_lists.docx differ
diff --git a/pydocx/fixtures/track_changes_on.docx b/pydocx/fixtures/track_changes_on.docx
new file mode 100644
index 00000000..dcb7ba1c
Binary files /dev/null and b/pydocx/fixtures/track_changes_on.docx differ
diff --git a/pydocx/fixtures/upper_alpha_all_bold.docx b/pydocx/fixtures/upper_alpha_all_bold.docx
new file mode 100644
index 00000000..d518b2c5
Binary files /dev/null and b/pydocx/fixtures/upper_alpha_all_bold.docx differ
diff --git a/pydocx/lxmlparser.py b/pydocx/lxmlparser.py
deleted file mode 100644
index 94b130d3..00000000
--- a/pydocx/lxmlparser.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import zipfile
-from lxml import etree
-from StringIO import StringIO
-__author__ = 'samportnow'
-
-#for el in tree.iter():
- # The way lists are handled could double visit certain elements; keep
- # track of which elements have been visited and skip any that have been
- # visited already.
- #if el in visited_nodes:
- #continue
-with zipfile.ZipFile('/Users/samportnow/Documents/pydocx/helloworld.docx') as f:
- document = f.read('word/document.xml')
- numbering= f.read('word/numbering.xml')
-parser=etree.XMLParser(ns_clean=True)
-document=StringIO(document)
-numbering=StringIO(numbering)
-numbering_tree=etree.parse(numbering,parser)
-numbering_namespace=numbering_tree.getroot().nsmap['w']
-visited_els=[]
-
-def get_parsed():
- parser=etree.XMLParser(ns_clean=True)
- tree=etree.parse(document,parser)
- namespace=tree.getroot().nsmap['w']
- #rpr is run properties for the paragraph mark
- paragraph=''
- run_text=''
- running_text=''
- for el in tree.iter():
- if el.tag=='{%s}p' %namespace:
- for wp in el.iter():
- if wp.tag =='{%s}ins' %namespace:
- for text in wp.iterchildren():
- if text not in visited_els:
- run_text +=''+get_text(text,namespace,visited_els)+'
'
- visited_els.append(text)
- if wp.tag=='{%s}r' %namespace and wp not in visited_els:
- run_text+=get_text(wp,namespace,visited_els)
- visited_els.append(wp)
- if not el.getchildren():
- run_text+='
'
- if wp.tag == '{%s}ilvl' %namespace:
- for lst in el.iter():
- if lst.find('{%s}numId' %namespace) is not None and el not in visited_els:
- numval = lst.find('{%s}numId' %namespace).attrib['{%s}val' %namespace]
- lst_type=get_list_style(numval)
- if get_text(lst,namespace,visited_els) and el not in visited_els and lst_type['{%s}val' %namespace] != 'bullet':
- if lst.getnext() is not None:
- if lst not in visited_els:
- while lst.getnext() is not None:
- if lst not in visited_els:
- text = get_text(lst,namespace,visited_els)
- next_txt = get_text(lst.getnext(),namespace,visited_els)
- running_text += text + next_txt
- visited_els.append(lst)
- visited_els.append(lst.getnext())
- lst=lst.getnext()
- else:
- run_text += '' + running_text + ''
- break
- else:
- run_text +='' + get_text(lst, namespace, visited_els) + ''
- visited_els.append(lst)
- print running_text
- return run_text
-
-
-def get_text(wp,namespace,visited_els):
- run_text= ''
- decorator = ''
- closing = ''
- if wp.find('{%s}tab' %namespace) is not None:
- run_text+='%nbsp'
- if wp.find('{%s}rPr' %namespace) is not None:
- for tag in wp.iter():
- if tag.find('{%s}u' %namespace) is not None:
- if wp.find('{%s}t' %namespace) is not None:
- decorator +=''
- closing += ''
- visited_els.append(wp.find('{%s}t' %namespace))
- if tag.find('{%s}i' %namespace) is not None:
- if wp.find('{%s}t' %namespace) is not None:
- decorator += ''
- closing += ''
- visited_els.append(wp.find('{%s}t' %namespace))
- if tag.find('{%s}b' %namespace) is not None:
- if wp.find('{%s}t' %namespace) is not None:
- decorator += ''
- closing += ''
- visited_els.append(wp.find('{%s}t' %namespace))
- run_text = wp.find('{%s}t' %namespace).text
- run_text = decorator + run_text + closing
- if wp.find('{%s}t' %namespace) is not None and wp.find('{%s}t' %namespace) not in visited_els:
- run_text+=wp.find('{%s}t' %namespace).text
- return run_text
-
-def get_list_style(numval):
- ids = numbering_tree.findall('{%s}num' %numbering_namespace)
- for id in ids:
- if id.attrib['{%s}numId' %numbering_namespace] == numval:
- abstractid=id.find('{%s}abstractNumId' %numbering_namespace)
- abstractid=abstractid.attrib['{%s}val' %numbering_namespace]
- style_information=numbering_tree.findall('{%s}abstractNum' %numbering_namespace)
- for info in style_information:
- if info.attrib['{%s}abstractNumId' %numbering_namespace] == abstractid:
- for i in info.iter():
- if i.find('{%s}numFmt' %numbering_namespace) is not None:
- return i.find('{%s}numFmt' %numbering_namespace).attrib
-
-print get_parsed()
diff --git a/pydocx/parsers/Docx2Html.py b/pydocx/parsers/Docx2Html.py
index bfaad2a6..c829e33d 100644
--- a/pydocx/parsers/Docx2Html.py
+++ b/pydocx/parsers/Docx2Html.py
@@ -1,21 +1,46 @@
-from pydocx.DocxParser import DocxParser
-
+import base64
import xml.sax.saxutils
+from pydocx.DocxParser import DocxParser
+
class Docx2Html(DocxParser):
@property
def parsed(self):
- self._parsed = self._parsed.replace('', '
')
- self._parsed = self._parsed.replace('
', '
')
- self._parsed = self._parsed.replace('
', '')
- return (
- '{content}'
- ).format(content=self._parsed)
+ content = self._parsed
+ content = "%(head)s%(content)s" % {
+ 'head': self.head(),
+ 'content': content,
+ }
+ return unicode(content)
+
+ def head(self):
+ return "%(style)s" % {
+ 'style': self.style(),
+ }
+
+ def style(self):
+ result = (
+ ''
+ ) % {
+ # multiply by (4/3) to get to px
+ 'width': (self.page_width * (4 / 3.0)),
+ }
+ return result
def escape(self, text):
return xml.sax.saxutils.quoteattr(text)[1:-1]
@@ -26,35 +51,106 @@ def linebreak(self, pre=None):
def paragraph(self, text, pre=None):
return '' + text + '
'
+ def heading(self, text, heading_value):
+ return '<%(tag)s>%(text)s%(tag)s>' % {
+ 'tag': heading_value,
+ 'text': text,
+ }
+
def insertion(self, text, author, date):
return (
- "{text}"
- ).format(author=author, date=date, text=text)
+ "%(text)s"
+ ) % {
+ 'author': author,
+ 'date': date,
+ 'text': text,
+ }
+
+ def hyperlink(self, text, href):
+ if text == '':
+ return ''
+ return '%(text)s' % {
+ 'href': href,
+ 'text': text,
+ }
+
+ def image_handler(self, image_data, filename):
+ extension = filename.split('.')[-1].lower()
+ b64_encoded_src = 'data:image/%s;base64,%s' % (
+ extension,
+ base64.b64encode(image_data),
+ )
+ b64_encoded_src = self.escape(b64_encoded_src)
+ return b64_encoded_src
+
+ def image(self, image_data, filename, x, y):
+ src = self.image_handler(image_data, filename)
+ if not src:
+ return ''
+ if all([x, y]):
+ return '
' % (
+ src,
+ y,
+ x,
+ )
+ else:
+ return '
' % src
def deletion(self, text, author, date):
return (
- "{text}"
- ).format(author=author, date=date, text=text)
+ "%(text)s"
+ ) % {
+ 'author': author,
+ 'date': date,
+ 'text': text,
+ }
def list_element(self, text):
- return "- {text}
".format(text=text)
+ return "- %(text)s
" % {
+ 'text': text,
+ }
- def ordered_list(self, text):
- return "{text}
".format(text=text)
+ def ordered_list(self, text, list_style):
+ return '%(text)s
' % {
+ 'text': text,
+ 'list_style': list_style,
+ }
def unordered_list(self, text):
- return "".format(text=text)
+ return "" % {
+ 'text': text,
+ }
def bold(self, text):
- return '' + text + ''
+ return '' + text + ''
def italics(self, text):
- return '' + text + ''
+ return '' + text + ''
def underline(self, text):
- return '' + text + ''
+ return '' + text + ''
+
+ def caps(self, text):
+ return '' + text + ''
+
+ def small_caps(self, text):
+ return '' + text + ''
+
+ def strike(self, text):
+ return '' + text + ''
+
+ def hide(self, text):
+ return '' + text + ''
+
+ def superscript(self, text):
+ return '%(text)s' % {
+ 'text': text,
+ }
+
+ def subscript(self, text):
+ return '%(text)s' % {
+ 'text': text,
+ }
def tab(self):
# Insert before the text right?? So got the text and just do an insert
@@ -62,25 +158,55 @@ def tab(self):
return '    '
def table(self, text):
- return ''
+ return ''
def table_row(self, text):
return '' + text + '
'
- def table_cell(self, text):
- return '' + text + ' | '
+ def table_cell(self, text, col='', row='', *args):
+ slug = '%(text)s | '
+ return slug % {
+ 'colspan': col,
+ 'rowspan': row,
+ 'text': text,
+ }
def page_break(self):
- return '
'
-
- def center_justify(self, text):
- return "" + text + '
'
+ return '
'
+
+ def indent(self, text, just='', firstLine='', left='',
+ right='', hanging=''):
+ slug = '%(text)s
"
+ return slug % {
+ 'text': text,
+ 'just': just,
+ 'firstLine': firstLine,
+ 'left': left,
+ 'right': right,
+ }
+
+ def break_tag(self, *args):
+ return '
'
- def right_justify(self, text):
- return "" + text + '
'
+ def change_orientation(self, parsed, orient):
+ return '
'
- def indent(self, text, right, left, firstLine):
- return "{text}
".format(
- left=left,
- text=text,
- )
+ def empty_cell(self):
+ return ''
diff --git a/pydocx/parsers/Docx2LaTex.py b/pydocx/parsers/Docx2LaTex.py
new file mode 100644
index 00000000..5a59ed3c
--- /dev/null
+++ b/pydocx/parsers/Docx2LaTex.py
@@ -0,0 +1,283 @@
+import base64
+from pydocx.DocxParser import DocxParser
+
+
+class Docx2LaTex(DocxParser):
+
+ def __init__(self, *args, **kwargs):
+ self.table_info = []
+ self.counted_columns = False
+ self.previous_orient = ''
+ self.col_count = 0
+ self.hit_list = False
+ self.line_break_in_table = False
+ super(Docx2LaTex, self).__init__(*args, **kwargs)
+
+ @property
+ def parsed(self):
+ content = r"%(head)s\begin{document}%(content)s\end{document}" % {
+ 'head': self.head(),
+ 'content': self._parsed}
+ return unicode(content)
+
+ def escape(self, text):
+ chars = ['%', '&', '#', '$', '~', '_', '^', '{', '}']
+ for ch in chars:
+ if ch in text:
+ text = text.replace(ch, '\\'+ch)
+ return text
+
+ def linebreak(self):
+ return '\n\n'
+
+ def paragraph(self, text, pre=None):
+ return text + '\n\n'
+
+ def bold(self, text):
+ return r'\textbf {%s}' % text
+
+ def italics(self, text):
+ return r'\emph {%s}' % text
+
+ def underline(self, text):
+ return r'\underline {%s}' % text
+
+ def list_element(self, text):
+ return r'\item %s' % text + '\n'
+
+ def ordered_list(self, text, list_style):
+ self.hit_list = True
+ return r'\begin{enumerate} %s \end{enumerate}' % text
+
+ def unordered_list(self, text):
+ self.hit_list = True
+ return r'\begin{itemize} %s \end{itemize}' % text
+
+ def head(self):
+ return r'''\documentclass{article}\usepackage{hyperref}
+ \usepackage{graphicx}\usepackage{changes}
+ \usepackage{changepage}
+ \usepackage{hanging}\usepackage{multirow}
+ \usepackage{pbox}\usepackage{pdflscape}
+ \usepackage{ulem}\usepackage{comment}'''
+
+ def heading(self, text, heading_value):
+ if heading_value == 'h1':
+ return r'\section{%s}' % text + '\n\n'
+ elif heading_value == 'h2':
+ return r'\subsection{%s}' % text + '\n\n'
+ elif heading_value == 'h3':
+ return r'\paragraph{%s}' % text + '\n\n'
+ elif heading_value == 'h4':
+ return r'\subparagraph{%s}' % text + '\n\n'
+ else:
+ return text + '\n\n'
+
+ def insertion(self, text, author, date):
+ return r'\added[id='+author+',remark='+date+']{%s}' % text
+
+ def hyperlink(self, text, href):
+ if text == '':
+ return ''
+ return r'\href{%(href)s}{%(text)s}' % {
+ 'href': href,
+ 'text': text,
+ }
+
+ def image_handler(self, image_data, filename):
+ extension = filename.split('.')[-1].lower()
+ b64_encoded_src = 'data:image/%s;base64,%s' % (
+ extension,
+ base64.b64encode(image_data),
+ )
+ b64_encoded_src = self.escape(b64_encoded_src)
+ return b64_encoded_src
+
+ def image(self, image_data, filename, x, y):
+ src = self.image_handler(image_data, filename)
+ if not src:
+ return ''
+ if all([x, y]):
+ if x.find('px') != -1:
+ x = x.replace('px', '')
+ x = float(x)
+ x = x * float(3) / float(4)
+ x = str(x) + 'pt'
+ elif y.find('px') != -1:
+ y = y.replace('px', '')
+ y = float(y)
+ y = y * float(3) / float(4)
+ y = str(y) + 'pt'
+ return r'\includegraphics[height=%spt, width=%s]{%s}' % (
+ y,
+ x,
+ src)
+ else:
+ return r'\includegraphics {%s}' % src
+
+ def tab(self):
+ return r'\qquad '
+
+ def table(self, text):
+ center = False
+ right = False
+ pcm = False
+ setup_cols = ''
+ for i in range(0, self.col_count):
+ match = next((
+ column for column in self.table_info
+ if 'Column' in column and column['Column'] == i), None)
+ if match:
+ if 'justify' in match:
+ if match['justify'] == 'center':
+ center = True
+ elif match['justify'] == 'right':
+ right = True
+ elif match['list']:
+ pcm = True
+ if center is True:
+ setup_cols += 'c'
+ center = False
+ elif right is True:
+ setup_cols += 'r'
+ right = False
+ elif pcm is True:
+ setup_cols += 'p{3cm}'
+ else:
+ setup_cols += 'l'
+ self.table_info = []
+ return '\n' + r'\begin{tabular}{%s}' % setup_cols\
+ + '\n' + r'%s\end{tabular}'\
+ % text + '\n\n'
+
+ def table_row(self, text):
+ self.counted_columns = True
+ return text
+
+ def table_cell(
+ self, text, col='', row='',
+ is_last_row_item=False, is_list_item=False):
+ if is_list_item:
+ self.columns = {}
+ self.columns['Column'] = self.col_count
+ self.columns['list'] = True
+ self.table_info.append(self.columns)
+ if col:
+ col = int(col)
+ if not self.counted_columns and col:
+ self.col_count += col
+ elif not self.counted_columns:
+ self.col_count += 1
+ if row:
+ row = int(row)
+ slug = ''
+ if col:
+ slug += r'\multicolumn{%s}{c}' % col
+ if row:
+ slug += r'\multirow{%s}{*}' % row
+ if self.line_break_in_table:
+ slug += r'\parbox{20cm}'
+ if text == '':
+ slug += '{}'
+ else:
+ slug += '{' + text + '}'
+ if is_last_row_item:
+ slug += r' \\' + '\n'
+ return slug
+ self.line_break_in_table = False
+ return '%s & ' % slug
+
+ def page_break(self):
+ return r'\newpage '
+
+ def indent(self, text, just='', firstLine='',
+ left='', right='', hanging='', is_in_table=False):
+ if not is_in_table:
+ raggedright = False
+ raggedleft = False
+ center = False
+ slug = ''
+ if hanging:
+ hanging = float(hanging)
+ hanging = hanging * float(3)/float(4)
+ return r'\begin{hangparas}{%spt}{1} %s ' \
+ r'\end{hangparas}' % (hanging, text) + '\n'
+ if right and left:
+ left = float(left)
+ right = float(right)
+ left = left * float(3) / float(4)
+ right = right * float(3) / float(4)
+ slug += r'\begin{adjustwidth}{%spt}{%spt}' % (left, right)
+ elif left:
+ left = float(left)
+ left = left * float(3) / float(4)
+ slug += r'\begin{adjustwidth}{}{%spt}' % (left)
+ elif right:
+ right = float(right)
+ right = right * float(3) / float(4)
+ slug += r'\begin{adjustwidth}{%spt}{}' % (right)
+ if firstLine:
+ slug += r'\setlength{\parindent}{'+firstLine+r'pt}\indent '
+ if just:
+ if just == 'left':
+ raggedright = True
+ slug += r'\begin{flushright} '
+ elif just == 'center':
+ center = True
+ slug += r'\begin{center} '
+ elif just == 'right':
+ raggedleft = True
+ slug += r'\begin{flushleft} '
+ slug += text
+ if raggedright:
+ slug += r'\end{flushright}'
+ if center:
+ slug += r'\end{center}'
+ if raggedleft:
+ slug += r'\end{flushleft}'
+ if left or right:
+ slug += r'\end{adjustwidth}'
+ return slug
+ else:
+ self.columns = {}
+ self.columns['Column'] = self.col_count
+ self.columns['justify'] = just
+ if self.columns not in self.table_info:
+ self.table_info.append(self.columns)
+ return text
+
+ def break_tag(self, is_in_table):
+ if is_in_table:
+ self.line_break_in_table = True
+ return r'\\'
+
+ def change_orientation(self, parsed, orient):
+ if orient == 'portrait':
+ return parsed
+ if orient == 'landscape':
+ return r'\begin{landscape}' + '\n' \
+ + parsed + r'\end{landscape}' + '\n'
+
+ def deletion(self, text, author, date):
+ return r'\deleted[id='+author+',remark='+date+']{%s}' % text
+
+ def caps(self, text):
+ return r'\MakeUppercase{%s}' % text
+
+ def small_caps(self, text):
+ return r'\textsc{%s}' % text
+
+ def strike(self, text):
+ return r'\sout{%s}' % text
+
+ def hide(self, text):
+ return r'\begin{comment}%s\end{comment}' % text
+
+ def superscript(self, text):
+ return r'\textsuperscript{%s}' % text
+
+ def subscript(self, text):
+ return r'\textsubscript{%s}' % text
+
+ def empty_cell(self):
+ return ' & '
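+
+
+# Usage sketch (not a new API -- it mirrors how Docx2Html is driven in the
+# tests): Docx2LaTex takes a path to a .docx file and ``parsed`` returns a
+# complete unicode LaTeX document.
+#
+#   latex = Docx2LaTex('/path/to/some.docx').parsed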
diff --git a/pydocx/parsers/Docx2Markdown.py b/pydocx/parsers/Docx2Markdown.py
index 1bb43e16..d023df7a 100644
--- a/pydocx/parsers/Docx2Markdown.py
+++ b/pydocx/parsers/Docx2Markdown.py
@@ -1,5 +1,6 @@
from pydocx.DocxParser import DocxParser
+
class Docx2Markdown(DocxParser):
def escape(self, text):
return text
@@ -17,8 +18,9 @@ def bold(self, text):
return '**' + text + '**'
def italics(self, text):
- # TODO do we need a "pre" variable, so I can check for *italics**italics* and turn it into *italicsitatlics*?
+ # TODO do we need a "pre" variable, so I can check for
+ # *italics**italics* and turn it into *italicsitalics*?
return '*' + text + '*'
def underline(self, text):
- return '***' +text + '***'
\ No newline at end of file
+ return '***' + text + '***'
diff --git a/pydocx/parsers/__init__.py b/pydocx/parsers/__init__.py
index a9524657..f6bb520f 100644
--- a/pydocx/parsers/__init__.py
+++ b/pydocx/parsers/__init__.py
@@ -1,2 +1,4 @@
-from .Docx2Html import *
-from .Docx2Markdown import *
\ No newline at end of file
+from pydocx.parsers.Docx2Html import Docx2Html
+from pydocx.parsers.Docx2Markdown import Docx2Markdown
+from pydocx.parsers.Docx2LaTex import Docx2LaTex
+__all__ = ('Docx2Html', 'Docx2Markdown', 'Docx2LaTex')
diff --git a/pydocx/py_docx/.gitignore b/pydocx/py_docx/.gitignore
new file mode 100644
index 00000000..a67f55a1
--- /dev/null
+++ b/pydocx/py_docx/.gitignore
@@ -0,0 +1,8 @@
+.coverage
+*.pyc
+*.docx
+*.kpf
+build
+dist
+template/word/media
+MANIFEST
diff --git a/pydocx/py_docx/HACKING.markdown b/pydocx/py_docx/HACKING.markdown
new file mode 100644
index 00000000..9009eee2
--- /dev/null
+++ b/pydocx/py_docx/HACKING.markdown
@@ -0,0 +1,104 @@
+Adding Features
+===============
+
+# Recommended reading
+
+- The [LXML tutorial](http://codespeak.net/lxml/tutorial.html) covers the basics of XML etrees, which we create, append and insert to make XML documents. LXML also provides XPath, which we use to specify locations in the document.
+- If you're stuck, check out the [OpenXML specs and videos](http://openxmldeveloper.org). In particular, the [OpenXML ECMA spec][] is well worth a read.
+- Learning about [XML namespaces](http://www.w3schools.com/XML/xml_namespaces.asp)
+- The [Namespaces section of Dive into Python](http://diveintopython3.org/xml.html)
+- Microsoft's [introduction to the Office (2007) Open XML File Formats](http://msdn.microsoft.com/en-us/library/aa338205.aspx)
+
+# How can I contribute?
+
+Fork the project on github, then send the main project a [pull request](http://github.com/guides/pull-requests). The project will then accept your pull (in most cases), which will show your changes as part of the changelog for the main project, along with your name and picture.
+
+# A note about namespaces and LXML
+
+LXML doesn't use namespace prefixes. It just uses the actual namespaces, and wants you to set a namespace on each tag. For example, rather than making an element with the 'w' namespace prefix, you'd make an element whose tag name starts with the full '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' namespace.
+
+To make this easier:
+
+- The most common namespace, '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' (prefix 'w') is automatically added by makeelement()
+- You can specify other namespaces with 'nsprefix', which maps the prefixes Word files use to the actual namespaces, eg:
+
+
+    makeelement('coreProperties',nsprefix='cp')
+
+will generate:
+
+
+
+which is the same as what Word generates:
+
+
+
+The namespace prefixes are different, but that's irrelevant as the namespaces themselves are the same.
+
+There's also a cool side effect - you can ignore setting 'xmlns' attributes that aren't used directly in the current element, since there's no need. Eg, you can make the equivalent of this from a Word file:
+
+
+
+
+With the following code:
+
+ docprops = makeelement('coreProperties',nsprefix='cp')
+
+We only need to specify the 'cp' prefix because that's what this element uses. The other 'xmlns' attributes are used to specify the prefixes for child elements. We don't need to specify them here because each child element will have its namespace specified when we make that child.
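+
+A minimal sketch of the cases above, using only makeelement():
+
+    para = makeelement('p')                             # default 'w' namespace
+    props = makeelement('coreProperties', nsprefix='cp')
+    # attributes on a 'w' element pick up the 'w' namespace too:
+    br = makeelement('br', attributes={'type': 'page'})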
+
+# Coding Style
+
+Basically just look at what's there. But if you need something more specific:
+
+- Functional - every function should take some inputs, return something, and not use any globals.
+- [Google Python Style Guide style](http://code.google.com/p/soc/wiki/PythonStyleGuide)
+
+# Unit Testing
+
+After adding code, open **tests/test_docx.py** and add a test that calls your function and checks its output.
+
+- Use **easy_install** to fetch the **nose** and **coverage** modules
+- Run
+
+nosetests --with-coverage
+
+to run all the doctests. They should all pass.
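+
+For example, a minimal test for makeelement() might look like this (the import
+path assumes docx.py is importable as plain docx):
+
+    from docx import makeelement
+
+    def test_makeelement_sets_text():
+        element = makeelement('t', tagtext='hello')
+        assert element.text == 'hello'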
+
+# Tips
+
+## If Word complains about files:
+
+First, determine whether Word can recover the files:
+- If Word cannot recover the file, you most likely have a problem with your zip file
+- If Word can recover the file, you most likely have a problem with your XML
+
+### Common Zipfile issues
+
+- Ensure the same file isn't included twice in your zip archive. Zip supports this, Word doesn't.
+- Ensure that all media files have an entry for their file type in [Content_Types].xml
+- Ensure that files in the zip file have leading '/'s removed.
+
+### Common XML issues
+
+- Ensure the _rels, docProps, word, etc directories are in the top level of your zip file.
+- Check your namespaces - on both the tags, and the attributes
+- Check capitalization of tag names
+- Ensure you're not missing any attributes
+- If images or other embedded content is shown with a large red X, your relationships file is missing data.
+
+#### One common debugging technique we've used before
+
+- Re-saving the document in Word will produce a fixed version of the file
+- Unzip it and grab the serialized XML out of the fixed file
+- Use etree.fromstring() to turn it into an element, and include that in your code (see the sketch after this list).
+- Check that a correct file is generated
+- Remove an element from your string-created etree (including both opening and closing tags)
+- Use element.append(makeelement()) to add that element to your tree
+- Open the doc in Word and see if it still works
+- Repeat the last three steps until you discover which element is causing the problem
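+
+A rough sketch of those last few steps, where FIXED_DOCUMENT_XML stands in for the XML you grabbed from the repaired file (same import caveat as in the testing example above):
+
+    from lxml import etree
+    from docx import makeelement, nsprefixes
+
+    known_good = etree.fromstring(FIXED_DOCUMENT_XML)
+    body = known_good.find('{%(w)s}body' % nsprefixes)
+    # swap a hand-built element in for its string-built counterpart:
+    body.append(makeelement('p'))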
+
+[OpenXML ECMA spec]: http://www.ecma-international.org/publications/files/ECMA-ST/Office%20Open%20XML%201st%20edition%20Part%204%20(DOCX).zip
\ No newline at end of file
diff --git a/pydocx/py_docx/LICENSE b/pydocx/py_docx/LICENSE
new file mode 100644
index 00000000..c621d034
--- /dev/null
+++ b/pydocx/py_docx/LICENSE
@@ -0,0 +1,22 @@
+Copyright (c) 2009-2010 Mike MacCana
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/pydocx/py_docx/MANIFEST.in b/pydocx/py_docx/MANIFEST.in
new file mode 100644
index 00000000..da4ec342
--- /dev/null
+++ b/pydocx/py_docx/MANIFEST.in
@@ -0,0 +1,5 @@
+include template/*
+include template/_rels/*
+include template/docProps/*
+include template/word/*
+include template/word/theme/*
diff --git a/pydocx/py_docx/Makefile b/pydocx/py_docx/Makefile
new file mode 100644
index 00000000..52d1c96f
--- /dev/null
+++ b/pydocx/py_docx/Makefile
@@ -0,0 +1,28 @@
+PYTHON = $(shell test -x bin/python && echo bin/python || echo `which python`)
+SETUP = $(PYTHON) ./setup.py
+
+.PHONY: clean help coverage register sdist upload
+
+help:
+ @echo "Please use \`make ' where is one or more of"
+ @echo " clean delete intermediate work product and start fresh"
+ @echo " coverage run nosetests with coverage"
+ @echo " register update metadata (README.rst) on PyPI"
+ @echo " sdist generate a source distribution into dist/"
+ @echo " upload upload distribution tarball to PyPI"
+
+clean:
+ find . -type f -name \*.pyc -exec rm {} \;
+ rm -rf dist .coverage .DS_Store MANIFEST
+
+coverage:
+ nosetests --with-coverage --cover-package=docx --cover-erase
+
+register:
+ $(SETUP) register
+
+sdist:
+ $(SETUP) sdist
+
+upload:
+ $(SETUP) sdist upload
diff --git a/pydocx/py_docx/README.markdown b/pydocx/py_docx/README.markdown
new file mode 100644
index 00000000..cbccf12a
--- /dev/null
+++ b/pydocx/py_docx/README.markdown
@@ -0,0 +1,81 @@
+Python docx
+===========
+
+## Introduction
+
+The docx module creates, reads and writes Microsoft Office Word 2007 docx files.
+
+These are referred to as 'WordML', 'Office Open XML' and 'Open XML' by Microsoft.
+
+These documents can be opened in Microsoft Office 2007 / 2010, Microsoft Mac Office 2008, Google Docs, OpenOffice.org 3, and Apple iWork 08.
+
+They also [validate as well-formed XML](http://validator.w3.org/check).
+
+The module was created when I was looking for Python support for MS Word .doc files, but could only find various hacks involving COM automation, calling .net or Java, or automating OpenOffice or MS Office.
+
+The docx module has the following features:
+
+### Making documents
+
+Features for making documents include:
+
+- Paragraphs
+- Bullets
+- Numbered lists
+- Document properties (author, company, etc)
+- Multiple levels of headings
+- Tables
+- Section and page breaks
+- Images
+
+
+
+### Editing documents
+
+Thanks to the awesomeness of the lxml module, we can:
+
+- Search and replace
+- Extract the plain text of a document (see the short sketch below)
+- Add and delete items anywhere within the document
+- Change document properties
+- Run xpath queries against particular locations in the document - useful for retrieving data from user-completed templates.
+
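+For instance, pulling the plain text out of a document boils down to walking the tree for text nodes. A sketch of the idea only (example-extracttext.py does the real work):
+
+    from docx import opendocx, nsprefixes
+
+    document = opendocx('Some word file.docx')
+    print '\n'.join(
+        node.text for node in document.iter('{%(w)s}t' % nsprefixes) if node.text
+    )
+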
+# Getting started
+
+## Making and Modifying Documents
+
+- Just [download python docx](http://github.com/mikemaccana/python-docx/tarball/master).
+- Use **pip** or **easy_install** to fetch the **lxml** and **PIL** modules.
+- Then run:
+
+example-makedocument.py
+
+Congratulations, you just made and then modified a Word document!
+
+## Extracting Text from a Document
+
+If you just want to extract the text from a Word file, run:
+
+ example-extracttext.py 'Some word file.docx' 'new file.txt'
+
+### Ideas & To Do List
+
+- Further improvements to image handling
+- Document health checks
+- Egg
+- Markdown conversion support
+
+### We love forks, changes and pull requests!
+
+- Check out the [HACKING](HACKING.markdown) to add your own changes!
+- Fork this project on github
+- Send a pull request via github and we'll add your changes!
+
+### Want to talk? Need help?
+
+Email .
+
+### License
+
+Licensed under the [MIT license](http://www.opensource.org/licenses/mit-license.php)
+Short version: this code is copyrighted to me (Mike MacCana); I give you permission to do what you want with it except remove my name from the credits. See the LICENSE file for specific terms.
diff --git a/pydocx/py_docx/SERVING_SUGGESTIONS.markdown b/pydocx/py_docx/SERVING_SUGGESTIONS.markdown
new file mode 100644
index 00000000..86e51e48
--- /dev/null
+++ b/pydocx/py_docx/SERVING_SUGGESTIONS.markdown
@@ -0,0 +1,12 @@
+Serving Suggestions
+===================
+
+# Mashing docx with other modules
+
+This is a list of interesting things you could do with Python docx when mashed up with other modules.
+
+- [LinkedIn Python API](http://code.google.com/p/python-linkedin/) - Auto-build a Word doc whenever some old recruiting dude asks for one.
+- [Python Natural Language Toolkit](http://www.nltk.org/) - can analyse text and extract meaning.
+- [Lamson](http://lamsonproject.org/) - transparently parse or modify email attachments.
+
+Any other ideas? Doing something cool you want to tell the world about? python.docx@librelist.com
\ No newline at end of file
diff --git a/pydocx/py_docx/__init__.py b/pydocx/py_docx/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pydocx/py_docx/docx.py b/pydocx/py_docx/docx.py
new file mode 100755
index 00000000..2292e1c6
--- /dev/null
+++ b/pydocx/py_docx/docx.py
@@ -0,0 +1,1271 @@
+#!/usr/bin/env python2.6
+# -*- coding: utf-8 -*-
+"""
+Open and modify Microsoft Word 2007 docx files (called 'OpenXML' and
+'Office OpenXML' by Microsoft)
+
+Part of Python's docx module - http://github.com/mikemaccana/python-docx
+See LICENSE for licensing information.
+"""
+
+import logging
+import lxml
+from lxml import etree
+from PIL import Image
+import zipfile
+import shutil
+import re
+import time
+import os
+from os.path import join
+
+log = logging.getLogger(__name__)
+
+# Record template directory's location which is just 'template' for a docx
+# developer or 'site-packages/docx-template' if you have installed docx
+template_dir = join(os.path.dirname(__file__), 'docx-template') # installed
+if not os.path.isdir(template_dir):
+ template_dir = join(os.path.dirname(__file__), 'template') # dev
+
+# All Word prefixes / namespace matches used in document.xml & core.xml.
+# LXML doesn't actually use prefixes (just the real namespace), but these
+# make it easier to copy Word output.
+nsprefixes = {
+ 'mo': 'http://schemas.microsoft.com/'
+ 'office/mac/office/2008/main',
+ 'o': 'urn:schemas-microsoft-com:office:office',
+ 've': 'http://schemas.openxmlformats.org/'
+ 'markup-compatibility/2006',
+ # Text Content
+ 'w': 'http://schemas.openxmlformats.org/'
+ 'wordprocessingml/2006/main',
+ 'w10': 'urn:schemas-microsoft-com:office:word',
+ 'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
+ # Drawing
+ 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
+ 'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
+ 'mv': 'urn:schemas-microsoft-com:mac:vml',
+ 'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
+ 'v': 'urn:schemas-microsoft-com:vml',
+ 'wp': 'http://schemas.openxmlformats.org/'
+ 'drawingml/2006/wordprocessingDrawing',
+ # Properties (core and extended)
+ 'cp': 'http://schemas.openxmlformats.org/'
+ 'package/2006/metadata/core-properties',
+ 'dc': 'http://purl.org/dc/elements/1.1/',
+ 'ep': 'http://schemas.openxmlformats.org/'
+ 'officeDocument/2006/extended-properties',
+ 'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+ # Content Types
+ 'ct': 'http://schemas.openxmlformats.org/'
+ 'package/2006/content-types',
+ # Package Relationships
+ 'r': 'http://schemas.openxmlformats.org/'
+ 'officeDocument/2006/relationships',
+ 'pr': 'http://schemas.openxmlformats.org/'
+ 'package/2006/relationships',
+ # Dublin Core document properties
+ 'dcmitype': 'http://purl.org/dc/dcmitype/',
+ 'dcterms': 'http://purl.org/dc/terms/'}
+
+
+def opendocx(file):
+ '''Open a docx file, return a document XML tree'''
+ mydoc = zipfile.ZipFile(file)
+ xmlcontent = mydoc.read('word/document.xml')
+ document = etree.fromstring(xmlcontent)
+ return document
+
+
+def newdocument():
+ #create a new document
+ #add the body to the document
+ document = makeelement('document')
+ document.append(makeelement('body'))
+ return document
+
+
+def new_numbering():
+ #create a new numbering file. this is needed for lists
+ numbering = makeelement('numbering')
+ return numbering
+
+
+def create_list(abstractNum=0):
+ #the numbering file requires an abstractNum for each list
+ abstractnum = makeelement(
+ 'abstractNum', attributes={'abstractNumId': str(abstractNum)})
+ return abstractnum
+
+
+def create_list_attributes(
+ ilvl='0', start='1', type='bullet',
+ just='left', left='720', hanging='360'):
+ #create the attributes of a list that will
+ #go into the numbering file
+ lvl = makeelement('lvl', attributes={'ilvl': ilvl})
+ lvl.append(makeelement('start', attributes={'val': start}))
+ if type == 'decimal':
+ if int(ilvl) % 3 == 1:
+ type = 'lowerLetter'
+ if int(ilvl) % 3 == 2:
+ type = 'lowerRoman'
+ if int(ilvl) % 3 == 0:
+ type = 'decimal'
+ lvl.append(makeelement('numFmt', attributes={'val': type}))
+ if type == 'bullet':
+ lvl.append(makeelement('lvlText', attributes={'val': u"\u2022"}))
+ else:
+ lvl.append(makeelement(
+ 'lvlText', attributes=
+ {'val': '%'+str(int(ilvl)+1)+'.'}))
+ lvl.append(makeelement('lvlJc', attributes={'val': just}))
+ ppr = makeelement('pPr')
+ ppr.append(makeelement(
+ 'ind', attributes={'left': left, 'hanging': hanging}))
+ lvl.append(ppr)
+ if type == 'bullet':
+ rpr = makeelement('rPr')
+ rpr.append(makeelement('rFonts', attributes={
+ 'ascii': 'Symbol', 'hAnsi': 'Symbol', 'hint': 'default'}))
+ lvl.append(rpr)
+ else:
+ rpr = makeelement('rPr')
+ rpr.append(makeelement('rFonts', attributes={'hint': 'default'}))
+ lvl.append(rpr)
+ return lvl
+
+
+def fill_tentative(ilvl, type_lst, left='720'):
+ #fill tentative is for the list items that
+ #the user has not filled out, but might
+ #later fill out
+ start_from = int(ilvl)
+ if type_lst == 'decimal':
+ #decimal requires different tentatives than bullet points, so need
+ #to separate these out
+ numbers = True
+ else:
+ numbers = False
+ tentatives = []
+ for i in range(start_from, 9):
+ lvl = makeelement('lvl', attributes={'ilvl': str(i), 'tentative': '1'})
+ lvl.append(makeelement('start', attributes={'val': '1'}))
+ if numbers:
+ #lists usually go in a pattern of three
+ #decimal, lower letter, then lower roman
+ if i % 3 == 2:
+ lvl.append(makeelement(
+ 'numFmt', attributes={'val': 'lowerRoman'}))
+ elif i % 3 == 0:
+ lvl.append(makeelement(
+ 'numFmt', attributes={'val': 'decimal'}))
+ elif i % 3 == 1:
+ lvl.append(makeelement(
+ 'numFmt', attributes={'val': 'lowerLetter'}))
+ else:
+ lvl.append(makeelement('numFmt', attributes={'val': type_lst}))
+ if type_lst == 'bullet':
+ #using unicode for now for bullet representation
+ lvl.append(makeelement('lvlText', attributes={'val': u"\u2022"}))
+ else:
+ level = i + 1
+ level = str(level)
+ #there's a lvlText attrib for numbered lists
+ #which is just 1 more than the current ilvl
+ lvl.append(makeelement(
+ 'lvlText', attributes={'val': '%'+level+'.'}))
+ if i % 3 == 2:
+ #it seems that for every second list, the justification level
+ #switches to the right
+ lvl.append(makeelement('lvlJc', attributes={'val': 'right'}))
+ else:
+ lvl.append(makeelement('lvlJc', attributes={'val': 'left'}))
+ ppr = makeelement('pPr')
+ #making appropriate indentation
+ left = int(left)
+ left = 720 * (i + 1)
+ left = str(left)
+ if i % 3 == 2:
+ #hanging is usually 360, but for every second list
+ #the hanging value changes to 180, or so it seems
+ ppr.append(makeelement(
+ 'ind', attributes={'left': left, 'hanging': '180'}))
+ else:
+ ppr.append(makeelement(
+ 'ind', attributes={'left': left, 'hanging': '360'}))
+ lvl.append(ppr)
+ if type_lst == 'bullet':
+ #this can be made more complex and put in some
+ #special types of bullets
+ rpr = makeelement('rPr')
+ rpr.append(makeelement(
+ 'rFonts', attributes={'ascii': 'Symbol',
+ 'hAnsi': 'Symbol', 'hint': 'default'}))
+ lvl.append(rpr)
+ tentatives.append(lvl)
+ return tentatives
+
+
+def create_abstract_IdInfo(numId):
+ #abstractIdInfo for the bottom of the numbering file
+ #each val refers to a list in the numbering xml
+ #file
+ abstractId = str(int(numId)-1)
+ num = makeelement('num', attributes={'numId': numId})
+ abstractNumId = makeelement(
+ 'abstractNumId', attributes={'val': abstractId})
+ num.append(abstractNumId)
+ return num
+
+
+def makeelement(
+ tagname, tagtext=None, nsprefix='w',
+ attributes=None, attrnsprefix=None):
+ '''Create an element & return it'''
+ # Deal with list of nsprefix by making namespacemap
+ namespacemap = None
+ if isinstance(nsprefix, list):
+ namespacemap = {}
+ for prefix in nsprefix:
+ namespacemap[prefix] = nsprefixes[prefix]
+ # FIXME: rest of code below expects a single prefix
+ nsprefix = nsprefix[0]
+ if nsprefix:
+ namespace = '{'+nsprefixes[nsprefix]+'}'
+ else:
+ # For when namespace = None
+ namespace = ''
+ newelement = etree.Element(namespace+tagname, nsmap=namespacemap)
+ # Add attributes with namespaces
+ if attributes:
+ # If they haven't bothered setting
+ # attribute namespace, use an empty string
+ # (equivalent of no namespace)
+ if not attrnsprefix:
+ # Quick hack: it seems every element
+ # that has a 'w' nsprefix for its tag uses the
+ # same prefix for its attributes
+ if nsprefix == 'w':
+ attributenamespace = namespace
+ else:
+ attributenamespace = ''
+ else:
+ attributenamespace = '{'+nsprefixes[attrnsprefix]+'}'
+
+ for tagattribute in attributes:
+ newelement.set(
+ attributenamespace+tagattribute, attributes[tagattribute])
+ if tagtext:
+ newelement.text = tagtext
+ return newelement
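+
+# A quick illustration of makeelement() (comments only, nothing is executed
+# here):
+#
+#   run = makeelement('r')                               # default 'w' namespace
+#   run.append(makeelement('t', tagtext='Hello world'))
+#   br = makeelement('br', attributes={'type': 'page'})  # attribute shares 'w'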
+
+
+def pagebreak(type='page', orient='portrait'):
+ '''Insert a break, default 'page'.
+ See http://openxmldeveloper.org/forums/thread/4075.aspx
+ Return our page break element.'''
+ # Need to enumerate different types of page breaks.
+ validtypes = ['page', 'section']
+ if type not in validtypes:
+ tmpl = 'Page break style "%s" not implemented. Valid styles: %s.'
+ raise ValueError(tmpl % (type, validtypes))
+ pagebreak = makeelement('p')
+ if type == 'page':
+ run = makeelement('r')
+ br = makeelement('br', attributes={'type': type})
+ run.append(br)
+ pagebreak.append(run)
+ elif type == 'section':
+ pPr = makeelement('pPr')
+ sectPr = makeelement('sectPr')
+ if orient == 'portrait':
+ pgSz = makeelement('pgSz', attributes={'w': '12240', 'h': '15840'})
+ elif orient == 'landscape':
+ pgSz = makeelement('pgSz', attributes={'h': '12240', 'w': '15840',
+ 'orient': 'landscape'})
+ sectPr.append(pgSz)
+ pPr.append(sectPr)
+ pagebreak.append(pPr)
+ return pagebreak
+
+
+def paragraph(paratext, style='BodyText',
+ breakbefore=False, jc='left',
+ is_list=False, ilvl='0', numId='1'):
+ #added is_list, because justification is included in the numbering
+ #file for lists, so we need not include it. also ilvl and numId
+ #are included so that we can nest lists
+
+ '''Make a new paragraph element, containing a run, and some text.
+ Return the paragraph element.
+
+ @param string jc: Paragraph alignment, possible values:
+ left, center, right, both (justified), ...
+ see http://www.schemacentral.com/sc/ooxml/t-w_ST_Jc.html
+ for a full list
+
+ If paratext is a list, spawn multiple run/text elements.
+ Support text styles (paratext must then be a list of lists in the form
+ / '
+)
+
+BASE_HTML = '''
+
+
+ %s
+
+ %%s
+
+''' % STYLE
+
+BASE_LATEX = r'''\documentclass{article}\usepackage{hyperref}
+\usepackage{graphicx}\usepackage{changes}
+\usepackage{changepage}
+\usepackage{hanging}\usepackage{multirow}
+\usepackage{pbox}\usepackage{pdflscape}
+\usepackage{ulem}\usepackage{comment}
+\begin{document}''' + "%s" + r'''\end{document}
+'''
+
+
+def assert_html_equal(actual_html, expected_html):
+ assert collapse_html(
+ actual_html,
+ ) == collapse_html(
+ expected_html
+ ), actual_html
+
+
+def assert_latex_equal(actual_latex, expected_latex):
+ assert collapse_latex(
+ actual_latex,
+ ) == collapse_latex(
+ expected_latex
+ ), actual_latex
+
+
+def collapse_latex(latex):
+
+ def smart_space(match):
+ # Put a space in between lines, unless exactly one side of the line
+ # break butts up against a tag.
+ before = match.group(1)
+ after = match.group(2)
+ space = ' '
+ return before + space + after
+ # Replace newlines and their surrounding
+ # whitespace with a single space (or
+ # empty string)
+ latex = re.sub(
+ r'(>?)\s*\n\s*(<?)',
+ smart_space,
+ latex,
+ )
+ return latex.strip()
+
+
+def collapse_html(html):
+ """
+ Remove insignificant whitespace from the html.
+
+ >>> print collapse_html('''\\
+ ... <h1>
+ ... Heading
+ ... </h1>
+ ... ''')
+ <h1>Heading</h1>
+ >>> print collapse_html('''\\
+ ... <p>
+ ... Paragraph with
+ ... multiple lines.
+ ... </p>
+ ... ''')
+ <p>Paragraph with multiple lines.</p>
+ """
+ def smart_space(match):
+ # Put a space in between lines, unless exactly one side of the line
+ # break butts up against a tag.
+ before = match.group(1)
+ after = match.group(2)
+ space = ' '
+ if before == '>' or after == '<':
+ space = ''
+ return before + space + after
+ # Replace newlines and their surrounding whitespace with a single space (or
+ # empty string)
+ html = re.sub(
+ r'(>?)\s*\n\s*(<?)',
+ smart_space,
+ html,
+ )
+ return html.strip()
+
+
+class XMLDocx2Latex(Docx2LaTex):
+
+ """
+ Create the object without passing in a path to the document, set them
+ manually.
+ """
+ def __init__(self, *args, **kwargs):
+ # Pass in nothing for the path
+ super(XMLDocx2Latex, self).__init__(path=None, *args, **kwargs)
+
+ def _build_data(
+ self,
+ path,
+ document_xml=None,
+ rels_dict=None,
+ numbering_dict=None,
+ styles_dict=None,
+ *args, **kwargs):
+ self._test_rels_dict = rels_dict
+ if rels_dict:
+ for value in rels_dict.values():
+ self._image_data['word/%s' % value] = 'word/%s' % value
+ self.numbering_root = None
+ if numbering_dict is not None:
+ self.numbering_root = parse_xml_from_string(
+ DXB.numbering(numbering_dict),
+ )
+ self.numbering_dict = numbering_dict
+ # Intentionally not calling super
+ if document_xml is not None:
+ self.root = parse_xml_from_string(document_xml)
+ self.zip_path = ''
+
+ # This is the standard page width for a Word document, and also the page
+ # width that we are looking for in the test.
+ self.page_width = 612
+
+ self.styles_dict = styles_dict
+
+ def _parse_rels_root(self, *args, **kwargs):
+ if self._test_rels_dict is None:
+ return {}
+ return self._test_rels_dict
+
+ def get_list_style(self, num_id, ilvl):
+ try:
+ return self.numbering_dict[num_id][ilvl]
+ except KeyError:
+ return 'decimal'
+
+ def _parse_styles(self):
+ if self.styles_dict is None:
+ return {}
+ return self.styles_dict
+
+
+DEFAULT_NUMBERING_DICT = {
+ '1': {
+ '0': 'decimal',
+ '1': 'decimal',
+ },
+ '2': {
+ '0': 'lowerLetter',
+ '1': 'lowerLetter',
+ },
+}
+
+
+class XMLDocx2Html(Docx2Html):
+ """
+ Create the object without passing in a path to the document, set them
+ manually.
+ """
+ def __init__(self, *args, **kwargs):
+ # Pass in nothing for the path
+ super(XMLDocx2Html, self).__init__(path=None, *args, **kwargs)
+
+ def _build_data(
+ self,
+ path,
+ document_xml=None,
+ rels_dict=None,
+ numbering_dict=None,
+ styles_dict=None,
+ *args, **kwargs):
+ self._test_rels_dict = rels_dict
+ if rels_dict:
+ for value in rels_dict.values():
+ self._image_data['word/%s' % value] = 'word/%s' % value
+ self.numbering_root = None
+ if numbering_dict is not None:
+ self.numbering_root = parse_xml_from_string(
+ DXB.numbering(numbering_dict),
+ )
+ self.numbering_dict = numbering_dict
+ # Intentionally not calling super
+ if document_xml is not None:
+ self.root = parse_xml_from_string(document_xml)
+ self.zip_path = ''
+
+ # This is the standard page width for a Word document, and also the page
+ # width that we are looking for in the test.
+ self.page_width = 612
+
+ self.styles_dict = styles_dict
+
+ def _parse_rels_root(self, *args, **kwargs):
+ if self._test_rels_dict is None:
+ return {}
+ return self._test_rels_dict
+
+ def get_list_style(self, num_id, ilvl):
+ try:
+ return self.numbering_dict[num_id][ilvl]
+ except KeyError:
+ return 'decimal'
+
+ def _parse_styles(self):
+ if self.styles_dict is None:
+ return {}
+ return self.styles_dict
+
+
+DEFAULT_NUMBERING_DICT = {
+ '1': {
+ '0': 'decimal',
+ '1': 'decimal',
+ },
+ '2': {
+ '0': 'lowerLetter',
+ '1': 'lowerLetter',
+ },
+}
+
+
+class _TranslationTestCase(TestCase):
+ expected_output = None
+ latex_expected_output = None
+ relationship_dict = None
+ styles_dict = None
+ numbering_dict = DEFAULT_NUMBERING_DICT
+ run_expected_output = True
+ parser = XMLDocx2Html
+ latex_parser = XMLDocx2Latex
+ latex_expected_output = None
+ use_base_html = True
+ convert_root_level_upper_roman = False
+
+ def get_xml(self):
+ raise NotImplementedError()
+
+ @contextmanager
+ def toggle_run_expected_output(self):
+ self.run_expected_output = not self.run_expected_output
+ yield
+ self.run_expected_output = not self.run_expected_output
+
+ def test_expected_output(self):
+ if self.expected_output is None:
+ raise NotImplementedError('expected_output is not defined')
+ if not self.run_expected_output:
+ return
+
+ # Create the xml
+ tree = self.get_xml()
+
+ # Verify the final output.
+ parser = self.parser
+ latex_parser = self.latex_parser
+
+ def image_handler(self, src, *args, **kwargs):
+ return src
+ parser.image_handler = image_handler
+ html = parser(
+ convert_root_level_upper_roman=self.convert_root_level_upper_roman,
+ document_xml=tree,
+ rels_dict=self.relationship_dict,
+ numbering_dict=self.numbering_dict,
+ styles_dict=self.styles_dict,
+ ).parsed
+ if self.use_base_html:
+ assert_html_equal(html, BASE_HTML % self.expected_output)
+ else:
+ assert_html_equal(html, self.expected_output)
+
+ latex_parser.image_handler = image_handler
+ latex = latex_parser(
+ convert_root_level_upper_roman=self.convert_root_level_upper_roman,
+ document_xml=tree,
+ rels_dict=self.relationship_dict,
+ numbering_dict=self.numbering_dict,
+ styles_dict=self.styles_dict,
+ ).parsed
+ assert_latex_equal(latex, BASE_LATEX % self.latex_expected_output)
diff --git a/pydocx/tests/document_builder.py b/pydocx/tests/document_builder.py
new file mode 100644
index 00000000..c28e1e02
--- /dev/null
+++ b/pydocx/tests/document_builder.py
@@ -0,0 +1,271 @@
+from jinja2 import Environment, PackageLoader
+from pydocx.DocxParser import EMUS_PER_PIXEL
+
+templates = {
+ 'delete': 'text_delete.xml',
+ 'drawing': 'drawing.xml',
+ 'hyperlink': 'hyperlink.xml',
+ 'insert': 'insert.xml',
+ 'linebreak': 'linebreak.xml',
+ 'main': 'base.xml',
+ 'numbering': 'numbering.xml',
+ 'p': 'p.xml',
+ 'pict': 'pict.xml',
+ 'r': 'r.xml',
+ 'rpr': 'rpr.xml',
+ 'sdt': 'sdt.xml',
+ 'sectPr': 'sectPr.xml',
+ 'smartTag': 'smart_tag.xml',
+ 'style': 'style.xml',
+ 'styles': 'styles.xml',
+ 't': 't.xml',
+ 'table': 'table.xml',
+ 'tc': 'tc.xml',
+ 'tr': 'tr.xml',
+}
+
+env = Environment(
+ loader=PackageLoader(
+ 'pydocx.tests',
+ 'templates',
+ ),
+)
+
+
+class DocxBuilder(object):
+
+ @classmethod
+ def xml(self, body):
+ template = env.get_template(templates['main'])
+ return template.render(body=body)
+
+ @classmethod
+ def p_tag(
+ self,
+ text,
+ style='style0',
+ jc=None,
+ ):
+ if isinstance(text, str):
+ # Create a single r tag based on the text
+ run_tag = DocxBuilder.r_tag(
+ [DocxBuilder.t_tag(text)],
+ )
+ run_tags = [run_tag]
+ elif isinstance(text, list):
+ run_tags = text
+ else:
+ run_tags = [self.r_tag([])]
+ template = env.get_template(templates['p'])
+
+ kwargs = {
+ 'run_tags': run_tags,
+ 'style': style,
+ 'jc': jc,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def linebreak(self):
+ template = env.get_template(templates['linebreak'])
+ kwargs = {}
+ return template.render(**kwargs)
+
+ @classmethod
+ def t_tag(self, text):
+ template = env.get_template(templates['t'])
+ kwargs = {
+ 'text': text,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def r_tag(
+ self,
+ elements,
+ rpr=None,
+ ):
+ template = env.get_template(templates['r'])
+ if rpr is None:
+ rpr = DocxBuilder.rpr_tag()
+ kwargs = {
+ 'elements': elements,
+ 'rpr': rpr,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def rpr_tag(self, inline_styles=None, *args, **kwargs):
+ if inline_styles is None:
+ inline_styles = {}
+ valid_styles = (
+ 'b',
+ 'i',
+ 'u',
+ 'caps',
+ 'smallCaps',
+ 'strike',
+ 'dstrike',
+ 'vanish',
+ 'webHidden',
+ 'vertAlign',
+ )
+ for key in inline_styles:
+ if key not in valid_styles:
+ raise AssertionError('%s is not a valid style' % key)
+ template = env.get_template(templates['rpr'])
+ kwargs = {
+ 'tags': inline_styles,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def hyperlink_tag(self, r_id, run_tags):
+ template = env.get_template(templates['hyperlink'])
+ kwargs = {
+ 'r_id': r_id,
+ 'run_tags': run_tags,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def insert_tag(self, run_tags):
+ template = env.get_template(templates['insert'])
+ kwargs = {
+ 'run_tags': run_tags,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def delete_tag(self, deleted_texts):
+ template = env.get_template(templates['delete'])
+ kwargs = {
+ 'deleted_texts': deleted_texts,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def smart_tag(self, run_tags):
+ template = env.get_template(templates['smartTag'])
+ kwargs = {
+ 'run_tags': run_tags,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def sdt_tag(self, p_tag):
+ template = env.get_template(templates['sdt'])
+ kwargs = {
+ 'p_tag': p_tag,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def li(self, text, ilvl, numId, bold=False):
+ if isinstance(text, str):
+ # Create a single r tag based on the text and the bold flag
+ run_tag = DocxBuilder.r_tag([DocxBuilder.t_tag(text)], bold)
+ run_tags = [run_tag]
+ elif isinstance(text, list):
+ run_tags = []
+ for run_text, run_bold in text:
+ run_tags.append(
+ DocxBuilder.r_tag(
+ [DocxBuilder.t_tag(run_text)],
+ run_bold,
+ ),
+ )
+ else:
+ raise AssertionError('text must be a string or a list')
+ template = env.get_template(templates['p'])
+
+ kwargs = {
+ 'run_tags': run_tags,
+ 'is_list': True,
+ 'ilvl': ilvl,
+ 'numId': numId,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def table_cell(self, paragraph, merge=False, merge_continue=False):
+ kwargs = {
+ 'paragraph': paragraph,
+ 'merge': merge,
+ 'merge_continue': merge_continue
+ }
+ template = env.get_template(templates['tc'])
+ return template.render(**kwargs)
+
+ @classmethod
+ def table_row(self, tcs):
+ template = env.get_template(templates['tr'])
+ return template.render(table_cells=tcs)
+
+ @classmethod
+ def table(self, trs):
+ template = env.get_template(templates['table'])
+ return template.render(table_rows=trs)
+
+ @classmethod
+ def drawing(self, r_id, height=None, width=None):
+ template = env.get_template(templates['drawing'])
+ if height is not None:
+ height = height * EMUS_PER_PIXEL
+ if width is not None:
+ width = width * EMUS_PER_PIXEL
+ kwargs = {
+ 'r_id': r_id,
+ 'height': height,
+ 'width': width,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def pict(self, r_id=None, height=None, width=None):
+ template = env.get_template(templates['pict'])
+ kwargs = {
+ 'r_id': r_id,
+ 'height': height,
+ 'width': width,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def sectPr_tag(self, p_tag):
+ template = env.get_template(templates['sectPr'])
+
+ kwargs = {
+ 'p_tag': p_tag,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def styles_xml(self, style_tags):
+ template = env.get_template(templates['styles'])
+
+ kwargs = {
+ 'style_tags': style_tags,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def style(self, style_id, value):
+ template = env.get_template(templates['style'])
+
+ kwargs = {
+ 'style_id': style_id,
+ 'value': value,
+ }
+
+ return template.render(**kwargs)
+
+ @classmethod
+ def numbering(self, numbering_dict):
+ template = env.get_template(templates['numbering'])
+
+ kwargs = {
+ 'numbering_dict': numbering_dict,
+ }
+
+ return template.render(**kwargs)
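+
+
+# A small usage sketch (illustrative only): the tests compose these builders to
+# produce raw document.xml strings, e.g.
+#
+#   body = DocxBuilder.p_tag('Hello world', jc='center')
+#   document_xml = DocxBuilder.xml(body)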
diff --git a/pydocx/tests/templates/base.xml b/pydocx/tests/templates/base.xml
new file mode 100644
index 00000000..20f1ff75
--- /dev/null
+++ b/pydocx/tests/templates/base.xml
@@ -0,0 +1,4 @@
+
+
+ {{ body }}
+
diff --git a/pydocx/tests/templates/drawing.xml b/pydocx/tests/templates/drawing.xml
new file mode 100644
index 00000000..dfd470b4
--- /dev/null
+++ b/pydocx/tests/templates/drawing.xml
@@ -0,0 +1,65 @@
+
+
+
+
+
+
+
+
+
+
+ 2397125
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pydocx/tests/templates/hyperlink.xml b/pydocx/tests/templates/hyperlink.xml
new file mode 100644
index 00000000..83645948
--- /dev/null
+++ b/pydocx/tests/templates/hyperlink.xml
@@ -0,0 +1,5 @@
+
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/insert.xml b/pydocx/tests/templates/insert.xml
new file mode 100644
index 00000000..afeb2691
--- /dev/null
+++ b/pydocx/tests/templates/insert.xml
@@ -0,0 +1,5 @@
+
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/linebreak.xml b/pydocx/tests/templates/linebreak.xml
new file mode 100644
index 00000000..ab92e811
--- /dev/null
+++ b/pydocx/tests/templates/linebreak.xml
@@ -0,0 +1 @@
+
diff --git a/pydocx/tests/templates/numbering.xml b/pydocx/tests/templates/numbering.xml
new file mode 100644
index 00000000..4eaac3cc
--- /dev/null
+++ b/pydocx/tests/templates/numbering.xml
@@ -0,0 +1,23 @@
+
+
+ {% for num_id, ilvl_data in numbering_dict.items() %}
+
+ {% for ilvl, format in ilvl_data.items() %}
+
+
+
+
+
+
+
+
+
+ {% endfor %}
+
+ {% endfor %}
+ {% for num_id in numbering_dict %}
+
+
+
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/p.xml b/pydocx/tests/templates/p.xml
new file mode 100644
index 00000000..7a78a060
--- /dev/null
+++ b/pydocx/tests/templates/p.xml
@@ -0,0 +1,19 @@
+
+
+
+ {% if is_list %}
+
+ {% if ilvl != None %}
+
+ {% endif %}
+ {% if numId != None %}
+
+ {% endif %}
+
+ {% endif %}
+ {% if jc %}{% endif %}
+
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/pict.xml b/pydocx/tests/templates/pict.xml
new file mode 100644
index 00000000..26f772a3
--- /dev/null
+++ b/pydocx/tests/templates/pict.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+ {% if r_id %}{% endif %}
+
+
+
+
diff --git a/pydocx/tests/templates/r.xml b/pydocx/tests/templates/r.xml
new file mode 100644
index 00000000..2f28a66b
--- /dev/null
+++ b/pydocx/tests/templates/r.xml
@@ -0,0 +1,6 @@
+
+ {{ rpr }}
+ {% for element in elements %}
+ {{ element }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/rpr.xml b/pydocx/tests/templates/rpr.xml
new file mode 100644
index 00000000..f49eb08b
--- /dev/null
+++ b/pydocx/tests/templates/rpr.xml
@@ -0,0 +1,5 @@
+
+ {% for tag, value in tags.items() %}
+
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/sdt.xml b/pydocx/tests/templates/sdt.xml
new file mode 100644
index 00000000..fe9a7e77
--- /dev/null
+++ b/pydocx/tests/templates/sdt.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ p_tag }}
+
+
diff --git a/pydocx/tests/templates/sectPr.xml b/pydocx/tests/templates/sectPr.xml
new file mode 100644
index 00000000..16a12050
--- /dev/null
+++ b/pydocx/tests/templates/sectPr.xml
@@ -0,0 +1,3 @@
+
+ {{ p_tag }}
+
diff --git a/pydocx/tests/templates/smart_tag.xml b/pydocx/tests/templates/smart_tag.xml
new file mode 100644
index 00000000..e45ee5b9
--- /dev/null
+++ b/pydocx/tests/templates/smart_tag.xml
@@ -0,0 +1,5 @@
+
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/style.xml b/pydocx/tests/templates/style.xml
new file mode 100644
index 00000000..5fa9f00f
--- /dev/null
+++ b/pydocx/tests/templates/style.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pydocx/tests/templates/styles.xml b/pydocx/tests/templates/styles.xml
new file mode 100644
index 00000000..a30e752e
--- /dev/null
+++ b/pydocx/tests/templates/styles.xml
@@ -0,0 +1,6 @@
+
+
+ {% for style in style_tags %}
+ {{ style }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/t.xml b/pydocx/tests/templates/t.xml
new file mode 100644
index 00000000..92412f72
--- /dev/null
+++ b/pydocx/tests/templates/t.xml
@@ -0,0 +1 @@
+{{ text }}
diff --git a/pydocx/tests/templates/table.xml b/pydocx/tests/templates/table.xml
new file mode 100644
index 00000000..e47783b6
--- /dev/null
+++ b/pydocx/tests/templates/table.xml
@@ -0,0 +1,18 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% for table_row in table_rows %}
+ {{ table_row }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/tc.xml b/pydocx/tests/templates/tc.xml
new file mode 100644
index 00000000..eff9ce0d
--- /dev/null
+++ b/pydocx/tests/templates/tc.xml
@@ -0,0 +1,28 @@
+
+
+
+ {% if merge_continue %}
+
+
+ {% endif %}
+ {% if merge %}
+
+
+ {% endif %}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% if paragraph %}
+ {{ paragraph }}
+ {% endif %}
+
diff --git a/pydocx/tests/templates/text_delete.xml b/pydocx/tests/templates/text_delete.xml
new file mode 100644
index 00000000..783b3ad3
--- /dev/null
+++ b/pydocx/tests/templates/text_delete.xml
@@ -0,0 +1,10 @@
+
+ {% for deleted_text in deleted_texts %}
+
+
+
+
+ {{ deleted_text }}
+
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/tr.xml b/pydocx/tests/templates/tr.xml
new file mode 100644
index 00000000..6e2f6925
--- /dev/null
+++ b/pydocx/tests/templates/tr.xml
@@ -0,0 +1,8 @@
+
+
+
+
+ {% for table_cell in table_cells %}
+ {{ table_cell }}
+ {% endfor %}
+
diff --git a/pydocx/tests/test_docx.py b/pydocx/tests/test_docx.py
new file mode 100644
index 00000000..d7b49b9c
--- /dev/null
+++ b/pydocx/tests/test_docx.py
@@ -0,0 +1,773 @@
+import base64
+
+from os import path
+
+from nose.plugins.skip import SkipTest
+
+from pydocx.tests import assert_html_equal, BASE_HTML
+from pydocx.parsers.Docx2Html import Docx2Html
+from pydocx.DocxParser import ZipFile
+
+
+def convert(path, *args, **kwargs):
+ return Docx2Html(path, *args, **kwargs).parsed
+
+
+def test_extract_html():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'simple.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ Simple text
+
+
+ - one
+ - two
+ - three
+
+
+
+ Cell1 |
+ Cell2 |
+
+
+ Cell3 |
+ Cell4 |
+
+
+ ''')
+
+
+def test_nested_list():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'nested_lists.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ - one
+ - two
+ - three
+
+ - AAA
+ - BBB
+ - CCC
+
+ - alpha
+
+
+
+
+ - four
+
+
+ - xxx
+
+ - yyy
+
+
+
+
+ ''')
+
+
+def test_simple_list():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'simple_lists.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ - One
+
+
+ ''')
+
+
+def test_inline_tags():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'inline_tags.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % (
+ 'This sentence has some bold, '
+ 'some italics and some '
+ 'underline, '
+ 'as well as a hyperlink.
'
+ ))
+
+
+def test_all_configured_styles():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'all_configured_styles.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ aaa
+ bbb
+ ccc
+ ddd
+ eee
+ fff
+ ggg
+ hhh
+ iii
+ ''')
+
+
+def test_super_and_subscript():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'super_and_subscript.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAABBB
+ CCCDDD
+ ''')
+
+
+def test_unicode():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'greek_alphabet.docx',
+ )
+ actual_html = convert(file_path)
+ assert actual_html is not None
+ assert u'\u0391\u03b1' in actual_html
+
+
+def test_special_chars():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'special_chars.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ & < > link
''') # noqa
+
+
+def test_table_col_row_span():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'table_col_row_span.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+ AAA |
+
+
+ BBB |
+ CCC |
+
+
+ DDD |
+
+
+
+ EEE
+ |
+ FFF |
+
+
+
+ GGG
+ |
+
+
+
+
+ 1 |
+ 2 |
+ 3 |
+ 4 |
+
+
+ 5 |
+ 6 |
+ 7 |
+
+
+ 8 |
+ 9 |
+
+
+ 10 |
+ 11 |
+ 12 |
+ 13 |
+
+
+ ''')
+
+
+def test_nested_table_rowspan():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'nested_table_rowspan.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+ AAA |
+
+
+ BBB |
+
+
+
+ CCC |
+ DDD |
+
+
+ EEE |
+
+
+ |
+
+
+ ''')
+
+
+def test_nested_tables():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'nested_tables.docx',
+ )
+ actual_html = convert(file_path)
+    # TODO: Find out why the br tag is there.
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+ AAA |
+ BBB |
+
+
+ CCC |
+
+
+
+ DDD |
+ EEE |
+
+
+ FFF |
+ GGG |
+
+
+ |
+
+
+ ''')
+
+
+def test_list_in_table():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'list_in_table.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+
+
+ - AAA
+ - BBB
+ - CCC
+
+ |
+
+
+ ''')
+
+
+def test_tables_in_lists():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'tables_in_lists.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ - AAA
+ - BBB
+
+
+ CCC |
+ DDD |
+
+
+ EEE |
+ FFF |
+
+
+
+ - GGG
+
+ ''')
+
+
+def test_track_changes_on():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'track_changes_on.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ This was some content.
+ ''')
+
+
+def test_headers():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'headers.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ This is an H1
+ This is an H2
+ This is an H3
+ This is an H4
+ This is an H5
+ This is an H6
+ This is an H7
+ This is an H8
+ This is an H9
+ This is an H10
+ ''')
+
+
+def test_split_headers():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'split_header.docx',
+ )
+
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
BBB
CCC
+ ''')
+
+
+def get_image_data(docx_file_path, image_name):
+ """
+ Return base 64 encoded data for the image_name that is stored in the
+ docx_file_path.
+ """
+ with ZipFile(docx_file_path) as f:
+ images = [
+ e for e in f.infolist()
+ if e.filename == 'word/media/%s' % image_name
+ ]
+ if not images:
+ raise AssertionError('%s not in %s' % (image_name, docx_file_path))
+ data = f.read(images[0].filename)
+ return base64.b64encode(data)
+
+
+def test_has_image():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'has_image.docx',
+ )
+
+ actual_html = convert(file_path)
+ image_data = get_image_data(file_path, 'image1.gif')
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ AAA
+
+
+ ''' % image_data)
+
+
+def test_local_dpi():
+    # The image in this file does not have a set height or width; show that
+    # the HTML is still generated without them.
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'localDpi.docx',
+ )
+ actual_html = convert(file_path)
+ image_data = get_image_data(file_path, 'image1.jpeg')
+ assert_html_equal(actual_html, BASE_HTML % '''
+ 
+ ''' % image_data)
+
+
+def test_has_image_using_image_handler():
+ raise SkipTest('This needs to be converted to an xml test')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'has_image.docx',
+ )
+
+ def image_handler(*args, **kwargs):
+ return 'test'
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ ''')
+
+
+def test_headers_with_full_line_styles():
+ raise SkipTest('This test is not yet passing')
+    # Show that if a natural header is entirely bold/italics, the
+    # bold/italics will get stripped out.
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'headers_with_full_line_styles.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ BBB
+ CCC
+ ''')
+
+
+def test_convert_p_to_h():
+ raise SkipTest('This test is not yet passing')
+ # Show when it is correct to convert a p tag to an h tag based on
+ # bold/italics
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'convert_p_to_h.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ BBB
+ CCC
+
+ - DDD
+ - EEE
+ - FFF
+
+
+
+ GGG |
+ HHH |
+
+
+ III |
+ JJJ |
+
+
+ ''')
+
+
+def test_fake_headings_by_length():
+ raise SkipTest('This test is not yet passing')
+    # Show that converting p tags to h tags has a length limit. If the p tag
+    # would otherwise be converted to an h tag but the paragraph has more
+    # than seven words, do not convert it.
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'fake_headings_by_length.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ Heading.
+ Still a heading.
+
+ This is not a heading because it is too many words.
+
+ ''')
+
+
+def test_shift_enter():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'shift_enter.docx',
+ )
+
+ # Test just the convert without clean_html to make sure the first
+ # break tag is present.
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
BBB
+ CCC
+
+ - DDD
EEE
+ - FFF
+
+
+
+ GGG HHH |
+ III JJJ |
+
+
+ KKK |
+ LLL |
+
+
+ ''')
+
+
+def test_lists_with_styles():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'lists_with_styles.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ - AAA
+ - BBB
+
+ - CCC
+ - DDD
+
+ - EEE
+
+ - FFF
+
+
+
+
+
+
+
+ ''')
+
+
+def test_list_to_header():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'list_to_header.docx',
+ )
+ actual_html = convert(file_path, convert_root_level_upper_roman=True)
+    # Note that list item `GGG` is upper roman in the Word document to show
+    # that only top-level upper roman list items get converted.
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+
+ - BBB
+
+ CCC
+
+ - DDD
+
+ EEE
+
+ - FFF
+
+ - GGG
+
+
+
+ ''')
+
+
+def test_has_title():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'has_title.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ Title
+ Text
+ ''')
+
+
+def test_upper_alpha_all_bold():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'upper_alpha_all_bold.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ BBB
+ CCC
+ ''')
+
+
+def test_simple_table():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'simple_table.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+
+ Cell1
+ Cell3
+ |
+ Cell2
+ And I am writing in the table
+ |
+
+
+ Cell4 |
+
+
+ ''')
+
+
+def test_justification():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'justification.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
Center Justified
+
+
+
Right justified
+
+
+
+ Right justified and pushed in from right
+
+
+
+
+ Center justified and pushed in from left and it is
+ great and it is the coolest thing of all time and I like it and
+ I think it is cool
+
+
+
+
+ Left justified and pushed in from left
+
+
+ ''')
+
+
+def _converter(*args, **kwargs):
+ # Having a converter that does nothing is the same as if abiword fails to
+ # convert.
+ pass
+
+
+#def test_converter_broken():
+# file_path = 'test.doc'
+# assert_raises(
+# ConversionFailed,
+# lambda: convert(file_path, converter=_converter),
+# )
+
+
+def test_fall_back():
+ raise SkipTest('This test is not yet passing')
+ file_path = 'test.doc'
+
+ def fall_back(*args, **kwargs):
+ return 'success'
+ html = convert(file_path, fall_back=fall_back, converter=_converter)
+ assert html == 'success'
+
+
+#@mock.patch('docx2html.core.read_html_file')
+#@mock.patch('docx2html.core.get_zip_file_handler')
+#def test_html_files(patch_zip_handler, patch_read):
+def test_html_files():
+ raise SkipTest('This test is not yet passing')
+
+ def raise_assertion(*args, **kwargs):
+ raise AssertionError('Should not have called get_zip_file_handler')
+ #patch_zip_handler.side_effect = raise_assertion
+
+ def return_text(*args, **kwargs):
+ return 'test'
+ #patch_read.side_effect = return_text
+
+ # Try with an html file
+ file_path = 'test.html'
+
+ html = convert(file_path)
+ assert html == 'test'
+
+ # Try again with an htm file.
+ file_path = 'test.htm'
+
+ html = convert(file_path)
+ assert html == 'test'
diff --git a/pydocx/tests/test_xml.py b/pydocx/tests/test_xml.py
new file mode 100644
index 00000000..adfd05cb
--- /dev/null
+++ b/pydocx/tests/test_xml.py
@@ -0,0 +1,1447 @@
+import os
+import time
+
+from nose.plugins.skip import SkipTest
+
+from pydocx.tests.document_builder import DocxBuilder as DXB
+from pydocx.tests import (
+ XMLDocx2Html,
+ _TranslationTestCase,
+)
+from pydocx.utils import parse_xml_from_string, find_all
+
+
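+# Each test case below builds a minimal document.xml with DocxBuilder (DXB)
+# and declares the markup it expects. _TranslationTestCase (in pydocx.tests)
+# is assumed to generate tests comparing the parsed output of get_xml()
+# against expected_output.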
+class BoldTestCase(_TranslationTestCase):
+ expected_output = """
+ AAA
+ BBB
+ """
+ latex_expected_output = r'''
+ \textbf{AAA}'''\
+ + "\n" + '''BBB''' + "\n"
+
+ def get_xml(self):
+ tags = [
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('AAA')],
+ rpr=DXB.rpr_tag({'b': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('BBB')],
+ rpr=DXB.rpr_tag({'b': 'false'}),
+ ),
+ ],
+ ),
+ ]
+
+ body = ''
+ for tag in tags:
+ body += tag
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkVanillaTestCase(_TranslationTestCase):
+
+ relationship_dict = {
+ 'rId0': 'www.google.com',
+ }
+
+ expected_output = '''
+ link.
+ '''
+
+ latex_expected_output = r'''
+ \href{www.google.com}{link}.
+ '''
+
+ def get_xml(self):
+ run_tags = []
+ run_tags.append(DXB.r_tag([DXB.t_tag('link')]))
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ run_tags.append(DXB.r_tag([DXB.t_tag('.')]))
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkWithMultipleRunsTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'www.google.com',
+ }
+
+ expected_output = '''
+ link.
+ '''
+
+ latex_expected_output = r'''
+ \href{www.google.com}{link}.
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'link']
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ run_tags.append(DXB.r_tag([DXB.t_tag('.')]))
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkNoTextTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'www.google.com',
+ }
+
+ expected_output = ''
+
+ latex_expected_output = ''
+
+ def get_xml(self):
+ run_tags = []
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkNotInRelsDictTestCase(_TranslationTestCase):
+ relationship_dict = {
+ # 'rId0': 'www.google.com', missing
+ }
+
+ expected_output = 'link.
'
+
+ latex_expected_output = r'''
+ link.
+ '''
+
+ def get_xml(self):
+ run_tags = []
+ run_tags.append(DXB.r_tag([DXB.t_tag('link')]))
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ run_tags.append(DXB.r_tag([DXB.t_tag('.')]))
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkWithBreakTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'www.google.com',
+ }
+
+ expected_output = 'link
'
+
+ latex_expected_output = r'''
+ \href{www.google.com}{link\\}
+ '''
+
+ def get_xml(self):
+ run_tags = []
+ run_tags.append(DXB.r_tag([DXB.t_tag('link')]))
+ run_tags.append(DXB.r_tag([DXB.linebreak()]))
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class ImageLocal(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'media/image1.jpeg',
+ 'rId1': 'media/image2.jpeg',
+ }
+ expected_output = '''
+ 
+ 
+ '''
+
+ latex_expected_output = r'''
+ \includegraphics {word/media/image1.jpeg}
+ ''' + '\n' + '''
+ \includegraphics {word/media/image2.jpeg}
+ '''
+
+ def get_xml(self):
+ drawing = DXB.drawing(height=None, width=None, r_id='rId0')
+ pict = DXB.pict(height=None, width=None, r_id='rId1')
+ tags = [
+ drawing,
+ pict,
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class ImageTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'media/image1.jpeg',
+ 'rId1': 'media/image2.jpeg',
+ }
+ expected_output = '''
+
+
+
+
+
+
+ '''
+
+ latex_expected_output = r'''
+ \includegraphics[height=20pxpt, width=30.0pt]{word/media/image1.jpeg}
+ ''' + '\n' + '''
+ \includegraphics[height=21ptpt, width=41pt]{word/media/image2.jpeg}
+ '''
+
+ def get_xml(self):
+ drawing = DXB.drawing(height=20, width=40, r_id='rId0')
+ pict = DXB.pict(height=21, width=41, r_id='rId1')
+ tags = [
+ drawing,
+ pict,
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+ def test_get_image_id(self):
+ parser = XMLDocx2Html(
+ document_xml=self.get_xml(),
+ rels_dict=self.relationship_dict,
+ )
+ tree = parse_xml_from_string(self.get_xml())
+ els = []
+ els.extend(find_all(tree, 'drawing'))
+ els.extend(find_all(tree, 'pict'))
+ image_ids = []
+ for el in els:
+ image_ids.append(parser._get_image_id(el))
+ expected = [
+ 'rId0',
+ 'rId1',
+ ]
+ self.assertEqual(
+ set(image_ids),
+ set(expected),
+ )
+
+ def test_get_image_sizes(self):
+ parser = XMLDocx2Html(
+ document_xml=self.get_xml(),
+ rels_dict=self.relationship_dict,
+ )
+ tree = parse_xml_from_string(self.get_xml())
+ els = []
+ els.extend(find_all(tree, 'drawing'))
+ els.extend(find_all(tree, 'pict'))
+ image_ids = []
+ for el in els:
+ image_ids.append(parser._get_image_size(el))
+ expected = [
+ ('40px', '20px'),
+ ('41pt', '21pt'),
+ ]
+ self.assertEqual(
+ set(image_ids),
+ set(expected),
+ )
+
+
+class ImageNotInRelsDictTestCase(_TranslationTestCase):
+ relationship_dict = {
+ # 'rId0': 'media/image1.jpeg',
+ }
+ expected_output = ''
+
+ latex_expected_output = ''
+
+ def get_xml(self):
+ drawing = DXB.drawing(height=20, width=40, r_id='rId0')
+ body = drawing
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class ImageNoSizeTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': os.path.join(
+ os.path.abspath(os.path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'bullet_go_gray.png',
+ )
+ }
+ image_sizes = {
+ 'rId0': (0, 0),
+ }
+ expected_output = '''
+
+
+
+
+
+ ''' % relationship_dict['rId0']
+
+ latex_expected_output = r'\includegraphics{%s}' % relationship_dict['rId0']
+
+ @staticmethod
+ def image_handler(image_id, relationship_dict):
+ return relationship_dict.get(image_id)
+
+ def get_xml(self):
+ raise SkipTest(
+ 'Since we are not using PIL, we do not need this test yet.',
+ )
+ drawing = DXB.drawing('rId0')
+ tags = [
+ drawing,
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class TableTag(_TranslationTestCase):
+ expected_output = '''
+
+
+ AAA |
+ BBB |
+
+
+ CCC |
+ DDD |
+
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{tabular}{ll}
+ {AAA} & {BBB} \\
+ {CCC} & {DDD} \\
+ \end{tabular}
+ '''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class RowSpanTestCase(_TranslationTestCase):
+
+ expected_output = '''
+
+
+ AAA |
+ BBB |
+
+
+ CCC |
+
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{tabular}{ll}
+ \multirow{2}{*}{AAA} & {BBB} \\
+ & {CCC} \\
+ \end{tabular}
+ '''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(
+ paragraph=DXB.p_tag('AAA'), merge=True, merge_continue=False)
+ cell2 = DXB.table_cell(
+ paragraph=DXB.p_tag(None), merge=False, merge_continue=True)
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class NestedTableTag(_TranslationTestCase):
+ expected_output = '''
+
+
+ AAA |
+ BBB |
+
+
+ CCC |
+
+
+
+ DDD |
+ EEE |
+
+
+ FFF |
+ GGG |
+
+
+ |
+
+
+ '''
+
+ latex_expected_output = r'''\begin{tabular}{ll}
+ {AAA} & {BBB} \\
+ {CCC} & {
+ \begin{tabular}{ll}
+ {DDD} & {EEE} \\
+ {FFF} & {GGG} \\
+ \end{tabular}
+ } \\
+ \end{tabular}'''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ nested_table = DXB.table(rows)
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ cell4 = DXB.table_cell(nested_table)
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class TableWithInvalidTag(_TranslationTestCase):
+ expected_output = '''
+
+
+ AAA |
+ BBB |
+
+
+ |
+ DDD |
+
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{tabular}{ l l }
+ {AAA} & {BBB} \\
+ {} & {DDD} \\
+ \end{tabular}
+ '''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
+ cell2 = DXB.table_cell('CCC')
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class TableWithListAndParagraph(_TranslationTestCase):
+ expected_output = '''
+
+
+
+
+ - AAA
+ - BBB
+
+ CCC
+ DDD
+ |
+
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{tabular}{p{3cm}}
+ \parbox{20cm}{\begin{enumerate} \item AAA
+ \item BBB
+ \end{enumerate}CCC\\DDD} \\
+ \end{tabular}'''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ('BBB', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+ els = [
+ lis,
+ DXB.p_tag('CCC'),
+ DXB.p_tag('DDD'),
+ ]
+ td = ''
+ for el in els:
+ td += el
+ cell1 = DXB.table_cell(td)
+ row = DXB.table_row([cell1])
+ table = DXB.table([row])
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class SimpleListTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+ - BBB
+ - CCC
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA
+ \item BBB
+ \item CCC
+ \end {enumerate}
+ '''
+    # Ensure it's not failing somewhere and falling back to decimal
+ numbering_dict = {
+ '1': {
+ '0': 'lowerLetter',
+ }
+ }
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ('BBB', 0, 1),
+ ('CCC', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(lis)
+ return xml
+
+
+class SingleListItemTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+ '''
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA
+ \end {enumerate}
+ '''
+
+    # Ensure it's not failing somewhere and falling back to decimal
+ numbering_dict = {
+ '1': {
+ '0': 'lowerLetter',
+ }
+ }
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(lis)
+ return xml
+
+
+class ListWithContinuationTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
BBB
+ - CCC
+
+
+ DDD |
+ EEE |
+
+
+ FFF |
+ GGG |
+
+
+
+ - HHH
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA \\ BBB
+ \item CCC
+ \begin{tabular} {ll}
+ {DDD} & {EEE} \\
+ {FFF} & {GGG} \\
+ \end{tabular}
+ \item HHH
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ tags = [
+ DXB.li(text='AAA', ilvl=0, numId=1),
+ DXB.p_tag('BBB'),
+ DXB.li(text='CCC', ilvl=0, numId=1),
+ table,
+ DXB.li(text='HHH', ilvl=0, numId=1),
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class ListWithMultipleContinuationTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+
+
+ - DDD
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA
+ \begin{tabular} {l}
+ {BBB}\\
+ \end{tabular}
+ \begin{tabular} {l}
+ {CCC}\\
+ \end{tabular}
+ \item DDD
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ cell = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ row = DXB.table_row([cell])
+ table1 = DXB.table([row])
+ cell = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+ row = DXB.table_row([cell])
+ table2 = DXB.table([row])
+ tags = [
+ DXB.li(text='AAA', ilvl=0, numId=1),
+ table1,
+ table2,
+ DXB.li(text='DDD', ilvl=0, numId=1),
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class MangledIlvlTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+
+ - BBB
+
+ - CCC
+
+
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA
+ \end{enumerate}
+ \begin{enumerate}
+ \item BBB
+ \begin{enumerate}
+ \item CCC
+ \end{enumerate}
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 2),
+ ('BBB', 1, 1),
+ ('CCC', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(lis)
+ return xml
+
+
+class SeperateListsTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+
+ - BBB
+
+
+ - CCC
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA
+ \end{enumerate}
+ \begin{enumerate}
+ \item BBB
+ \end{enumerate}
+ \begin{enumerate}
+ \item CCC
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 2),
+            # Because AAA and CCC are part of the same list (same list id)
+            # and BBB is different, these need to be split into three
+            # lists (or we would lose everything from BBB onward).
+ ('BBB', 0, 1),
+ ('CCC', 0, 2),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(lis)
+ return xml
+
+
+class InvalidIlvlOrderTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+ - BBB
+
+ - CCC
+
+
+
+
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA
+ \begin{enumerate}
+ \item BBB
+ \begin{enumerate}
+ \item CCC
+ \end {enumerate}
+ \end{enumerate}
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ tags = [
+ DXB.li(text='AAA', ilvl=1, numId=1),
+ DXB.li(text='BBB', ilvl=3, numId=1),
+ DXB.li(text='CCC', ilvl=2, numId=1),
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class DeeplyNestedTableTestCase(_TranslationTestCase):
+ expected_output = ''
+ run_expected_output = False
+
+ def get_xml(self):
+ paragraph = DXB.p_tag('AAA')
+
+ for _ in range(50):
+ cell = DXB.table_cell(paragraph)
+            row = DXB.table_row([cell])
+            table = DXB.table([row])
+            # Feed the table back in as the next iteration's cell content so
+            # the tables actually nest.
+            paragraph = table
+        body = table
+ xml = DXB.xml(body)
+ return xml
+
+ def test_performance(self):
+ with self.toggle_run_expected_output():
+ start_time = time.time()
+ try:
+ self.test_expected_output()
+ except AssertionError:
+ pass
+ end_time = time.time()
+ total_time = end_time - start_time
+ # This finishes in under a second on python 2.7
+ assert total_time < 3, total_time
+
+
+class NonStandardTextTagsTestCase(_TranslationTestCase):
+ expected_output = '''
+ insert
+ smarttag
+ '''
+
+ latex_expected_output = r'''
+ \added[id=, remark=]{insert} smarttag
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'insert ']
+ insert_tag = DXB.insert_tag(run_tags)
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'smarttag']
+ smart_tag = DXB.smart_tag(run_tags)
+
+ run_tags = [insert_tag, smart_tag]
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class RTagWithNoText(_TranslationTestCase):
+ expected_output = ''
+ latex_expected_output = ''
+
+ def get_xml(self):
+ p_tag = DXB.p_tag(None) # No text
+ run_tags = [p_tag]
+ # The bug is only present in a hyperlink
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ body = DXB.p_tag(run_tags)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class DeleteTagInList(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+ BBB
+
+ - CCC
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA \deleted[id=, remark=]{BBB}
+ \item CCC
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ delete_tags = DXB.delete_tag(['BBB'])
+ p_tag = DXB.p_tag([delete_tags])
+
+ body = DXB.li(text='AAA', ilvl=0, numId=0)
+ body += p_tag
+ body += DXB.li(text='CCC', ilvl=0, numId=0)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class InsertTagInList(_TranslationTestCase):
+ expected_output = '''
+
+ - AAABBB
+
+ - CCC
+
+ '''
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA\added[id=,remark=]{BBB}
+ \item CCC
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'BBB']
+ insert_tags = DXB.insert_tag(run_tags)
+ p_tag = DXB.p_tag([insert_tags])
+
+ body = DXB.li(text='AAA', ilvl=0, numId=0)
+ body += p_tag
+ body += DXB.li(text='CCC', ilvl=0, numId=0)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class SmartTagInList(_TranslationTestCase):
+ expected_output = '''
+
+ - AAABBB
+
+ - CCC
+
+ '''
+
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAABBB
+ \item CCC
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'BBB']
+ smart_tag = DXB.smart_tag(run_tags)
+ p_tag = DXB.p_tag([smart_tag])
+
+ body = DXB.li(text='AAA', ilvl=0, numId=0)
+ body += p_tag
+ body += DXB.li(text='CCC', ilvl=0, numId=0)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class SingleListItem(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+ BBB
+ '''
+
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA
+ \end{enumerate}''' + '\n' + 'BBB'
+
+ numbering_dict = {
+ '1': {
+ '0': 'lowerLetter',
+ }
+ }
+
+ def get_xml(self):
+ li = DXB.li(text='AAA', ilvl=0, numId=1)
+ p_tags = [
+ DXB.p_tag('BBB'),
+ ]
+ body = li
+ for p_tag in p_tags:
+ body += p_tag
+ xml = DXB.xml(body)
+ return xml
+
+
+class SimpleTableTest(_TranslationTestCase):
+ expected_output = '''
+
+
+ Blank |
+ Column 1 |
+ Column 2 |
+
+
+ Row 1 |
+ First |
+ Second |
+
+
+ Row 2 |
+ Third |
+ Fourth |
+
+
'''
+
+ latex_expected_output = r'''
+ \begin{tabular} { lll }
+ {Blank} & {Column 1} & {Column 2} \\
+ {Row 1} & {First} & {Second} \\
+ {Row 2} & {Third} & {Fourth} \\
+ \end{tabular}'''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('Blank'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('Row 1'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('Row 2'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('Column 1'))
+ cell5 = DXB.table_cell(paragraph=DXB.p_tag('First'))
+ cell6 = DXB.table_cell(paragraph=DXB.p_tag('Third'))
+ cell7 = DXB.table_cell(paragraph=DXB.p_tag('Column 2'))
+ cell8 = DXB.table_cell(paragraph=DXB.p_tag('Second'))
+ cell9 = DXB.table_cell(paragraph=DXB.p_tag('Fourth'))
+ rows = [DXB.table_row([cell1, cell4, cell7]),
+ DXB.table_row([cell2, cell5, cell8]),
+ DXB.table_row([cell3, cell6, cell9])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class MissingIlvl(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+ BBB
+
+ - CCC
+
+ '''
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAA \\
+ BBB
+ \item CCC
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ('BBB', None, 1), # Because why not.
+ ('CCC', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+ body = lis
+ xml = DXB.xml(body)
+ return xml
+
+
+class SameNumIdInTable(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+
+ - CCC
+
+ '''
+ latex_expected_output = r'''
+ \begin{enumerate} \item AAA
+ \begin{tabular}{p{3cm}}
+ {\begin{enumerate} \item BBB
+ \end{enumerate}} \\
+ \end{tabular}
+ \item CCC
+ \end{enumerate}
+ '''
+
+    # Ensure it's not failing somewhere and falling back to decimal
+ numbering_dict = {
+ '1': {
+ '0': 'lowerLetter',
+ }
+ }
+
+ def get_xml(self):
+ li_text = [
+ ('BBB', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+ cell1 = DXB.table_cell(lis)
+ rows = DXB.table_row([cell1])
+ table = DXB.table([rows])
+ lis = ''
+ lis += DXB.li(text='AAA', ilvl=0, numId=1)
+ lis += table
+ lis += DXB.li(text='CCC', ilvl=0, numId=1)
+ body = lis
+ xml = DXB.xml(body)
+ return xml
+
+
+class SDTTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAABBB
+
+ - CCC
+
+ '''
+ latex_expected_output = r'''
+ \begin{enumerate}
+ \item AAABBB
+ \item CCC
+ \end{enumerate}
+ '''
+
+ def get_xml(self):
+ body = ''
+ body += DXB.li(text='AAA', ilvl=0, numId=0)
+ body += DXB.sdt_tag(p_tag=DXB.p_tag(text='BBB'))
+ body += DXB.li(text='CCC', ilvl=0, numId=0)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class HeadingTestCase(_TranslationTestCase):
+ expected_output = '''
+ AAA
+ BBB
+ CCC
+ DDD
+ EEE
+ GGG
+ HHH
+ '''
+
+ latex_expected_output = r'''\section{AAA}
+ ''' + '\n' + '''
+ \subsection{BBB}
+ ''' + '\n' + '''
+ \paragraph{CCC}
+ ''' + '\n' + '''
+ \subparagraph{DDD}
+ ''' + '\n' + '''
+ EEE
+ ''' + '\n' + '''
+ GGG
+ ''' + '\n' + '''
+ HHH
+ '''
+
+ styles_dict = {
+ 'style0': 'heading 1',
+ 'style1': 'heading 2',
+ 'style2': 'heading 3',
+ 'style3': 'heading 4',
+ 'style4': 'heading 5',
+ 'style5': 'heading 6',
+ }
+
+ def get_xml(self):
+ p_tags = [
+ DXB.p_tag(text='AAA', style='style0'),
+ DXB.p_tag(text='BBB', style='style1'),
+ DXB.p_tag(text='CCC', style='style2'),
+ DXB.p_tag(text='DDD', style='style3'),
+ DXB.p_tag(text='EEE', style='style4'),
+ DXB.p_tag(text='GGG', style='style5'),
+ DXB.p_tag(text='HHH', style='garbage'),
+ ]
+ body = ''
+ for tag in p_tags:
+ body += tag
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class RomanNumeralToHeadingTestCase(_TranslationTestCase):
+ convert_root_level_upper_roman = True
+ numbering_dict = {
+ '1': {
+ '0': 'upperRoman',
+ '1': 'decimal',
+ '2': 'upperRoman',
+ },
+ '2': {
+ '0': 'upperRoman',
+ '1': 'decimal',
+ '2': 'upperRoman',
+ },
+ '3': {
+ '0': 'upperRoman',
+ '1': 'decimal',
+ '2': 'upperRoman',
+ },
+ }
+ expected_output = '''
+ AAA
+
+ - BBB
+
+ CCC
+
+ - DDD
+
+ EEE
+
+ - FFF
+
+ - GGG
+
+
+
+ '''
+
+ latex_expected_output = r'''
+ \subsection{AAA}\begin{enumerate} \item BBB
+ \end{enumerate}\subsection{CCC}\begin{enumerate} \item DDD
+ \end{enumerate}\subsection{EEE}\begin{enumerate}
+ \item FFF\begin{enumerate} \item GGG
+ \end{enumerate}
+ \end{enumerate}'''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ('BBB', 1, 1),
+ ('CCC', 0, 2),
+ ('DDD', 1, 2),
+ ('EEE', 0, 3),
+ ('FFF', 1, 3),
+ ('GGG', 2, 3),
+ ]
+ body = ''
+ for text, ilvl, numId in li_text:
+ body += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class MultipleTTagsInRTag(_TranslationTestCase):
+ expected_output = '''
+ ABC
+ '''
+ latex_expected_output = 'ABC'
+
+ def get_xml(self):
+ r_tag = DXB.r_tag(
+ [DXB.t_tag(letter) for letter in 'ABC'],
+ )
+ p_tag = DXB.p_tag(
+ [r_tag],
+ jc='start',
+ )
+ body = p_tag
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class SuperAndSubScripts(_TranslationTestCase):
+ expected_output = '''
+ AAABBB
+ CCCDDD
+ '''
+
+ latex_expected_output = r'''
+ AAA \textsuperscript{BBB}
+ ''' + '\n' + r'\textsubscript{CCC} DDD'
+
+ def get_xml(self):
+ p_tags = [
+ DXB.p_tag(
+ [
+ DXB.r_tag([DXB.t_tag('AAA')]),
+ DXB.r_tag(
+ [DXB.t_tag('BBB')],
+ rpr=DXB.rpr_tag({'vertAlign': 'superscript'}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('CCC')],
+ rpr=DXB.rpr_tag({'vertAlign': 'subscript'}),
+ ),
+ DXB.r_tag([DXB.t_tag('DDD')]),
+ ],
+ ),
+ ]
+ body = ''
+ for p_tag in p_tags:
+ body += p_tag
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class AvaliableInlineTags(_TranslationTestCase):
+ expected_output = '''
+ aaa
+ bbb
+ ccc
+ ddd
+ eee
+ fff
+ ggg
+ hhh
+ iii
+ jjj
+ '''
+
+ latex_expected_output = r'''\textbf {aaa}
+ \underline {bbb}
+ \emph {ccc}
+ \MakeUppercase{ddd}
+ \textsx{eee}
+ \sout{fff}
+ \sout{ggg}
+ \begin{comment}hhh\end{comment}
+ \begin{comment}iii\end{comment}
+ \textsuperscript{jjj}
+ '''
+
+ def get_xml(self):
+ p_tags = [
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('aaa')],
+ rpr=DXB.rpr_tag({'b': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('bbb')],
+ rpr=DXB.rpr_tag({'u': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('ccc')],
+ rpr=DXB.rpr_tag({'i': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('ddd')],
+ rpr=DXB.rpr_tag({'caps': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('eee')],
+ rpr=DXB.rpr_tag({'smallCaps': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('fff')],
+ rpr=DXB.rpr_tag({'strike': None})
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('ggg')],
+ rpr=DXB.rpr_tag({'dstrike': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('hhh')],
+ rpr=DXB.rpr_tag({'vanish': None})
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('iii')],
+ rpr=DXB.rpr_tag({'webHidden': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('jjj')],
+ rpr=DXB.rpr_tag({'vertAlign': 'superscript'}),
+ ),
+ ],
+ ),
+ ]
+ body = ''
+ for p_tag in p_tags:
+ body += p_tag
+
+ xml = DXB.xml(body)
+ return xml
diff --git a/pydocx/utils.py b/pydocx/utils.py
new file mode 100644
index 00000000..e3db8bfe
--- /dev/null
+++ b/pydocx/utils.py
@@ -0,0 +1,425 @@
+from collections import defaultdict
+from xml.etree import cElementTree
+
+
+UPPER_ROMAN_TO_HEADING_VALUE = 'h2'
+TAGS_CONTAINING_CONTENT = (
+ 't',
+ 'pict',
+ 'drawing',
+ 'delText',
+ 'ins',
+)
+TAGS_HOLDING_CONTENT_TAGS = (
+ 'p',
+ 'tbl',
+ 'sdt',
+)
+
+
+def el_iter(el):
+ """
+    Iterate over all elements in the tree rooted at `el`.
+ """
+ try:
+ return el.iter()
+ except AttributeError:
+ return el.findall('.//*')
+
+
+def find_first(el, tag):
+ """
+ Find the first occurrence of a tag beneath the current element.
+ """
+ return el.find('.//' + tag)
+
+
+def find_all(el, tag):
+ """
+ Find all occurrences of a tag
+ """
+ return el.findall('.//' + tag)
+
+
+def find_ancestor_with_tag(pre_processor, el, tag):
+ """
+    Find the first ancestor whose tag is `tag`.
+ """
+ while pre_processor.parent(el) is not None:
+ el = pre_processor.parent(el)
+ if el.tag == tag:
+ return el
+ return None
+
+
+def has_descendant_with_tag(el, tag):
+ """
+    Determine whether the element has a descendant with the given `tag`.
+    """
+    # find returns the first matching descendant, or None if there is none.
+    return el.find('.//' + tag) is not None
+
+
+def _filter_children(element, tags):
+ return [
+ el for el in element.getchildren()
+ if el.tag in tags
+ ]
+
+
+def remove_namespaces(document):
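+    """
+    Return the document re-serialized with the namespace prefix stripped
+    from every tag and attribute name, so a tag like '{some/namespace}p'
+    becomes just 'p'.
+    """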
+ root = cElementTree.fromstring(document)
+ for child in el_iter(root):
+        child.tag = child.tag.split("}")[-1]
+ child.attrib = dict(
+ (k.split("}")[-1], v)
+ for k, v in child.attrib.items()
+ )
+ return cElementTree.tostring(root)
+
+
+def get_list_style(numbering_root, num_id, ilvl):
+ # This is needed on both the custom lxml parser and the pydocx parser. So
+ # make it a function.
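+    # Roughly: find the num element whose numId matches, follow its
+    # abstractNumId to the matching abstractNum, and return the numFmt val
+    # (e.g. 'decimal', 'lowerLetter', 'upperRoman') for the requested ilvl.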
+ ids = find_all(numbering_root, 'num')
+ for _id in ids:
+ if _id.attrib['numId'] != num_id:
+ continue
+ abstractid = _id.find('abstractNumId')
+ abstractid = abstractid.attrib['val']
+ style_information = find_all(
+ numbering_root,
+ 'abstractNum',
+ )
+ for info in style_information:
+ if info.attrib['abstractNumId'] == abstractid:
+ for i in el_iter(info):
+ if (
+ 'ilvl' in i.attrib and
+ i.attrib['ilvl'] != ilvl):
+ continue
+ if i.find('numFmt') is not None:
+ return i.find('numFmt').attrib['val']
+
+
+class NamespacedNumId(object):
+ def __init__(self, num_id, num_tables, *args, **kwargs):
+ self._num_id = num_id
+ self._num_tables = num_tables
+
+ def __unicode__(self, *args, **kwargs):
+ return '%s:%d' % (
+ self._num_id,
+ self._num_tables,
+ )
+
+ def __repr__(self, *args, **kwargs):
+ return self.__unicode__(*args, **kwargs)
+
+ def __eq__(self, other):
+ if other is None:
+ return False
+ return repr(self) == repr(other)
+
+    def __ne__(self, other):
+        # A NamespacedNumId is never equal to None, so `!= None` is True.
+        if other is None:
+            return True
+        return repr(self) != repr(other)
+
+ @property
+ def num_id(self):
+ return self._num_id
+
+
+class PydocxPrePorcessor(object):
+ def __init__(
+ self,
+ convert_root_level_upper_roman=False,
+ styles_dict=None,
+ numbering_root=None,
+ *args, **kwargs):
+ self.meta_data = defaultdict(dict)
+ self.convert_root_level_upper_roman = convert_root_level_upper_roman
+ self.styles_dict = styles_dict
+ self.numbering_root = numbering_root
+
+ def perform_pre_processing(self, root, *args, **kwargs):
+ self._add_parent(root)
+ self._set_list_attributes(root)
+ self._set_table_attributes(root)
+ self._set_is_in_table(root)
+
+ body = find_first(root, 'body')
+ p_elements = [
+ child for child in find_all(body, 'p')
+ ]
+ list_elements = [
+ child for child in p_elements
+ if self.is_list_item(child)
+ ]
+ # Find the first and last li elements
+ num_ids = set([self.num_id(i) for i in list_elements])
+ ilvls = set([self.ilvl(i) for i in list_elements])
+ self._set_first_list_item(num_ids, ilvls, list_elements)
+ self._set_last_list_item(num_ids, list_elements)
+
+ self._set_headers(p_elements)
+ self._convert_upper_roman(body)
+ self._set_next(body)
+
+ def is_first_list_item(self, el):
+ return self.meta_data[el].get('is_first_list_item', False)
+
+ def is_last_list_item_in_root(self, el):
+ return self.meta_data[el].get('is_last_list_item_in_root', False)
+
+ def is_list_item(self, el):
+ return self.meta_data[el].get('is_list_item', False)
+
+ def num_id(self, el):
+ if not self.is_list_item(el):
+ return None
+ return self.meta_data[el].get('num_id')
+
+ def ilvl(self, el):
+ if not self.is_list_item(el):
+ return None
+ return self.meta_data[el].get('ilvl')
+
+ def heading_level(self, el):
+ return self.meta_data[el].get('heading_level')
+
+ def is_in_table(self, el):
+ return self.meta_data[el].get('is_in_table')
+
+ def is_last_row_item(self, el):
+ return self.meta_data[el].get('is_last_row_item')
+
+ def row_index(self, el):
+ return self.meta_data[el].get('row_index')
+
+ def column_index(self, el):
+ return self.meta_data[el].get('column_index')
+
+ def vmerge_continue(self, el):
+ return self.meta_data[el].get('vmerge_continue')
+
+ def next(self, el):
+ if el not in self.meta_data:
+ return
+ return self.meta_data[el].get('next')
+
+ def previous(self, el):
+ if el not in self.meta_data:
+ return
+ return self.meta_data[el].get('previous')
+
+ def parent(self, el):
+ return self.meta_data[el].get('parent')
+
+    def _add_parent(self, el):  # store each element's parent in meta_data
+ for child in el.getchildren():
+ self.meta_data[child]['parent'] = el
+ self._add_parent(child)
+
+ def _set_list_attributes(self, el):
+ list_elements = find_all(el, 'numId')
+ for li in list_elements:
+ parent = find_ancestor_with_tag(self, li, 'p')
+ # Deleted text in a list will have a numId but no ilvl.
+ if parent is None:
+ continue
+ if find_first(parent, 'ilvl') is None:
+ continue
+ self.meta_data[parent]['is_list_item'] = True
+ self.meta_data[parent]['num_id'] = self._generate_num_id(parent)
+ self.meta_data[parent]['ilvl'] = find_first(
+ parent,
+ 'ilvl',
+ ).attrib['val']
+
+ def _generate_num_id(self, el):
+        '''
+        Fun fact: It is possible to have a list in the root that holds a
+        table that holds a list, and for both lists to have the same numId.
+        When this happens we namespace the nested list with the number of
+        tables it is in to ensure it is treated as a new list. Otherwise all
+        sorts of terrible html gets generated.
+        '''
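+        # For example (hypothetical values): a root level list and a list
+        # nested inside one table can both use numId '1', but they come out
+        # as NamespacedNumId('1', 0) and NamespacedNumId('1', 1), which
+        # compare as different lists.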
+ num_id = find_first(el, 'numId').attrib['val']
+
+ # First, go up the parent until we get None and count the number of
+ # tables there are.
+ num_tables = 0
+ while self.parent(el) is not None:
+ if el.tag == 'tbl':
+ num_tables += 1
+ el = self.parent(el)
+ return NamespacedNumId(
+ num_id=num_id,
+ num_tables=num_tables,
+ )
+
+ def _set_first_list_item(self, num_ids, ilvls, list_elements):
+ # Lists are grouped by having the same `num_id` and `ilvl`. The first
+ # list item is the first list item found for each `num_id` and `ilvl`
+ # combination.
+ for num_id in num_ids:
+ for ilvl in ilvls:
+ filtered_list_elements = [
+ i for i in list_elements
+ if (
+ self.num_id(i) == num_id and
+ self.ilvl(i) == ilvl
+ )
+ ]
+ if not filtered_list_elements:
+ continue
+ first_el = filtered_list_elements[0]
+ self.meta_data[first_el]['is_first_list_item'] = True
+
+ def _set_last_list_item(self, num_ids, list_elements):
+        # Find the last list element for each num_id. Only list items in the
+        # root of the document are marked as the last item. This is only used
+        # to ensure that once a root level list is finished we do not roll
+        # the remaining non-list elements into the first root level list.
+ for num_id in num_ids:
+ filtered_list_elements = [
+ i for i in list_elements
+ if self.num_id(i) == num_id
+ ]
+ if not filtered_list_elements:
+ continue
+ last_el = filtered_list_elements[-1]
+ self.meta_data[last_el]['is_last_list_item_in_root'] = True
+
+ def _set_table_attributes(self, el):
+ tables = find_all(el, 'tbl')
+ for table in tables:
+ rows = _filter_children(table, ['tr'])
+ if rows is None:
+ continue
+ for i, row in enumerate(rows):
+ tcs = _filter_children(row, ['tc'])
+ self.meta_data[tcs[-1]]['is_last_row_item'] = True
+ for j, child in enumerate(tcs):
+ self.meta_data[child]['row_index'] = i
+ self.meta_data[child]['column_index'] = j
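+                    # A vMerge with val='continue' (or with no val at all,
+                    # which Word treats as an implied continue) extends the
+                    # vertical merge started in the row above.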
+ v_merge = find_first(child, 'vMerge')
+ if (
+ v_merge is not None and
+ ('continue' == v_merge.get('val', '') or
+ v_merge.attrib == {})
+ ):
+ self.meta_data[child]['vmerge_continue'] = True
+
+ def _set_is_in_table(self, el):
+ paragraph_elements = find_all(el, 'p')
+ for p in paragraph_elements:
+ if find_ancestor_with_tag(self, p, 'tc') is not None:
+ self.meta_data[p]['is_in_table'] = True
+
+ def _set_headers(self, elements):
+ # These are the styles for headers and what the html tag should be if
+ # we have one.
+ headers = {
+ 'heading 1': 'h1',
+ 'heading 2': 'h2',
+ 'heading 3': 'h3',
+ 'heading 4': 'h4',
+ 'heading 5': 'h5',
+ 'heading 6': 'h6',
+ 'heading 7': 'h6',
+ 'heading 8': 'h6',
+ 'heading 9': 'h6',
+ 'heading 10': 'h6',
+ }
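+        # For example, a paragraph whose pStyle maps (via styles_dict) to
+        # 'heading 2' is marked with heading_level 'h2' and excluded from
+        # list handling below.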
+ for element in elements:
+ # This element is using the default style which is not a heading.
+ if find_first(element, 'pStyle') is None:
+ continue
+ style = find_first(element, 'pStyle').attrib.get('val', '')
+ style = self.styles_dict.get(style)
+
+ # Check to see if this element is actually a header.
+ if style and style.lower() in headers:
+ # Set all the list item variables to false.
+ self.meta_data[element]['is_list_item'] = False
+ self.meta_data[element]['is_first_list_item'] = False
+ self.meta_data[element]['is_last_list_item_in_root'] = False
+ # Prime the heading_level
+ self.meta_data[element]['heading_level'] = headers[style.lower()] # noqa
+
+ def _convert_upper_roman(self, body):
+ if not self.convert_root_level_upper_roman:
+ return
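+        # Root level list items whose numbering format is upperRoman are
+        # turned into headings (UPPER_ROMAN_TO_HEADING_VALUE above, which is
+        # 'h2') instead of being rendered as lists.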
+ first_root_list_items = [
+ # Only root level elements.
+ el for el in body.getchildren()
+ # And only first_list_items
+ if self.is_first_list_item(el)
+ ]
+ visited_num_ids = []
+ for root_list_item in first_root_list_items:
+ if self.num_id(root_list_item) in visited_num_ids:
+ continue
+ visited_num_ids.append(self.num_id(root_list_item))
+ lst_style = get_list_style(
+ self.numbering_root,
+ self.num_id(root_list_item).num_id,
+ self.ilvl(root_list_item),
+ )
+ if lst_style != 'upperRoman':
+ continue
+ ilvl = min(
+ self.ilvl(el) for el in find_all(body, 'p')
+ if self.num_id(el) == self.num_id(root_list_item)
+ )
+ root_upper_roman_list_items = [
+ el for el in find_all(body, 'p')
+ if self.num_id(el) == self.num_id(root_list_item) and
+ self.ilvl(el) == ilvl
+ ]
+ for list_item in root_upper_roman_list_items:
+ self.meta_data[list_item]['is_list_item'] = False
+ self.meta_data[list_item]['is_first_list_item'] = False
+ self.meta_data[list_item]['is_last_list_item_in_root'] = False # noqa
+
+ self.meta_data[list_item]['heading_level'] = UPPER_ROMAN_TO_HEADING_VALUE # noqa
+
+ def _set_next(self, body):
+ def _get_children_with_content(el):
+ # We only care about children if they have text in them.
+ children = []
+ for child in _filter_children(el, TAGS_HOLDING_CONTENT_TAGS):
+ _has_descendant_with_tag = any(
+ has_descendant_with_tag(child, tag) for
+ tag in TAGS_CONTAINING_CONTENT
+ )
+ if _has_descendant_with_tag:
+ children.append(child)
+ return children
+
+ def _assign_next(children):
+            # Populate the `next` and `previous` attributes for all the
+            # child elements.
+            for i, child in enumerate(children):
+                if i + 1 < len(children):
+                    self.meta_data[child]['next'] = children[i + 1]
+                # Guard i - 1 so the first element does not wrap around and
+                # pick up the last element as its `previous`.
+                if i > 0:
+                    self.meta_data[child]['previous'] = children[i - 1]
+ # Assign next for everything in the root.
+ _assign_next(_get_children_with_content(body))
+
+ # In addition set next for everything in table cells.
+ for tc in find_all(body, 'tc'):
+ _assign_next(_get_children_with_content(tc))
+
+
+def parse_xml_from_string(xml):
+ return cElementTree.fromstring(remove_namespaces(xml))
diff --git a/requirements.txt b/requirements.txt
index f9954ad0..77421ff8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,4 @@
-beautifulsoup4>=4.1.0
+Jinja2>=2.0
+coverage==3.6
+nose==1.3.0
+flake8
diff --git a/run_tests.sh b/run_tests.sh
new file mode 100755
index 00000000..da46b811
--- /dev/null
+++ b/run_tests.sh
@@ -0,0 +1,4 @@
+#! /bin/sh
+
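+# Run the test suite (any extra arguments are passed straight to nosetests),
+# then lint every Python file with flake8.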
+nosetests --verbose --with-doctest --with-coverage --cover-package pydocx "$@" &&
+find . -name '*.py' | xargs flake8
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..c47dbe66
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,58 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+
+try:
+ from setuptools import setup, find_packages
+except ImportError:
+ from ez_setup import use_setuptools
+ use_setuptools()
+ from setuptools import setup, find_packages # noqa
+
+def rel_file(*args):
+    return os.path.join(os.path.dirname(os.path.abspath(__file__)), *args)
+
+
+def get_file(filename):
+ with open(rel_file(filename)) as f:
+ return f.read()
+
+
+def get_description():
+ return get_file('README.rst') + get_file('CHANGELOG')
+
+setup(
+ name="PyDocX",
+ # Edit here and pydocx.__init__
+ version="0.3.1",
+ description="docx (OOXML) to html converter",
+ author="Jason Ward, Sam Portnow",
+ author_email="jason.louard.ward@gmail.com, samson91787@gmail.com",
+ url="http://github.com/OpenScienceFramework/pydocx",
+ platforms=["any"],
+ license="BSD",
+ packages=find_packages(),
+ package_data={
+ 'pydocx': [
+ 'tests/templates/*.xml',
+ ],
+ },
+ scripts=[],
+ zip_safe=False,
+ install_requires=[],
+ cmdclass={},
+ classifiers=[
+ "Development Status :: 3 - Alpha",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 2.6",
+ "Programming Language :: Python :: 2.7",
+ "Programming Language :: Python :: 2 :: Only",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: BSD License",
+ "Operating System :: OS Independent",
+ "Topic :: Text Processing :: Markup :: HTML",
+ "Topic :: Text Processing :: Markup :: XML",
+ ],
+ long_description=get_description(),
+)