diff --git a/AUTHORS b/AUTHORS index d1b33f05..c4b46c16 100644 --- a/AUTHORS +++ b/AUTHORS @@ -3,3 +3,4 @@ Jason Ward Kyle Gibson Chirica Gheorghe Anirudha Bose +Tarashish Mishra diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 41bb117b..120f60d2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,11 @@ +**dev** + +- Internal links and anchors are now retained. Thanks, sunu! `#222 `_ + +**0.9.10** + +- No longer error when processing margin positions with decimal points. + **0.9.9** - Rect elements now correctly handle image data diff --git a/docs/index.rst b/docs/index.rst index c304e103..05a0f761 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,4 +11,5 @@ PyDocX export_mixins enumerated_list_detection development + plugins release_notes diff --git a/docs/plugins.rst b/docs/plugins.rst new file mode 100644 index 00000000..40069e74 --- /dev/null +++ b/docs/plugins.rst @@ -0,0 +1,36 @@ +####### +Plugins +####### + +You may find yourself needing +a feature in PyDocX that doesn't exist +in the core library. + +If it's something that should exist, the +PyDocX project is always open to new +contributions. Details of how to contribute +can be found in :doc:`/development`. + +For things that don't fit in the core +library, it's easy to build a plugin +based on the :doc:`Extending PyDocX ` and +:doc:`Export Mixins ` sections. + +If you do build a plugin, edit this +documentation and add it below so that +other developers can find it. + +----------------- +Available Plugins +----------------- + +.. 
list-table:: + :widths: 20 80 + :header-rows: 1 + + * - Plugin + - Description + * - `pydocx-resize-images `_ + - Resizes large images to the dimensions they are in the docx file + * - `pydocx-s3-images `_ + - Uploads images to S3 instead of returning Data URIs diff --git a/pydocx/__init__.py b/pydocx/__init__.py index 4e5ebc69..d45f70ba 100644 --- a/pydocx/__init__.py +++ b/pydocx/__init__.py @@ -6,4 +6,4 @@ 'PyDocX', ] -__version__ = '0.9.9' +__version__ = '0.9.10' diff --git a/pydocx/export/base.py b/pydocx/export/base.py index 9ca2afef..67b002bc 100644 --- a/pydocx/export/base.py +++ b/pydocx/export/base.py @@ -286,7 +286,7 @@ def yield_numbering_spans(self, items): for item in items: yield item return - builder = self.numbering_span_builder_class(items) + builder = self.numbering_span_builder_class(items, process_components=True) numbering_spans = builder.get_numbering_spans() for item in numbering_spans: yield item diff --git a/pydocx/export/html.py b/pydocx/export/html.py index 40498a89..18e3ea53 100644 --- a/pydocx/export/html.py +++ b/pydocx/export/html.py @@ -17,7 +17,7 @@ POINTS_PER_EM, PYDOCX_STYLES, TWIPS_PER_POINT, - EMUS_PER_PIXEL, + EMUS_PER_PIXEL ) from pydocx.export.base import PyDocXExporter from pydocx.export.numbering_span import NumberingItem @@ -96,12 +96,12 @@ class HtmlTag(object): closed_tag_format = '' def __init__( - self, - tag, - allow_self_closing=False, - closed=False, - allow_whitespace=False, - **attrs + self, + tag, + allow_self_closing=False, + closed=False, + allow_whitespace=False, + **attrs ): self.tag = tag self.allow_self_closing = allow_self_closing @@ -270,6 +270,8 @@ def get_heading_tag(self, paragraph): heading_style.name.lower(), self.default_heading_level, ) + if paragraph.bookmark_name: + return HtmlTag(tag, id=paragraph.bookmark_name) return HtmlTag(tag) def export_paragraph(self, paragraph): @@ -309,46 +311,45 @@ def export_paragraph_property_justification(self, paragraph, results): def 
export_paragraph_property_indentation(self, paragraph, results): # TODO these classes should be applied on the paragraph, and not as # inline styles + properties = paragraph.effective_properties style = {} - if properties.indentation_right: - # TODO would be nice if this integer conversion was handled - # implicitly by the model somehow - try: - right = int(properties.indentation_right) - except ValueError: - right = None + # Numbering properties can define a text indentation on a paragraph + if properties.numbering_properties: + indentation_left = None + indentation_first_line = None - if right: - right = convert_twips_to_ems(right) - style['margin-right'] = '{0:.2f}em'.format(right) + paragraph_num_level = paragraph.get_numbering_level() - if properties.indentation_left: - # TODO would be nice if this integer conversion was handled - # implicitly by the model somehow - try: - left = int(properties.indentation_left) - except ValueError: - left = None + if paragraph_num_level: + listing_style = self.export_listing_paragraph_property_indentation( + paragraph, + paragraph_num_level.paragraph_properties, + include_text_indent=True + ) + if 'text-indent' in listing_style and listing_style['text-indent'] != '0.00em': + style['text-indent'] = listing_style['text-indent'] + style['display'] = 'inline-block' + else: + indentation_left = properties.to_int('indentation_left') + indentation_first_line = properties.to_int('indentation_first_line') - if left: - left = convert_twips_to_ems(left) - style['margin-left'] = '{0:.2f}em'.format(left) + indentation_right = properties.to_int('indentation_right') - if properties.indentation_first_line: - # TODO would be nice if this integer conversion was handled - # implicitly by the model somehow - try: - first_line = int(properties.indentation_first_line) - except ValueError: - first_line = None + if indentation_right: + right = convert_twips_to_ems(indentation_right) + style['margin-right'] = '{0:.2f}em'.format(right) + + if 
indentation_left: + left = convert_twips_to_ems(indentation_left) + style['margin-left'] = '{0:.2f}em'.format(left) - if first_line: - first_line = convert_twips_to_ems(first_line) - # TODO text-indent doesn't work with inline elements like span - style['text-indent'] = '{0:.2f}em'.format(first_line) + if indentation_first_line: + first_line = convert_twips_to_ems(indentation_first_line) + style['text-indent'] = '{0:.2f}em'.format(first_line) + style['display'] = 'inline-block' if style: attrs = { @@ -359,6 +360,93 @@ def export_paragraph_property_indentation(self, paragraph, results): return results + def export_listing_paragraph_property_indentation( + self, + paragraph, + level_properties, + include_text_indent=False + ): + style = {} + + if not level_properties or not paragraph.has_numbering_properties: + return style + + level_indentation_step = \ + paragraph.numbering_definition.get_indentation_between_levels() + + paragraph_properties = paragraph.properties + + level_ind_left = level_properties.to_int('indentation_left', default=0) + level_ind_hanging = level_properties.to_int('indentation_hanging', default=0) + + paragraph_ind_left = paragraph_properties.to_int('indentation_left', default=0) + paragraph_ind_hanging = paragraph_properties.to_int('indentation_hanging', default=0) + paragraph_ind_first_line = paragraph_properties.to_int('indentation_first_line', + default=0) + + left = paragraph_ind_left or level_ind_left + hanging = paragraph_ind_hanging or level_ind_hanging + # At this point we have no info about indentation, so we keep the default one + if not left and not hanging: + return style + + # All the below left margin calculation is done because html ul/ol/li elements have + # their default indentations and we need to make sure that we migrate as near as + # possible solution to html. 
+ margin_left = left + + # Because hanging can be set independently, we remove it from left margin and will + # be added as text-indent later on + margin_left -= hanging + + # Take into account that current span can have custom left margin + if level_indentation_step > level_ind_hanging: + margin_left -= (level_indentation_step - level_ind_hanging) + else: + margin_left -= level_indentation_step + + # First line are added to left margins + margin_left += paragraph_ind_first_line + + if isinstance(paragraph.parent, NumberingItem): + try: + # In case of nested lists elements, we need to adjust left margin + # based on the parent item + parent_paragraph = paragraph.parent.numbering_span.parent.get_first_child() + + parent_ind_left = parent_paragraph.get_indentation('indentation_left') + parent_ind_hanging = parent_paragraph.get_indentation('indentation_hanging') + parent_lvl_ind_hanging = parent_paragraph.get_indentation( + 'indentation_hanging') + + margin_left -= (parent_ind_left - parent_ind_hanging) + margin_left -= parent_lvl_ind_hanging + # To mimic the word way of setting first line, we need to move back(left) all + # elements by first_line value + margin_left -= parent_paragraph.get_indentation('indentation_first_line') + except AttributeError: + pass + + # Here as well, we remove the default hanging which word adds + # because
<li>
tag will provide it's own + hanging -= level_ind_hanging + + if margin_left: + margin_left = convert_twips_to_ems(margin_left) + style['margin-left'] = '{0:.2f}em'.format(margin_left) + + # we don't allow negative hanging + if hanging < 0: + hanging = 0 + + if include_text_indent: + if hanging is not None: + # Now, here we add the hanging as text-indent for the paragraph + hanging = convert_twips_to_ems(hanging) + style['text-indent'] = '{0:.2f}em'.format(hanging) + + return style + def get_run_styles_to_apply(self, run): parent_paragraph = run.get_first_ancestor(wordprocessing.Paragraph) if parent_paragraph and parent_paragraph.heading_style: @@ -507,7 +595,10 @@ def get_hyperlink_tag(self, target_uri): def export_hyperlink(self, hyperlink): results = super(PyDocXHTMLExporter, self).export_hyperlink(hyperlink) - tag = self.get_hyperlink_tag(target_uri=hyperlink.target_uri) + if not hyperlink.target_uri and hyperlink.anchor: + tag = self.get_hyperlink_tag(target_uri='#' + hyperlink.anchor) + else: + tag = self.get_hyperlink_tag(target_uri=hyperlink.target_uri) if tag: results = tag.apply(results, allow_empty=False) @@ -732,7 +823,25 @@ def export_numbering_item(self, numbering_item): numbering_item.children, self.export_node, ) - tag = HtmlTag('li') + + style = None + + if numbering_item.children: + level_properties = numbering_item.numbering_span.\ + numbering_level.paragraph_properties + # get the first paragraph properties which will contain information + # on how to properly indent listing item + paragraph = numbering_item.children[0] + + style = self.export_listing_paragraph_property_indentation(paragraph, + level_properties) + + attrs = {} + + if style: + attrs['style'] = convert_dictionary_to_style_fragment(style) + + tag = HtmlTag('li', **attrs) return tag.apply(results) def export_field_hyperlink(self, simple_field, field_args): diff --git a/pydocx/export/numbering_span.py b/pydocx/export/numbering_span.py index f84dbe3e..809ff672 100644 --- 
a/pydocx/export/numbering_span.py +++ b/pydocx/export/numbering_span.py @@ -10,16 +10,14 @@ import string from pydocx.openxml import wordprocessing -from pydocx.util.memoize import memoized - from pydocx.openxml.wordprocessing.run import Run from pydocx.openxml.wordprocessing.tab_char import TabChar from pydocx.openxml.wordprocessing.text import Text +from pydocx.util.memoize import memoized # Defined in 17.15.1.25 DEFAULT_AUTOMATIC_TAB_STOP_INTERVAL = 720 # twips - roman_numeral_map = tuple(zip( (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1), ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I') @@ -140,6 +138,9 @@ class NumberingSpan(object): def __init__(self, numbering_level, numbering_definition, parent): self.children = [] + self._nested_level = 0 + # Mark a separate nested list + self.is_separate_list = False self.numbering_level = numbering_level self.numbering_definition = numbering_definition self.parent = parent @@ -156,6 +157,27 @@ def get_first_child_of_first_item(self): return return first_item.children[0] + def get_last_child(self): + if not self.children: + return + last_item = self.children[-1] + + return last_item + + def get_numbering_level(self): + return self.numbering_level + + @property + def nested_level(self): + return self._nested_level + + def inc_nested_level(self): + nested_level = 0 + if isinstance(self.parent, (NumberingSpan, NumberingItem)): + nested_level = self.parent.nested_level + + self._nested_level = nested_level + 1 + class NumberingItem(object): ''' @@ -174,6 +196,16 @@ def append_child(self, child): child.parent = self self.children.append(child) + @property + def nested_level(self): + return self.parent.nested_level + + def get_first_child(self): + if self.children: + return self.children[0] + + return None + class BaseNumberingSpanBuilder(object): ''' @@ -189,7 +221,7 @@ class BaseNumberingSpanBuilder(object): accomplished using the NumberingSpan and NumberingItem classes. 
''' - def __init__(self, components=None): + def __init__(self, components=None, process_components=False): if not components: components = [] self.components = components @@ -198,6 +230,12 @@ def __init__(self, components=None): self.current_item = None self.current_item_index = 0 self.candidate_numbering_items = [] + self.child_parent_num_map = {} + self.parent_child_num_map = {} + self.list_start_stop_index = {} + + if process_components: + self.detect_parent_child_map_for_items() @memoized def get_numbering_level(self, paragraph): @@ -206,6 +244,147 @@ def get_numbering_level(self, paragraph): return None return level + def _get_component_item(self, component, to_tuple=False): + item = { + 'num_id': component.numbering_definition.abstract_num_id, + 'level': component.get_numbering_level().level_id + } + + if to_tuple: + item = (item['num_id'], item['level']) + + return item + + def detect_parent_child_map_for_items(self): + """ + There are cases when we have span inside an item and this span is different from + the parent one. + Example listing: + 1. A + 2. B + Separate + * B1 + * B2 + 3. C + + In the above example B1, B2 items are creating a separate span and does have different + num. definition. We need to somehow detect this cases and make sure we properly + continue numbering(in this case '3. C'). + + We parse this as following: + let say that list: A, B, C has abstract_num_id = 1, level = 0 + and list: B1, B2 has abstract_num_id = 4, level = 0 + + As output we will construct 2 dicts as follow: + child_parent_num_map = { + "4": {"num_id": '1', "level": '0'} + } + + parent_child_num_map = { + ("1", "0"): [{"num_id": '4', "level": '0'}] + } + + So, when we process paragraph item we know from the start that it has a parent or not. 
+ """ + + if not self.components: + return False + + parent_child_map = {} + child_parent_map = {} + list_start_stop_index = {} + # we are interested only in components that are part of the listing + components = [component for component in self.components if + hasattr(component, 'properties') + and hasattr(component.properties, 'numbering_properties') + and component.numbering_definition + and component.get_numbering_level()] + if not components: + return False + + components_reversed_list = list(reversed(components)) + for i, component in enumerate(components[:-1]): + parent_item = self._get_component_item(component) + nums = [] + outer_item_found = False + if i > 0: + components_reversed = components_reversed_list[:-i] + else: + components_reversed = components_reversed_list + + for j, next_component in enumerate(components_reversed): + next_item = self._get_component_item(next_component) + if parent_item == next_item: + outer_item_found = True + if not parent_item['num_id'] in list_start_stop_index: + # We need to find the index of the component from original + # self.components list so that we take into account all additional + # paragraphs that a list can contain + list_start_stop_index[parent_item['num_id']] = { + 'start': self.components.index(component), + 'stop': self.components.index(next_component) + } + break + if outer_item_found: + for _component in components[i + 1:-j - 1]: + child_item = self._get_component_item(_component) + # We need to process only items that have different num_id + # which mean are part of the different list + if child_item['num_id'] != parent_item['num_id']: + # Check if child is not already a parent + child_item_children = parent_child_map.get( + (child_item['num_id'], child_item['level']), []) + if parent_item not in child_item_children: + nums.append(child_item) + if nums: + # parent_key = parent_item['num_id'] + parent_key = (parent_item['num_id'], parent_item['level']) + if parent_key not in parent_child_map: + 
parent_child_map[parent_key] = [] + + for num in nums: + child_parent_map[num['num_id']] = parent_item + if num not in parent_child_map[parent_key]: + parent_child_map[parent_key].append(num) + + self.child_parent_num_map = child_parent_map + self.parent_child_num_map = parent_child_map + self.list_start_stop_index = list_start_stop_index + + return True + + def has_parent_list(self, paragraph): + ''' + Check if current paragraph is inside a list which is separated from parent list. + ''' + + if not paragraph.has_numbering_properties or not paragraph.has_numbering_definition: + return False + + if not self.current_span: + return False + + num_item = self._get_component_item(paragraph) + + return bool(self.child_parent_num_map.get(num_item['num_id'], None)) + + def is_parent_of_current_span(self, paragraph): + ''' + + :param paragraph: + :return: + ''' + if not paragraph.has_numbering_properties or not paragraph.has_numbering_definition: + return False + + if not self.current_span: + return True + + num_item = self._get_component_item(paragraph, to_tuple=True) + span_item = self._get_component_item(self.current_span) + + return span_item in self.parent_child_num_map.get(num_item, []) + def include_candidate_items_in_current_item(self, new_item_index): ''' A generator to determine which of the candidate numbering items need to @@ -224,7 +403,7 @@ def include_candidate_items_in_current_item(self, new_item_index): # Since we've processed all of the candidate numbering items, reset it self.candidate_numbering_items = [] - def should_start_new_span(self, paragraph): + def should_start_new_span(self, index, paragraph): ''' If there's not a current span, and the paragraph is a heading style, do not start a new span. @@ -235,29 +414,129 @@ def should_start_new_span(self, paragraph): span, start a new span. Otherwise, do not start a new span. 
''' + if self.current_span is None: return True + level = self.get_numbering_level(paragraph) num_def = None if level: num_def = level.parent + + if num_def == self.current_span.numbering_definition: + return False + elif self.has_parent_list(paragraph): + return False + elif self.is_parent_of_current_span(paragraph): + return False + elif self.current_span.is_separate_list: + return False + + list_idx = self.list_start_stop_index.get(num_def.abstract_num_id) + if list_idx and list_idx['start'] == index: + return True + return num_def != self.current_span.numbering_definition - def should_start_new_item(self, paragraph): + def should_start_new_item(self, index, paragraph): ''' If there is not a current span, do not start a new item. If the paragraph is a heading style, do not start a new item. - Otherwise, only start a new item if the numbering definition of the - paragraph matches the numbering definition of the current span. + Start new item if: + Paragraph is from separate list and inside a span + Paragraph is from separate list and is parent of the current span + Paragraph level id is bigger then 0 which mean we are still inside list + Numbering definition of the paragraph matches the numbering definition of the + current span. 
''' + if self.current_span is None: return False + level = self.get_numbering_level(paragraph) num_def = None if level: num_def = level.parent + + if self.has_parent_list(paragraph): + return True + elif self.is_parent_of_current_span(paragraph): + return True + else: + list_idx = self.list_start_stop_index.get(num_def.abstract_num_id) + # For mangled lists we need to make sure that we are not handling + # the first element from the list which have level > 0 + if list_idx and index > list_idx['start']: + # We are still in the list + if int(level.level_id) > 0: + return True + return num_def == self.current_span.numbering_definition + def add_item_to_span(self, index, current_span=None): + ''' + Add a new item to the current span or the span we specify. + ''' + + self.current_span = current_span or self.current_span + + self.current_item = NumberingItem( + numbering_span=self.current_span, + ) + self.current_item_index = index + self.current_span.append_child(self.current_item) + + def add_new_span_and_item(self, index, level, parent_span=None): + parent_span = parent_span or self.current_span + + num_def = level.parent + + next_numbering_span = NumberingSpan( + numbering_level=level, + numbering_definition=num_def, + parent=parent_span, + ) + + self.numbering_span_stack.append(next_numbering_span) + next_numbering_item = NumberingItem( + numbering_span=next_numbering_span, + ) + + next_numbering_span.append_child(next_numbering_item) + self.current_item.append_child(next_numbering_span) + self.current_span = next_numbering_span + self.current_item = next_numbering_item + self.current_item_index = index + + self.current_span.inc_nested_level() + + def add_new_span_and_item_lower_level(self, index, level, previous_span=None): + num_def = level.parent + + level_id = int(level.level_id) + + if not previous_span: + # we need to "subtract" a level. 
To do that, find the level + # that we're going back to, which may not even exist + previous_span = self.find_previous_numbering_span_with_lower_level(level_id) + + if self.numbering_span_stack: + assert previous_span + self.current_span = previous_span + else: + # If the numbering_span_stack is empty now, it means + # we're handling a mangled level case + # For that scenario, create a new span + self.current_span = NumberingSpan( + numbering_level=level, + numbering_definition=num_def, + parent=self.current_span, + ) + self.numbering_span_stack = [self.current_span] + yield self.current_span + + self.add_item_to_span(index) + def handle_start_new_span(self, index, paragraph): level = self.get_numbering_level(paragraph) num_def = level.parent @@ -279,11 +558,7 @@ def handle_start_new_span(self, index, paragraph): self.numbering_span_stack = [self.current_span] - self.current_item = NumberingItem( - numbering_span=self.current_span, - ) - self.current_item_index = index - self.current_span.append_child(self.current_item) + self.add_item_to_span(index) def handle_start_new_item(self, index, paragraph): level = self.get_numbering_level(paragraph) @@ -298,54 +573,37 @@ def handle_start_new_item(self, index, paragraph): if level == self.current_span.numbering_level: # The level hasn't changed - self.current_item = NumberingItem( - numbering_span=self.current_span, - ) - self.current_item_index = index - self.current_span.append_child(self.current_item) + self.add_item_to_span(index) else: + has_parent_list = self.has_parent_list(paragraph) + is_parent_of_current_span = self.is_parent_of_current_span(paragraph) + level_id = int(level.level_id) current_level_id = int(self.current_span.numbering_level.level_id) - if level_id > current_level_id: - # Add a new span + item to hold this new level - next_numbering_span = NumberingSpan( - numbering_level=level, - numbering_definition=num_def, - parent=self.current_span, - ) - self.numbering_span_stack.append(next_numbering_span) 
- next_numbering_item = NumberingItem( - numbering_span=next_numbering_span, - ) - next_numbering_span.children.append(next_numbering_item) - self.current_item.append_child(next_numbering_span) - self.current_span = next_numbering_span - self.current_item = next_numbering_item - self.current_item_index = index - elif level_id < current_level_id: - # we need to "subtract" a level. To do that, find the level - # that we're going back to, which may not even exist - previous_span = self.find_previous_numbering_span_with_lower_level(level_id) - if self.numbering_span_stack: - assert previous_span - self.current_span = previous_span - else: - # If the numbering_span_stack is empty now, it means - # we're handling a mangled level case - # For that scenario, create a new span - self.current_span = NumberingSpan( - numbering_level=level, - numbering_definition=num_def, - parent=self.current_span, - ) - self.numbering_span_stack = [self.current_span] - yield self.current_span - self.current_item = NumberingItem( - numbering_span=self.current_span, - ) - self.current_item_index = index - self.current_span.append_child(self.current_item) + if num_def == self.current_span.numbering_definition: + # At this stage we process all the items that are part of the same list. 
+ # All item from the same list have same numbering definition + if level_id > current_level_id: + self.add_new_span_and_item(index, level) + elif level_id < current_level_id: + for item in self.add_new_span_and_item_lower_level(index, level): + yield item + else: + # Here we deal with lists that separate from the parent list meaning + # that have different numbering definition + if not has_parent_list and not is_parent_of_current_span: + self.current_span = self.find_previous_numbering_span_by_num_def(paragraph) + self.current_item = self.current_span.get_last_child() + self.add_new_span_and_item(index, level) + elif has_parent_list and not is_parent_of_current_span: + self.current_span = self.find_parent_numbering_span(paragraph) + self.current_item = self.current_span.get_last_child() + self.add_new_span_and_item(index, level) + self.current_span.is_separate_list = True + else: + self.current_span = self.find_previous_numbering_span_by_num_def(paragraph) + self.add_item_to_span(index) def find_previous_numbering_span_with_lower_level(self, level_id): previous_span = None @@ -358,6 +616,36 @@ def find_previous_numbering_span_with_lower_level(self, level_id): self.numbering_span_stack.pop() return previous_span + def find_previous_numbering_span_by_num_def(self, paragraph): + previous_span = None + while self.numbering_span_stack: + previous_span = self.numbering_span_stack[-1] + if previous_span.numbering_definition == paragraph.numbering_definition: + # we found the parent span of the paragraph item + break + self.numbering_span_stack.pop() + return previous_span + + def find_parent_numbering_span(self, paragraph): + previous_span = None + + num_item = self._get_component_item(paragraph) + + parent_num_item = self.child_parent_num_map.get(num_item['num_id'], None) + if not parent_num_item: + return previous_span + + while self.numbering_span_stack: + previous_span = self.numbering_span_stack[-1] + previous_span_item = self._get_component_item(previous_span) + + 
if previous_span_item == parent_num_item: + # we found the parent span of the paragraph item + break + self.numbering_span_stack.pop() + + return previous_span + def handle_paragraph(self, index, paragraph): level = self.get_numbering_level(paragraph) num_def = None @@ -378,8 +666,8 @@ def handle_paragraph(self, index, paragraph): self.candidate_numbering_items.append((index, paragraph)) return - start_new_span = self.should_start_new_span(paragraph) - start_new_item = self.should_start_new_item(paragraph) + start_new_span = self.should_start_new_span(index, paragraph) + start_new_item = self.should_start_new_item(index, paragraph) if start_new_span: for item in self.handle_start_new_span(index, paragraph): @@ -549,8 +837,10 @@ def detect_new_faked_level_started(self, paragraph, current_level_id=None): def get_left_position_for_numbering_span(self, numbering_span): paragraph = numbering_span.get_first_child_of_first_item() + left_pos = self.get_left_position_for_paragraph(paragraph) num_level_para_properties = numbering_span.numbering_level.paragraph_properties + if num_level_para_properties: left_pos += num_level_para_properties.start_margin_position return left_pos diff --git a/pydocx/openxml/wordprocessing/__init__.py b/pydocx/openxml/wordprocessing/__init__.py index 02da2556..4fce72a2 100644 --- a/pydocx/openxml/wordprocessing/__init__.py +++ b/pydocx/openxml/wordprocessing/__init__.py @@ -1,6 +1,7 @@ # coding: utf-8 from pydocx.openxml.wordprocessing.abstract_num import AbstractNum from pydocx.openxml.wordprocessing.body import Body +from pydocx.openxml.wordprocessing.bookmark import Bookmark from pydocx.openxml.wordprocessing.br import Break from pydocx.openxml.wordprocessing.deleted_run import DeletedRun from pydocx.openxml.wordprocessing.deleted_text import DeletedText @@ -26,6 +27,7 @@ from pydocx.openxml.wordprocessing.picture import Picture from pydocx.openxml.wordprocessing.run import Run from pydocx.openxml.wordprocessing.run_properties import 
RunProperties # noqa +from pydocx.openxml.wordprocessing.rfonts import RFonts from pydocx.openxml.wordprocessing.sdt_block import SdtBlock from pydocx.openxml.wordprocessing.sdt_content_block import SdtContentBlock from pydocx.openxml.wordprocessing.sdt_content_run import SdtContentRun @@ -46,6 +48,7 @@ __all__ = [ 'AbstractNum', 'Body', + 'Bookmark', 'Break', 'DeletedRun', 'DeletedText', @@ -71,6 +74,7 @@ 'Picture', 'Run', 'RunProperties', + 'RFonts', 'SdtBlock', 'SdtContentBlock', 'SdtContentRun', diff --git a/pydocx/openxml/wordprocessing/abstract_num.py b/pydocx/openxml/wordprocessing/abstract_num.py index 98b0727f..cdad313e 100644 --- a/pydocx/openxml/wordprocessing/abstract_num.py +++ b/pydocx/openxml/wordprocessing/abstract_num.py @@ -27,3 +27,20 @@ def __init__(self, **kwargs): def get_level(self, level_id): return self._levels.get(level_id) + + def get_indentation_between_levels(self): + """ + Depending on the word version we may get a different default indentation between + levels. For this we will only check first 2 levels as the other follow the same step. 
+ """ + + try: + lvl0_ind = self.levels[0].paragraph_properties.to_int('indentation_left', + default=0) + lvl1_ind = self.levels[1].paragraph_properties.to_int('indentation_left', + default=0) + ind_step = lvl1_ind - lvl0_ind + except IndexError: + ind_step = 720 # default one + + return ind_step diff --git a/pydocx/openxml/wordprocessing/bookmark.py b/pydocx/openxml/wordprocessing/bookmark.py new file mode 100644 index 00000000..1e7bf417 --- /dev/null +++ b/pydocx/openxml/wordprocessing/bookmark.py @@ -0,0 +1,14 @@ +# coding: utf-8 +from __future__ import ( + absolute_import, + print_function, + unicode_literals, +) + +from pydocx.models import XmlModel, XmlAttribute + + +class Bookmark(XmlModel): + XML_TAG = 'bookmarkStart' + + name = XmlAttribute(name='name') diff --git a/pydocx/openxml/wordprocessing/paragraph.py b/pydocx/openxml/wordprocessing/paragraph.py index af59dd7b..bdb6b387 100644 --- a/pydocx/openxml/wordprocessing/paragraph.py +++ b/pydocx/openxml/wordprocessing/paragraph.py @@ -4,7 +4,7 @@ print_function, unicode_literals, ) - +from pydocx.util.memoize import memoized from pydocx.models import XmlModel, XmlCollection, XmlChild from pydocx.openxml.wordprocessing.hyperlink import Hyperlink from pydocx.openxml.wordprocessing.paragraph_properties import ParagraphProperties # noqa @@ -16,6 +16,7 @@ from pydocx.openxml.wordprocessing.deleted_run import DeletedRun from pydocx.openxml.wordprocessing.sdt_run import SdtRun from pydocx.openxml.wordprocessing.simple_field import SimpleField +from pydocx.openxml.wordprocessing.bookmark import Bookmark class Paragraph(XmlModel): @@ -31,6 +32,7 @@ class Paragraph(XmlModel): DeletedRun, SdtRun, SimpleField, + Bookmark ) def __init__(self, **kwargs): @@ -45,6 +47,10 @@ def effective_properties(self): self._effective_properties = properties return self._effective_properties + @property + def numbering_definition(self): + return self.get_numbering_definition() + def has_structured_document_parent(self): from 
pydocx.openxml.wordprocessing import SdtBlock return self.has_ancestor(SdtBlock) @@ -83,9 +89,8 @@ def heading_style(self): def heading_style(self, style): self._heading_style = style + @memoized def get_numbering_definition(self): - # TODO add memoization - # TODO the getattr is necessary because of footnotes. From the context # of a footnote, a paragraph's container is the footnote part, which # doesn't have access to the numbering_definitions_part @@ -101,8 +106,8 @@ def get_numbering_definition(self): num_id=numbering_properties.num_id, ) + @memoized def get_numbering_level(self): - # TODO add memoization numbering_definition = self.get_numbering_definition() if not numbering_definition: return @@ -121,6 +126,12 @@ def runs(self): if isinstance(p_child, Run): yield p_child + @property + def bookmark_name(self): + for p_child in self.children: + if isinstance(p_child, Bookmark): + return p_child.name + def get_text(self, tab_char=None): ''' Return a string of all of the contained Text nodes concatenated @@ -168,3 +179,30 @@ def get_number_of_initial_tabs(self): else: break return tab_count + + @property + @memoized + def has_numbering_properties(self): + return bool(getattr(self.properties, 'numbering_properties', None)) + + @property + @memoized + def has_numbering_definition(self): + return bool(self.numbering_definition) + + def get_indentation(self, indentation, only_level_ind=False): + ''' + Get specific indentation of the current paragraph. If indentation is + not present on the paragraph level, get it from the numbering definition. 
+ ''' + + ind = None + + if self.properties: + if not only_level_ind: + ind = self.properties.to_int(indentation) + if ind is None: + level = self.get_numbering_level() + ind = level.paragraph_properties.to_int(indentation, default=0) + + return ind diff --git a/pydocx/openxml/wordprocessing/paragraph_properties.py b/pydocx/openxml/wordprocessing/paragraph_properties.py index d893bd7b..c6bbc374 100644 --- a/pydocx/openxml/wordprocessing/paragraph_properties.py +++ b/pydocx/openxml/wordprocessing/paragraph_properties.py @@ -35,11 +35,19 @@ def start_margin_position(self): # ignored. start_margin = 0 if self.indentation_left: - start_margin += int(self.indentation_left) + start_margin += int(float(self.indentation_left)) if self.indentation_hanging: - start_margin -= int(self.indentation_hanging) + start_margin -= int(float(self.indentation_hanging)) elif self.indentation_first_line: - start_margin += int(self.indentation_first_line) + start_margin += int(float(self.indentation_first_line)) if start_margin: return start_margin return 0 + + def to_int(self, attribute, default=None): + # TODO would be nice if this integer conversion was handled + # implicitly by the model somehow + try: + return int(float(getattr(self, attribute, default))) + except (ValueError, TypeError, OverflowError): + return default diff --git a/pydocx/openxml/wordprocessing/rfonts.py b/pydocx/openxml/wordprocessing/rfonts.py new file mode 100644 index 00000000..ea95216e --- /dev/null +++ b/pydocx/openxml/wordprocessing/rfonts.py @@ -0,0 +1,25 @@ +# coding: utf-8 +from __future__ import ( + absolute_import, + print_function, + unicode_literals, +) + +from pydocx.models import XmlModel, XmlAttribute + + +class RFonts(XmlModel): + XML_TAG = 'rFonts' + + hint = XmlAttribute(name='hint') + ascii = XmlAttribute(name='ascii') + h_ansi = XmlAttribute(name='hAnsi') + east_asia = XmlAttribute(name='eastAsia') + cs = XmlAttribute(name='cs') + ascii_theme = XmlAttribute(name='asciiTheme') + h_ansi_theme =
XmlAttribute(name='hAnsiTheme') + east_asia_theme = XmlAttribute(name='eastAsiaTheme') + cs_theme = XmlAttribute(name='cstheme') + + def is_symbol(self): + return self.h_ansi == 'Symbol' diff --git a/pydocx/openxml/wordprocessing/run_properties.py b/pydocx/openxml/wordprocessing/run_properties.py index 46867e65..63587a57 100644 --- a/pydocx/openxml/wordprocessing/run_properties.py +++ b/pydocx/openxml/wordprocessing/run_properties.py @@ -7,6 +7,7 @@ from pydocx.models import XmlModel, XmlChild from pydocx.types import OnOff, Underline +from pydocx.openxml.wordprocessing.rfonts import RFonts class RunProperties(XmlModel): @@ -26,6 +27,7 @@ class RunProperties(XmlModel): pos = XmlChild(name='position', attrname='val') sz = XmlChild(name='sz', attrname='val') clr = XmlChild(name='color', attrname='val') + r_fonts = XmlChild(type=RFonts) @property def color(self): diff --git a/setup.py b/setup.py index 9a17e299..74b40964 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def main(): author_email="jason.louard.ward@gmail.com, samson91787@gmail.com", url="http://github.com/CenterForOpenScience/pydocx", platforms=["any"], - license="BSD", + license="Apache", packages=find_packages(), package_data={ 'pydocx': [ @@ -64,7 +64,7 @@ def main(): "Programming Language :: Python :: 3.4", "Programming Language :: Python :: Implementation :: PyPy", "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", + "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Topic :: Text Processing :: Markup :: HTML", "Topic :: Text Processing :: Markup :: XML", diff --git a/tests/export/html/test_heading.py b/tests/export/html/test_heading.py index efd7ab1c..b77dfc2d 100644 --- a/tests/export/html/test_heading.py +++ b/tests/export/html/test_heading.py @@ -744,3 +744,30 @@ def test_single_lvl_list_has_precedence_over_headings(self): ''' self.assert_document_generates_html(document, expected_html) + + def
test_heading_with_bookmark(self): + document_xml = ''' +

+ + + + + + + aaa + +

+ ''' + + style_xml = ''' + + ''' + + document = WordprocessingDocumentFactory() + document.add(StyleDefinitionsPart, style_xml) + document.add(MainDocumentPart, document_xml) + + expected_html = '

aaa

' + self.assert_document_generates_html(document, expected_html) diff --git a/tests/export/html/test_hyperlink.py b/tests/export/html/test_hyperlink.py index a88ab748..dbbe4a0c 100644 --- a/tests/export/html/test_hyperlink.py +++ b/tests/export/html/test_hyperlink.py @@ -194,3 +194,21 @@ def test_with_anchor(self): expected_html = '

link.

' self.assert_document_generates_html(document, expected_html) + + def test_internal_link(self): + document_xml = ''' +

+ + + link + + +

+ ''' + + document = WordprocessingDocumentFactory() + + document.add(MainDocumentPart, document_xml) + + expected_html = '

link

' + self.assert_document_generates_html(document, expected_html) diff --git a/tests/export/html/test_numbering.py b/tests/export/html/test_numbering.py index 8386499c..b41dc9de 100644 --- a/tests/export/html/test_numbering.py +++ b/tests/export/html/test_numbering.py @@ -33,6 +33,19 @@ class NumberingTestBase(object):

''' + simple_list_item_with_indentation = ''' +

+ + + + + + + + {content} +

+ ''' + simple_list_definition = ''' @@ -1114,6 +1127,493 @@ def test_root_level_numfmt_None_with_sublist(self): self.assert_document_generates_html(document, expected_html) +class NumberingIndentationTestCase(NumberingTestBase, DocumentGeneratorTestCase): + def test_no_numbering_definition_defined(self): + document_xml = ''' + {aaa} + {bbb} + {ccc} + '''.format( + aaa=self.simple_list_item.format( + content='AAA', + num_id=1, + ilvl=0, + ), + bbb=self.simple_list_item.format( + content='BBB', + num_id=1, + ilvl=1, + ), + ccc=self.simple_list_item.format( + content='CCC', + num_id=1, + ilvl=2, + ), + ) + + document = WordprocessingDocumentFactory() + document.add(MainDocumentPart, document_xml) + + expected_html = ''' +

AAA

BBB

CCC

+ ''' + self.assert_document_generates_html(document, expected_html) + + def test_default_indentation(self): + document_xml = ''' + {aaa} + {bbb} + {ccc} + '''.format( + aaa=self.simple_list_item.format( + content='AAA', + num_id=1, + ilvl=0, + ), + bbb=self.simple_list_item.format( + content='BBB', + num_id=1, + ilvl=1, + ), + ccc=self.simple_list_item.format( + content='CCC', + num_id=1, + ilvl=2, + ), + ) + + numbering_xml = ''' + + + + + + + + + + + + + + + + + + + + + + + + ''' + + document = WordprocessingDocumentFactory() + document.add(NumberingDefinitionsPart, numbering_xml) + document.add(MainDocumentPart, document_xml) + + expected_html = ''' +

AAA +
1. BBB +
  1. CCC
  +
+

+ ''' + self.assert_document_generates_html(document, expected_html) + + def test_custom_indentation(self): + document_xml = ''' + {aaa} + {bbb} + {ccc} + '''.format( + aaa=self.simple_list_item_with_indentation.format( + content='AAA', + num_id=1, + ilvl=0, + ind='left="1440" hanging="360"' + ), + bbb=self.simple_list_item_with_indentation.format( + content='BBB', + num_id=1, + ilvl=1, + ind='left="2880" hanging="360"' + ), + ccc=self.simple_list_item_with_indentation.format( + content='CCC', + num_id=1, + ilvl=2, + ind='left="4320" hanging="360"' + ), + ) + + numbering_xml = ''' + + + + + + + + + + + + + + + + + + + + + + + + ''' + + document = WordprocessingDocumentFactory() + document.add(NumberingDefinitionsPart, numbering_xml) + document.add(MainDocumentPart, document_xml) + + expected_html = ''' +

AAA +
1. BBB +
  1. CCC
  +
+

+ ''' + self.assert_document_generates_html(document, expected_html) + + def test_custom_hanging_indentation(self): + document_xml = ''' + {aaa} + {bbb} + {ccc} + '''.format( + aaa=self.simple_list_item_with_indentation.format( + content='AAA', + num_id=1, + ilvl=0, + ind='left="720" hanging="500"' + ), + bbb=self.simple_list_item_with_indentation.format( + content='BBB', + num_id=1, + ilvl=1, + ind='left="1440" hanging="700"' + ), + ccc=self.simple_list_item_with_indentation.format( + content='CCC', + num_id=1, + ilvl=2, + ind='left="2160" hanging="800"' + ), + ) + + numbering_xml = ''' + + + + + + + + + + + + + + + + + + + + + + + + ''' + + document = WordprocessingDocumentFactory() + document.add(NumberingDefinitionsPart, numbering_xml) + document.add(MainDocumentPart, document_xml) + + expected_html = ''' +

+ AAA +
1. + BBB +
  1. + CCC + +
  +
+

+ ''' + self.assert_document_generates_html(document, expected_html) + + def test_custom_first_line_indentation(self): + document_xml = ''' + {aaa} + {bbb} + {ccc} + '''.format( + aaa=self.simple_list_item_with_indentation.format( + content='AAA', + num_id=1, + ilvl=0, + ind='firstLine="360"' + ), + bbb=self.simple_list_item_with_indentation.format( + content='BBB', + num_id=1, + ilvl=1, + ind='firstLine="360"' + ), + ccc=self.simple_list_item_with_indentation.format( + content='CCC', + num_id=1, + ilvl=2, + ind='firstLine="360"' + ), + ) + + numbering_xml = ''' + + + + + + + + + + + + + + + + + + + + + + + + ''' + + document = WordprocessingDocumentFactory() + document.add(NumberingDefinitionsPart, numbering_xml) + document.add(MainDocumentPart, document_xml) + + expected_html = ''' +

AAA +
1. BBB +
  1. CCC
  +
+

+ ''' + self.assert_document_generates_html(document, expected_html) + + def test_nested_separated_lists(self): + document_xml = ''' + {aaa} + {bbb} + {ccc} + {ddd} + '''.format( + aaa=self.simple_list_item.format( + content='AAA', + num_id=1, + ilvl=0 + ), + bbb=self.simple_list_item.format( + content='BBB', + num_id=1, + ilvl=1, + ), + ccc=self.simple_list_item.format( + content='CCC', + num_id=2, + ilvl=0, + ), + ddd=self.simple_list_item.format( + content='DDD', + num_id=1, + ilvl=1, + ), + ) + + numbering_xml = ''' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ''' + + document = WordprocessingDocumentFactory() + document.add(NumberingDefinitionsPart, numbering_xml) + document.add(MainDocumentPart, document_xml) + + expected_html = ''' +

+ AAA +
1. + BBB +
  1. CCC
  +
2. DDD
+

+ ''' + self.assert_document_generates_html(document, expected_html) + + def test_nested_separated_lists_different_level(self): + document_xml = ''' + {aaa} + {bbb} + {ccc} + {ddd} + '''.format( + aaa=self.simple_list_item.format( + content='AAA', + num_id=1, + ilvl=0 + ), + bbb=self.simple_list_item.format( + content='BBB', + num_id=2, + ilvl=1, + ), + ccc=self.simple_list_item.format( + content='CCC', + num_id=2, + ilvl=1, + ), + ddd=self.simple_list_item.format( + content='DDD', + num_id=1, + ilvl=0, + ), + ) + + numbering_xml = ''' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ''' + + document = WordprocessingDocumentFactory() + document.add(NumberingDefinitionsPart, numbering_xml) + document.add(MainDocumentPart, document_xml) + + expected_html = ''' +

+ AAA +
1. BBB
2. CCC
+
DDD

+ ''' + self.assert_document_generates_html(document, expected_html) + + class FakedNumberingManyItemsTestCase(NumberingTestBase, DocumentGeneratorTestCase): def assert_html(self, list_type, digit_generator): paragraphs = [] @@ -1386,7 +1886,7 @@ def test_real_nested_list_continuation_fake_nested_list_using_indentation(self): expected_html = '''

AAA +
AAA
1. BBB
2. CCC