Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit fcc5910

Browse files
committed
Preliminary code to mark paragraphs. Seems to work, but very slow.
1 parent 5c35520 commit fcc5910

1 file changed

Lines changed: 150 additions & 2 deletions

File tree

Doc/tools/sgmlconv/docfixer.py

Lines changed: 150 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
import xml.dom.esis_builder
2121

2222

23+
# When true, the paragraph fix-up code writes trace messages to sys.stderr.
DEBUG_PARA_FIXER = 0
24+
25+
2326
# Workaround to deal with invalid documents (multiple root elements). This
2427
# does not indicate a bug in the DOM implementation.
2528
#
@@ -323,12 +326,157 @@ def cleanup_synopses(doc):
323326
create_module_info(doc, node)
324327

325328

329+
# Sectioning elements whose content is scanned for paragraph material.
FIXUP_PARA_ELEMENTS = (
    "chapter",
    "section",
    "subsection",
    "subsubsection",
    "paragraph",
    "subparagraph",
    )
333+
334+
# Elements that sit at the same level as a paragraph; they end a run of
# paragraph material and are never wrapped inside a <para>.
PARA_LEVEL_ELEMENTS = (
    "moduleinfo",
    "title",
    "opcodedesc",
    "verbatim",
    "funcdesc",
    "methoddesc",
    "excdesc",
    "datadesc",
    "funcdescni",
    "methoddescni",
    "excdescni",
    "datadescni",
    "tableii",
    "tableiii",
    "tableiv",
    "localmoduletable",
    "sectionauthor",
    # include <para>, so we can just do it again to get subsequent paras:
    "para",
    )
343+
344+
# Elements that may come before paragraph material and are skipped over
# (together with PARA_LEVEL_ELEMENTS) when looking for where a paragraph
# starts.
PARA_LEVEL_PRECEEDERS = (
    "index",
    "indexii",
    "indexiii",
    "indexiv",
    "stindex",
    "obindex",
    "COMMENT",
    "label",
    )
348+
326349
def fixup_paras(doc):
    """Mark paragraphs throughout the document.

    Every top-level child of `doc` that is one of FIXUP_PARA_ELEMENTS is
    handed to fixup_paras_helper().  After that, every <description>
    element found below any top-level element child is processed the same
    way.

    `doc` is the document node; it is also passed through to the helper,
    which uses it to create the new <para> elements.
    """
    # First pass: the sectioning elements themselves.
    for child in doc.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT \
           and child.tagName in FIXUP_PARA_ELEMENTS:
            fixup_paras_helper(doc, child)
    # Second pass: <description> elements.  Only element nodes are asked
    # for descendants; the lookup is never made on a text or comment node,
    # and never on a loop variable left over from the first pass (which
    # would be unbound for an empty document and would cover only the last
    # child otherwise).
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        descriptions = child.getElementsByTagName("description")
        for description in descriptions:
            if DEBUG_PARA_FIXER:
                sys.stderr.write("-- Fixing up <description> element...\n")
            fixup_paras_helper(doc, description)
359+
360+
361+
def fixup_paras_helper(doc, container):
    """Wrap the paragraph material at the front of `container` in <para>s.

    The children of `container` are scanned up to the first child that is
    one of FIXUP_PARA_ELEMENTS (that child is processed recursively and
    ends the scan).  Leading children that are PARA_LEVEL_ELEMENTS or
    PARA_LEVEL_PRECEEDERS are stepped over; the remaining run is handed to
    build_para(), which wraps one paragraph.  The scan is then repeated
    until no paragraph material is left.

    The repetition is a loop rather than a call to this function for each
    paragraph, so the stack depth does not grow with the number of
    paragraphs in a container.
    """
    # document is already normalized
    SKIP_ELEMENTS = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    while 1:
        # start: index of the first child that may begin a paragraph
        # start_fixed: true once real content has been seen, after which
        #              `start` no longer moves
        # i: index one past the last child examined
        start = 0
        start_fixed = 0
        i = 0
        for child in container.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                if child.tagName in FIXUP_PARA_ELEMENTS:
                    # nested sectioning element: handle it on its own and
                    # stop; it is not counted in `i`
                    fixup_paras_helper(doc, child)
                    break
                elif child.tagName in SKIP_ELEMENTS:
                    if not start_fixed:
                        start = i + 1
                elif not start_fixed:
                    start_fixed = 1
            elif child.nodeType == xml.dom.core.TEXT \
                 and string.strip(child.data) and not start_fixed:
                # non-blank text also pins the start position
                start_fixed = 1
            i = i + 1
        if DEBUG_PARA_FIXER:
            sys.stderr.write("fixup_paras_helper() called on <%s>; %d, %d\n"
                             % (container.tagName, start, i))
        if i <= start:
            return
        # the first [start:i] children should be rewritten as <para>
        # elements; start by breaking text nodes that contain \n\n+ into
        # multiple nodes
        nstart, i = skip_leading_nodes(container.childNodes, start, i)
        if i <= nstart:
            return
        build_para(doc, container, nstart, i)
        # go around again to pick up the next paragraph, if any
394+
395+
396+
def build_para(doc, parent, start, i):
    """Move one paragraph's worth of children of `parent` into a new <para>.

    Children are collected from index `start` up to (not including) index
    `i`, stopping early at an element listed in PARA_LEVEL_ELEMENTS or
    FIXUP_PARA_ELEMENTS, or at a text node containing "\\n\\n" (the text
    node is split there if the break is not at its beginning).  Trailing
    white space of a final text node is split off and left outside the
    paragraph.  The new <para>, created through `doc`, takes the place of
    the collected nodes in `parent`.
    """
    children = parent.childNodes
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + FIXUP_PARA_ELEMENTS
    # collect all children until \n\n+ is found in a text node or a
    # PARA_LEVEL_ELEMENT is found.
    after = start + 1
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == xml.dom.core.ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
        elif nodeType == xml.dom.core.TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                after = j
                break
            if pos >= 1:
                # keep the text before the blank line; the rest becomes a
                # new sibling that is left for a later call
                child.splitText(pos)
                break
    # Only look at the last collected node when something was collected;
    # with after == start the index after - 1 would refer to a node outside
    # the run (or wrap around to the end of the list when start is 0).
    if after > start \
       and children[after - 1].nodeType == xml.dom.core.TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        stripped = string.rstrip(data)
        if stripped != data:
            child.splitText(len(stripped))
    # fetch the child list again now that the splitting is done
    children = parent.childNodes
    para = doc.createElement("para")
    prev = None
    # Move the nodes last-to-first so the remaining indexes stay valid;
    # each node is inserted in front of the one moved just before it.
    j = after - 1
    while j >= start:
        node = children[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
        j = j - 1
    # Put the paragraph where the moved nodes used to be.  Appending is
    # only right when nothing follows; `i` can stop short of the end of
    # the child list, and an unconditional append would then move the
    # paragraph behind the nodes that follow it.
    remaining = parent.childNodes
    if start < len(remaining):
        parent.insertBefore(para, remaining[start])
    else:
        parent.appendChild(para)
442+
443+
444+
def skip_leading_nodes(children, start, i):
    """Step over nodes that cannot begin a paragraph.

    Starting at index `start` and going no further than `i` (clamped to
    len(children)), comments, all-whitespace text nodes and elements
    listed in PARA_LEVEL_ELEMENTS or PARA_LEVEL_PRECEEDERS are passed
    over.  A text node with leading white space followed by other text is
    split in two; the returned indexes are then both one higher to account
    for the added node.

    Returns the pair (start, i) of adjusted indexes.
    """
    skippable = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    i = min(i, len(children))
    while start < i:
        # skip over leading comments and whitespace:
        try:
            node = children[start]
        except IndexError:
            sys.stderr.write(
                "skip_leading_nodes() failed at index %d\n" % start)
            raise
        kind = node.nodeType
        if kind == xml.dom.core.TEXT:
            text = node.data
            stripped = string.lstrip(text)
            if stripped:
                if stripped == text:
                    # text begins right away: this is the start
                    return start, i
                # break into two nodes: whitespace and non-whitespace
                node.splitText(len(text) - len(stripped))
                return start + 1, i + 1
            # all whitespace, just skip
        elif kind == xml.dom.core.ELEMENT:
            if node.tagName not in skippable:
                return start, i
        elif kind != xml.dom.core.COMMENT:
            # any other kind of node ends the skipping
            return start, i
        start = start + 1
    return start, i
328476

329477

330478
# Pattern for a letter followed by any number of letters, digits, "." or "-"
# up to the end of the string.  NOTE(review): there is no "^" anchor, so this
# presumably relies on being used with .match() -- confirm at the call site.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
331-
479+
332480
def write_esis(doc, ofp, knownempty):
333481
for node in doc.childNodes:
334482
nodeType = node.nodeType

0 commit comments

Comments
 (0)