Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 54bad44

Browse files
committed
Update to the latest version of Michael Ernst's script.
1 parent 3a7a3d7 commit 54bad44

1 file changed

Lines changed: 104 additions & 24 deletions

File tree

Doc/tools/html2texi.pl

Lines changed: 104 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
#! /usr/bin/env perl -w
1+
#! /usr/bin/env perl
22
# html2texi.pl -- Convert HTML documentation to Texinfo format
33
# Michael Ernst <[email protected]>
4-
# Time-stamp: <1998-09-10 12:52:38 mernst>
4+
# Time-stamp: <1999-01-12 21:34:27 mernst>
55

66
# This program converts HTML documentation trees into Texinfo format.
77
# Given the name of a main (or contents) HTML file, it processes that file,
88
# and other files (transitively) referenced by it, into a Texinfo file
99
# (whose name is chosen from the file or directory name of the argument).
1010
# For instance:
11-
# html2texi.pl api/index.pl
11+
# html2texi.pl api/index.html
1212
# produces file "api.texi".
1313

1414
# Texinfo format can be easily converted to Info format (for browsing in
@@ -23,16 +23,23 @@
2323
# and mouse-free browsing.
2424

2525
# Limitations:
26-
# html2texi.pl is currently tuned to latex2html output, but should be
27-
# extensible to arbitrary HTML documents. It will be most useful for HTML
28-
# with a hierarchical structure and an index. The HTML tree to be
29-
# traversed must be on local disk, rather than being accessed via HTTP.
26+
# html2texi.pl is currently tuned to latex2html output (and it corrects
27+
# several latex2html bugs), but should be extensible to arbitrary HTML
28+
# documents. It will be most useful for HTML with a hierarchical structure
29+
# and an index, and it recognizes those features as created by latex2html
30+
# (and possibly by some other tools). The HTML tree to be traversed must
31+
# be on local disk, rather than being accessed via HTTP.
3032
# This script requires the use of "checkargs.pm". To eliminate that
3133
# dependence, replace calls to check_args* by @_ (which is always the last
3234
# argument to those functions).
3335
# Also see the "to do" section, below.
3436
# Comments, suggestions, bug fixes, and enhancements are welcome.
3537

38+
# Troubleshooting:
39+
# Malformed HTML can cause this program to abort, so
40+
# you should check your HTML files to make sure they are legal.
41+
42+
3643
###
3744
### Typical usage for the Python documentation:
3845
###
@@ -41,7 +48,7 @@
4148
# The resulting Info format Python documentation is currently available at
4249
# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
4350

44-
# Fix up HTML problems, eg <DL COMPACT><DD>
51+
# Fix up HTML problems, eg <DT><DL COMPACT><DD> should be <DL COMPACT><DT><DD>.
4552

4653
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
4754
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
@@ -55,7 +62,7 @@
5562
# * fix up any sectioning, such as for Abstract
5663
# * make Texinfo menus
5764
# * perhaps remove the @detailmenu ... @end detailmenu
58-
# In Emacs:
65+
# In Emacs, to do all this:
5966
# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
6067

6168
# makeinfo api.texi
@@ -157,12 +164,10 @@
157164
require HTML::Element;
158165

159166
use File::Basename;
160-
use Cwd;
161167

162168
use strict;
163169
# use Carp;
164170

165-
166171
use checkargs;
167172

168173

@@ -290,7 +295,7 @@ ( )
290295
sub process_child_links ( $ )
291296
{ my ($he) = check_args(1, @_);
292297

293-
# $he->dump;
298+
# $he->dump();
294299
if (scalar(@current_contents_list) != 0)
295300
{ die "current_contents_list nonempty: @current_contents_list"; }
296301
$he->traverse(\&increment_current_contents_list, 'ignore text');
@@ -374,7 +379,7 @@ ( $ )
374379
$result .= "\}";
375380
return $result; }
376381
else
377-
{ $he->dump;
382+
{ $he->dump();
378383
die "html_to_texi confused by <$tag>"; }
379384
}
380385

@@ -477,7 +482,7 @@ ( $ )
477482
for (my $i = 0; $i < scalar(@content); $i++)
478483
{ my $this_he = $content[$i];
479484
if ($this_he->tag ne "dt")
480-
{ $this_he->dump;
485+
{ $this_he->dump();
481486
die "Expected <DT> tag: " . $this_he->tag; }
482487
if (($i < scalar(@content) - 1) && ($content[$i+1]->tag eq "dd"))
483488
{ process_index_dt_and_dd($this_he, $content[$i+1]);
@@ -792,14 +797,11 @@ ( $$$ )
792797
return 0; } }
793798
else
794799
{ if ($startflag)
795-
{ $he->dump;
796-
warn "Can't deal with internal HREF anchors yet"; }
800+
{ # cross-references are not active Info links, but no text is lost
801+
print STDERR "Can't deal with internal HREF anchors yet:\n";
802+
$he->dump; }
797803
}
798804
}
799-
elsif ($tag eq "address")
800-
{ # this is part of the page footer, ignore
801-
return 0;
802-
}
803805
elsif ($tag eq "br")
804806
{ print TEXI "\@\n"; }
805807
elsif ($tag eq "body")
@@ -852,7 +854,7 @@ ( $$$ )
852854
{ }
853855
if (scalar(@index_deferrers) != 0)
854856
{ $he->dump;
855-
die "index deferrers: ", join(" ", @index_deferrers); }
857+
die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
856858
do_deferred_index_entries();
857859
}
858860
elsif ($tag =~ /^(font|big|small)$/)
@@ -899,7 +901,8 @@ ( $$$ )
899901
# This should only happen once per file.
900902
label_add_index_entries("");
901903
if (scalar(@index_deferrers) != 0)
902-
{ die "index deferrers: ", join(" ", @index_deferrers); }
904+
{ $he->dump;
905+
die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
903906
do_deferred_index_entries();
904907
return 0;
905908
}
@@ -922,7 +925,8 @@ ( $$$ )
922925
{ if ($startflag)
923926
{ print TEXI "\n\n"; }
924927
if (scalar(@index_deferrers) != 0)
925-
{ die "index deferrers: ", join(" ", @index_deferrers); }
928+
{ $he->dump;
929+
die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
926930
do_deferred_index_entries(); }
927931
elsif ($tag eq "pre")
928932
{ print_pre($he);
@@ -969,7 +973,8 @@ ( $$$ )
969973
else
970974
{ print TEXI "\n\@end itemize\n"; } }
971975
else
972-
{ print STDERR "\nBailing out\n";
976+
{ # I used to have a newline before "output_body" here.
977+
print STDERR "output_body: ignoring <$tag> tag\n";
973978
$he->dump;
974979
return 0; }
975980

@@ -1202,6 +1207,7 @@ ( $ )
12021207
$he->traverse(\&delete_if_navigation, 'ignore text');
12031208
$he->traverse(\&delete_extra_spaces, 'ignore text');
12041209
$he->traverse(\&merge_dl, 'ignore text');
1210+
$he->traverse(\&reorder_dt_and_dl, 'ignore text');
12051211
return $he;
12061212
}
12071213

@@ -1276,6 +1282,78 @@ ( $ )
12761282
}
12771283

12781284

1285+
# LaTeX2HTML sometimes creates
1286+
# <DT>text
1287+
# <DL COMPACT><DD>text
1288+
# which should actually be:
1289+
# <DL COMPACT>
1290+
# <DT>text
1291+
# <DD>text
1292+
# Since an implicit <DL> gets added around the stray <DT>, this ends up looking like
1293+
# <P>
1294+
# <DL>
1295+
# <DT>
1296+
# text1...
1297+
# <DL COMPACT>
1298+
# <DD>
1299+
# text2...
1300+
# dt_or_dd1...
1301+
# dt_or_dd2...
1302+
# which should become
1303+
# <P>
1304+
# <DL COMPACT>
1305+
# <DT>
1306+
# text1...
1307+
# <DD>
1308+
# text2...
1309+
# dt_or_dd1...
1310+
# dt_or_dd2...
1311+
1312+
sub reorder_dt_and_dl ( $$$ )
1313+
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
1314+
if (!$startflag)
1315+
{ return; }
1316+
1317+
if ($he->tag() eq "p")
1318+
{ my $ref_pcontent = $he->content();
1319+
if (defined $ref_pcontent)
1320+
{ my @pcontent = @{$ref_pcontent};
1321+
# print "reorder_dt_and_dl found a <p>\n"; $he->dump();
1322+
if ((scalar(@pcontent) >= 1)
1323+
&& (ref $pcontent[0]) && ($pcontent[0]->tag() eq "dl")
1324+
&& $pcontent[0]->implicit())
1325+
{ my $ref_dlcontent = $pcontent[0]->content();
1326+
# print "reorder_dt_and_dl found a <p> and implicit <dl>\n";
1327+
if (defined $ref_dlcontent)
1328+
{ my @dlcontent = @{$ref_dlcontent};
1329+
if ((scalar(@dlcontent) >= 1)
1330+
&& (ref $dlcontent[0]) && ($dlcontent[0]->tag() eq "dt"))
1331+
{ my $ref_dtcontent = $dlcontent[0]->content();
1332+
# print "reorder_dt_and_dl found a <p>, implicit <dl>, and <dt>\n";
1333+
if (defined $ref_dtcontent)
1334+
{ my @dtcontent = @{$ref_dtcontent};
1335+
if ((scalar(@dtcontent) > 0)
1336+
&& (ref $dtcontent[$#dtcontent])
1337+
&& ($dtcontent[$#dtcontent]->tag() eq "dl"))
1338+
{ my $ref_dl2content = $dtcontent[$#dtcontent]->content();
1339+
# print "reorder_dt_and_dl found a <p>, implicit <dl>, <dt>, and <dl>\n";
1340+
if (defined $ref_dl2content)
1341+
{ my @dl2content = @{$ref_dl2content};
1342+
if ((scalar(@dl2content) > 0)
1343+
&& (ref ($dl2content[0]))
1344+
&& ($dl2content[0]->tag() eq "dd"))
1345+
{
1346+
# print "reorder_dt_and_dl found a <p>, implicit <dl>, <dt>, <dl>, and <dd>\n";
1347+
# print STDERR "CHANGING\n"; $he->dump();
1348+
html_replace_by_ignore($dtcontent[$#dtcontent]);
1349+
splice(@{$ref_dlcontent}, 1, 0, @dl2content);
1350+
# print STDERR "CHANGED TO:\n"; $he->dump();
1351+
return 0; # don't traverse children
1352+
} } } } } } } } }
1353+
return 1;
1354+
}
1355+
1356+
12791357
# If we find a paragraph that looks like
12801358
# <P>
12811359
# <HR>
@@ -1668,3 +1746,5 @@ ( $ )
16681746
{ die "Pass one argument, the main/contents page"; }
16691747

16701748
process_contents_file($ARGV[0]);
1749+
1750+
# end of html2texi.pl

0 commit comments

Comments
 (0)