1- # ! /usr/bin/env perl -w
1+ # ! /usr/bin/env perl
22# html2texi.pl -- Convert HTML documentation to Texinfo format
33# Michael Ernst <[email protected]>
4- # Time-stamp: <1998-09-10 12:52:38 mernst>
4+ # Time-stamp: <1999-01-12 21:34:27 mernst>
55
66# This program converts HTML documentation trees into Texinfo format.
77# Given the name of a main (or contents) HTML file, it processes that file,
88# and other files (transitively) referenced by it, into a Texinfo file
99# (whose name is chosen from the file or directory name of the argument).
1010# For instance:
11- # html2texi.pl api/index.pl
11+ # html2texi.pl api/index.html
1212# produces file "api.texi".
1313
1414# Texinfo format can be easily converted to Info format (for browsing in
2323# and mouse-free browsing.
2424
2525# Limitations:
26- # html2texi.pl is currently tuned to latex2html output, but should be
27- # extensible to arbitrary HTML documents. It will be most useful for HTML
28- # with a hierarchical structure and an index. The HTML tree to be
29- # traversed must be on local disk, rather than being accessed via HTTP.
26+ # html2texi.pl is currently tuned to latex2html output (and it corrects
27+ # several latex2html bugs), but should be extensible to arbitrary HTML
28+ # documents. It will be most useful for HTML with a hierarchical structure
29+ # and an index, and it recognizes those features as created by latex2html
30+ # (and possibly by some other tools). The HTML tree to be traversed must
31+ # be on local disk, rather than being accessed via HTTP.
3032# This script requires the use of "checkargs.pm". To eliminate that
3133# dependence, replace calls to check_args* by @_ (which is always the last
3234# argument to those functions).
3335# Also see the "to do" section, below.
3436# Comments, suggestions, bug fixes, and enhancements are welcome.
3537
38+ # Troubleshooting:
39+ # Malformed HTML can cause this program to abort, so
40+ # you should check that your HTML files are well-formed before converting them.
41+
42+
3643# ##
3744# ## Typical usage for the Python documentation:
3845# ##
4148# The resulting Info format Python documentation is currently available at
4249# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
4350
44- # Fix up HTML problems, eg <DL COMPACT><DD>
51+ # Fix up HTML problems, e.g. <DT><DL COMPACT><DD> should be <DL COMPACT><DT><DD>.
4552
4653# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
4754# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
5562# * fix up any sectioning, such as for Abstract
5663# * make Texinfo menus
5764# * perhaps remove the @detailmenu ... @end detailmenu
58- # In Emacs:
65+ # In Emacs, to do all this:
5966# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
6067
6168# makeinfo api.texi
157164require HTML::Element;
158165
159166use File::Basename;
160- use Cwd;
161167
162168use strict;
163169# use Carp;
164170
165-
166171use checkargs;
167172
168173
290295sub process_child_links ( $ )
291296{ my ($he ) = check_args(1, @_ );
292297
293- # $he->dump;
298+ # $he->dump() ;
294299 if (scalar (@current_contents_list ) != 0)
295300 { die " current_contents_list nonempty: @current_contents_list " ; }
296301 $he -> traverse(\&increment_current_contents_list, ' ignore text' );
@@ -374,7 +379,7 @@ ( $ )
374379 $result .= " \} " ;
375380 return $result ; }
376381 else
377- { $he -> dump ;
382+ { $he -> dump () ;
378383 die " html_to_texi confused by <$tag >" ; }
379384}
380385
@@ -477,7 +482,7 @@ ( $ )
477482 for (my $i = 0; $i < scalar (@content ); $i ++)
478483 { my $this_he = $content [$i ];
479484 if ($this_he -> tag ne " dt" )
480- { $this_he -> dump ;
485+ { $this_he -> dump () ;
481486 die " Expected <DT> tag: " . $this_he -> tag; }
482487 if (($i < scalar (@content ) - 1) && ($content [$i +1]-> tag eq " dd" ))
483488 { process_index_dt_and_dd($this_he , $content [$i +1]);
@@ -792,14 +797,11 @@ ( $$$ )
792797 return 0; } }
793798 else
794799 { if ($startflag )
795- { $he -> dump ;
796- warn " Can't deal with internal HREF anchors yet" ; }
800+ { # cross-references are not active Info links, but no text is lost
801+ print STDERR " Can't deal with internal HREF anchors yet:\n " ;
802+ $he -> dump ; }
797803 }
798804 }
799- elsif ($tag eq " address" )
800- { # this is part of the page footer, ignore
801- return 0;
802- }
803805 elsif ($tag eq " br" )
804806 { print TEXI " \@\n " ; }
805807 elsif ($tag eq " body" )
@@ -852,7 +854,7 @@ ( $$$ )
852854 { }
853855 if (scalar (@index_deferrers ) != 0)
854856 { $he -> dump ;
855- die " index deferrers: " , join (" " , @index_deferrers ); }
857+ die " Unexpected < $tag > while inside: ( " . join (" " , @index_deferrers ) . " ); bad HTML? " ; }
856858 do_deferred_index_entries();
857859 }
858860 elsif ($tag =~ / ^(font|big|small)$ / )
@@ -899,7 +901,8 @@ ( $$$ )
899901 # This should only happen once per file.
900902 label_add_index_entries(" " );
901903 if (scalar (@index_deferrers ) != 0)
902- { die " index deferrers: " , join (" " , @index_deferrers ); }
904+ { $he -> dump ;
905+ die " Unexpected <$tag > while inside: (" . join (" " , @index_deferrers ) . " ); bad HTML?" ; }
903906 do_deferred_index_entries();
904907 return 0;
905908 }
@@ -922,7 +925,8 @@ ( $$$ )
922925 { if ($startflag )
923926 { print TEXI " \n\n " ; }
924927 if (scalar (@index_deferrers ) != 0)
925- { die " index deferrers: " , join (" " , @index_deferrers ); }
928+ { $he -> dump ;
929+ die " Unexpected <$tag > while inside: (" . join (" " , @index_deferrers ) . " ); bad HTML?" ; }
926930 do_deferred_index_entries(); }
927931 elsif ($tag eq " pre" )
928932 { print_pre($he );
@@ -969,7 +973,8 @@ ( $$$ )
969973 else
970974 { print TEXI " \n\@ end itemize\n " ; } }
971975 else
972- { print STDERR " \n Bailing out\n " ;
976+ { # I used to have a newline before "output_body" here.
977+ print STDERR " output_body: ignoring <$tag > tag\n " ;
973978 $he -> dump ;
974979 return 0; }
975980
@@ -1202,6 +1207,7 @@ ( $ )
12021207 $he -> traverse(\&delete_if_navigation, ' ignore text' );
12031208 $he -> traverse(\&delete_extra_spaces, ' ignore text' );
12041209 $he -> traverse(\&merge_dl, ' ignore text' );
1210+ $he -> traverse(\&reorder_dt_and_dl, ' ignore text' );
12051211 return $he ;
12061212}
12071213
@@ -1276,6 +1282,78 @@ ( $ )
12761282}
12771283
12781284
1285+ # LaTeX2HTML sometimes creates
1286+ # <DT>text
1287+ # <DL COMPACT><DD>text
1288+ # which should actually be:
1289+ # <DL COMPACT>
1290+ # <DT>text
1291+ # <DD>text
1292+ # Since the HTML parser adds an implicit <DL> around the stray <DT>, the parse tree ends up looking like
1293+ # <P>
1294+ # <DL>
1295+ # <DT>
1296+ # text1...
1297+ # <DL COMPACT>
1298+ # <DD>
1299+ # text2...
1300+ # dt_or_dd1...
1301+ # dt_or_dd2...
1302+ # which should become
1303+ # <P>
1304+ # <DL COMPACT>
1305+ # <DT>
1306+ # text1...
1307+ # <DD>
1308+ # text2...
1309+ # dt_or_dd1...
1310+ # dt_or_dd2...
1311+
# Traversal callback; it is installed with $he->traverse(\&reorder_dt_and_dl, ...).
# Arguments: the element being visited, the start flag, and a depth value
#   (check_args requires all three, but the depth is not used here).
# Returns: nothing when $startflag is false; 0 after the nested <dl>'s
#   children have been spliced into the enclosing implicit <dl>; 1 otherwise.
# NOTE(review): the tag literals below carry a leading space in this copy
#   (" p", " dl", " dt", " dd"); that looks like a formatting artifact --
#   confirm they read "p", "dl", "dt", "dd" in the real file.
1312+ sub reorder_dt_and_dl ( $$ $ )
1313+ { my ($he , $startflag ) = (check_args(3, @_ ))[0,1]; # ignore depth argument
# Act only on the "start" visit of an element.
1314+ if (!$startflag )
1315+ { return ; }
1316+
# Step 1: the visited element must be a <p> that has content.
1317+ if ($he -> tag() eq " p" )
1318+ { my $ref_pcontent = $he -> content();
1319+ if (defined $ref_pcontent )
1320+ { my @pcontent = @{$ref_pcontent };
1321+ # print "reorder_dt_and_dl found a <p>\n"; $he->dump();
# Step 2: its first child must be an element (ref), a <dl>, and implicit.
1322+ if ((scalar (@pcontent ) >= 1)
1323+ && (ref $pcontent [0]) && ($pcontent [0]-> tag() eq " dl" )
1324+ && $pcontent [0]-> implicit())
1325+ { my $ref_dlcontent = $pcontent [0]-> content();
1326+ # print "reorder_dt_and_dl found a <p> and implicit <dl>\n";
1327+ if (defined $ref_dlcontent )
1328+ { my @dlcontent = @{$ref_dlcontent };
# Step 3: the implicit <dl>'s first child must be a <dt> element.
1329+ if ((scalar (@dlcontent ) >= 1)
1330+ && (ref $dlcontent [0]) && ($dlcontent [0]-> tag() eq " dt" ))
1331+ { my $ref_dtcontent = $dlcontent [0]-> content();
1332+ # print "reorder_dt_and_dl found a <p>, implicit <dl>, and <dt>\n";
1333+ if (defined $ref_dtcontent )
1334+ { my @dtcontent = @{$ref_dtcontent };
# Step 4: the LAST child of that <dt> must be a nested <dl> element.
1335+ if ((scalar (@dtcontent ) > 0)
1336+ && (ref $dtcontent [$#dtcontent ])
1337+ && ($dtcontent [$#dtcontent ]-> tag() eq " dl" ))
1338+ { my $ref_dl2content = $dtcontent [$#dtcontent ]-> content();
1339+ # print "reorder_dt_and_dl found a <p>, implicit <dl>, <dt>, and <dl>\n";
1340+ if (defined $ref_dl2content )
1341+ { my @dl2content = @{$ref_dl2content };
# Step 5: the nested <dl>'s first child must be a <dd> element.
1342+ if ((scalar (@dl2content ) > 0)
1343+ && (ref ($dl2content [0]))
1344+ && ($dl2content [0]-> tag() eq " dd" ))
1345+ {
1346+ # print "reorder_dt_and_dl found a <p>, implicit <dl>, <dt>, <dl>, and <dd>\n";
1347+ # print STDERR "CHANGING\n"; $he->dump();
# Neutralize the nested <dl> (html_replace_by_ignore is defined elsewhere;
# presumably it makes the element be skipped on output -- TODO confirm).
1348+ html_replace_by_ignore($dtcontent [$#dtcontent ]);
# Insert the nested <dl>'s former children into the outer <dl> at index 1,
# i.e. directly after the <dt>.  The splice goes through $ref_dlcontent, not
# the @dlcontent copy, so it changes the list that content() returned.
1349+ splice (@{$ref_dlcontent }, 1, 0, @dl2content );
1350+ # print STDERR "CHANGED TO:\n"; $he->dump();
1351+ return 0; # don't traverse children
1352+ } } } } } } } } }
# Pattern not matched: return 1.
1353+ return 1;
1354+ }
1355+
1356+
12791357# If we find a paragraph that looks like
12801358# <P>
12811359# <HR>
@@ -1668,3 +1746,5 @@ ( $ )
16681746{ die " Pass one argument, the main/contents page" ; }
16691747
16701748process_contents_file($ARGV [0]);
1749+
1750+ # end of html2texi.pl
0 commit comments