Nice, relaxing Perl munging. :) Have another change to put into this

changeset - afaik, I can edit changesets with bk. So, rather than wait, I will commit this now, finish up the other code, try to alter the changeset and then push all the changes up. Docs/Support/docbook-fixup.pl: Rewrote to use a more native Perl style. Increase strictness of error checking. Simplified and optimized regular expressions. Fixed several problems with conversion. Still have some minor issues to sort out. Rewrote to accept input from stdin or from filename arg(s) on the command line. Improved speed ~6x. Docs/Support/docbook-split: Rewrote to use a more native Perl style. Increase strictness of error checking. Simplified and optimized. Rewrote to accept input from stdin or from filename arg(s) on the command line.

Nice, relaxing Perl munging. :) Have another change to put into this
changeset - afaik, I can edit changesets with bk. So, rather than wait, I will commit this now, finish up the other code, try to alter the changeset and then push all the changes up. Docs/Support/docbook-fixup.pl: Rewrote to use a more native Perl style. Increase strictness of error checking. Simplified and optimized regular expressions. Fixed several problems with conversion. Still have some minor issues to sort out. Rewrote to accept input from stdin or from filename arg(s) on the command line. Improved speed ~6x. Docs/Support/docbook-split: Rewrote to use a more native Perl style. Increase strictness of error checking. Simplified and optimized. Rewrote to accept input from stdin or from filename arg(s) on the command line.
633d54b6 · unknown · e8291479 · 633d54b6 · 633d54b6
Commit 633d54b6 authored Feb 27, 2002 by unknown
Show whitespace changes
Inline Side-by-side

Showing with 163 additions and 182 deletions

Docs/Support/docbook-fixup.pl Docs/Support/docbook-fixup.pl +127 -120

Docs/Support/docbook-split Docs/Support/docbook-split +36 -62

No files found.
--- a/Docs/Support/docbook-fixup.pl
+++ b/Docs/Support/docbook-fixup.pl
 #!/usr/bin/perl -w
-# 2002-02-15 zak@mysql.com
-# Use -w to make perl print useful warnings about the script being run

-sub fix_underscore {
-  $str = shift;
-  $str =~ tr/_/-/;
-  return $str;
-};
+# Fix the output of `makeinfo --docbook` version 4.0c
+# Convert the broken docbook output to well-formed XML that conforms to the O'Reilly idiom
+# See code for detailed comments
+# Authors: Arjen Lentz and Zak Greant

-sub strip_emph {
-  $str = shift;
-  $str =~ s{<emphasis>(.+?)</emphasis>}
-           {$1}gs;
-  return $str;
-};
+use strict;

-print STDERR "\n--Post-processing makeinfo output--\n";
+my $data  = '';
+my @apx   = ();
+my $apx   = '';
+my @nodes = ();
+my $nodes = '';

-# 2002-02-15 zak@mysql.com
-print STDERR "Discard DTD - ORA can add the appropriate DTD for their flavour of DocBook\n";
-<STDIN>;
+msg ("\n-- Post-processing `makeinfo --docbook` output --");
+msg ("** Written to work with makeinfo version 4.0c **\n");

-print STDERR "Slurp! In comes the rest of the file. :)\n";
-$data = join "", <STDIN>;
+msg ("Discarding DTD - not required by subsequent scripts");
+# <> is a magic filehandle - either reading lines from stdin or from file(s) specified on the command line
+<>;

-# 2002-02-15 zak@mysql.com
-print STDERR "Add an XML processing instruction with the right character encoding\n";
-$data = "<?xml version='1.0' encoding='ISO-8859-1'?>" . $data;
+msg ("Create an XML PI with ISO-8859-1 character encoding");
+$data = "<?xml version='1.0' encoding='ISO-8859-1'?>";

-# 2002-02-15 zak@mysql.com
-# Less than optimal - should be fixed in makeinfo
-print STDERR "Put in missing <bookinfo> and <abstract>\n";
-$data =~ s/<book lang="en">/<book lang="en"><bookinfo><abstract>/gs;
+msg ("Get the rest of the data");
+$data = $data . join "", <>;

-# 2002-02-15 zak@mysql.com
-print STDERR "Convert existing ampersands to escape sequences \n";
-$data =~ s/&(?!\w+;)/&amp;/gs;
+msg ("Add missing <bookinfo> and <abstract> opening tags");
+# Note the absence of the g (global) pattern modified. This situation can only happen once.
+# ...as soon as we find the first instance, we can stop looking.
+$data =~ s/<book lang="en">/<book lang="en"><bookinfo><abstract>/;

-# 2002-02-15 zak@mysql.com
-# Need to talk to Arjen about what the <n> bits are for
-print STDERR "Rework references of the notation '<n>'\n";
-$data =~ s/<(\d)>/[$1]/gs;
+msg ("Removing mailto: from email addresses...");
+$data =~ s/mailto://g;

-# 2002-02-15 zak@mysql.com
-# We might need to encode the high-bit characters to ensure proper representation
-# print STDERR "Converting high-bit characters to entities\n";
-# $data =~ s/([\200-\400])/&get_entity($1)>/gs;
-# There is no get_entity function yet - no point writing it til we need it :)
+msg ("Removing INFORMALFIGURE...");
+$data =~ s{<informalfigure>.+?</informalfigure>}
+          {}gs;

-print STDERR "Changing @@ to @...\n";
-$data =~ s/@@/@/gs;
+msg ("Convert ampersands to XML escape sequences ");
+$data =~ s/&(?!\w+;)/&amp;/g;
  
-print STDERR "Changing '_' to '-' in references...\n";
-$data =~ s{id=\"(.+?)\"}
-          {"id=\"".&fix_underscore($1)."\""}gsex;
-$data =~ s{linkend=\"(.+?)\"}
-          {"linkend=\"".&fix_underscore($1)."\""}gsex;
+msg ("Changing @@ to @...");
+$data =~ s/@@/@/g;

-print STDERR "Changing ULINK to SYSTEMITEM...\n";
-$data =~ s{<ulink url=\"(.+?)\"></ulink>}
-          {<systemitem role=\"url\">$1</systemitem>}gs;
+msg ("Rework references of the notation '<n>'");
+# Need to talk to Arjen about what the <n> bits are for
+$data =~ s/<(\d)>/[$1]/g;

-print STDERR "Removing INFORMALFIGURE...\n";
-$data =~ s{<informalfigure>(.+?)</informalfigure>}
-          {}gs;
+msg ("Changing '_' to '-' in references...");
+$data =~ s{((?:id|linkend)=\".+?\")}
+          {&underscore2hyphen($1)}gex;

-print STDERR "Adding PARA inside ENTRY...\n";
+msg ("Changing ULINK to SYSTEMITEM...");
+$data =~ s{<ulink url=\"(.+?)\">\s*</ulink>}
+          {<systemitem role=\"url\">$1</systemitem>}gs;
+
+msg ("Adding PARA inside ENTRY...");
 $data =~ s{<entry>(.*?)</entry>}
          {<entry><para>$1</para></entry>}gs;

-print STDERR "Removing mailto: from email addresses...\n";
-$data =~ s{mailto:}
-          {}gs;
+msg ("Fixing spacing problem with titles...");
+$data =~ s{(</\w+>)(\w{2,})}
+          {$1 $2}gs;

-print STDERR "Fixing spacing problem with titles...\n";
-$data =~ s{</(\w+)>(\w{2,})}
-          {</$1> $2}gs;
+msg ("Adding closing / to XREF and COLSPEC tags...");
+$data =~ s{<(xref|colspec) (.+?)>}
+          {<$1 $2 />}gs;

-# 2002-02-15 arjen@mysql.com
-print STDERR "Adding closing / to XREF...\n";
-$data =~ s{<xref (.+?)>}
-          {<xref $1 />}gs;
-
-# 2002-02-22 arjen@mysql.com
-print STDERR "Adding \"See \" to XREFs that used to be \@xref...\n";
-$data =~ s{([\.\'\!\)])[\n ]*<xref }
+# Probably need to strip these
+msg ('Adding "See " to XREFs that used to be @xref...');
+$data =~ s{([.'!)])\s*<xref }
          {$1 See <xref }gs;

-# 2002-02-22 arjen@mysql.com
-print STDERR "Adding \"see \" to (XREFs) that used to be (\@pxref)...\n";
-$data =~ s{(\(|[[,;])([\n]*[ ]*)<xref }
+msg ('Adding "see " to (XREFs) that used to be (@pxref)...');
+$data =~ s{([([,;])(\s*)<xref }
          {$1$2see <xref }gs;

-# 2002-01-30 arjen@mysql.com
-print STDERR "Removing COLSPEC...\n";
-$data =~ s{\n *<colspec colwidth=\"[0-9]+\*\">}
-          {}gs;
-
-# 2002-01-31 arjen@mysql.com
-print STDERR "Making first row in table THEAD...\n";
-$data =~ s{([ ]*)<tbody>\n([ ]*<row>(.+?)</row>)}
-          {$1<thead>\n$2\n$1</thead>\n$1<tbody>}gs;
+msg ("Making first row in table THEAD...");
+$data =~ s{( *)<tbody>(\s*<row>.+?</row>)}
+          {$1<thead>$2\n$1</thead>\n$1<tbody>}gs;

-# 2002-01-31 arjen@mysql.com
-print STDERR "Removing EMPHASIS inside THEAD...\n";
+msg ("Removing EMPHASIS inside THEAD...");
 $data =~ s{<thead>(.+?)</thead>}
-          {"<thead>".&strip_emph($1)."</thead>"}gsex;
-
-# 2002-01-31 arjen@mysql.com
-print STDERR "Removing lf before /PARA in ENTRY...\n";
-$data =~ s{(<entry><para>(.+?))\n(</para></entry>)}
-          {$1$3}gs;
+          {"<thead>".&strip_tag($1, 'emphasis')."</thead>"}gsex;

-# 2002-01-31 arjen@mysql.com (2002-02-15 added \n stuff)
-print STDERR "Removing whitespace before /PARA if not on separate line...\n";
-$data =~ s{([^\n ])[ ]+</para>}
-          {$1</para>}gs;
+msg ("Removing empty PARA...");
+$data =~ s{<para>\s*</para>}
+          {}gs;

-# 2002-01-31 arjen@mysql.com
-print STDERR "Removing empty PARA in ENTRY...\n";
-$data =~ s{<entry><para></para></entry>}
-          {<entry></entry>}gs;
+msg ("Removing lf before /PARA in ENTRY...");
+$data =~ s{\n(</para></entry>)}
+          {$1}gs;

-# 2002-01-31 arjen@mysql.com
-print STDERR "Removing PARA around INDEXENTRY if no text in PARA...\n";
-$data =~ s{<para>((<indexterm role=\"(cp|fn)\">(<(primary|secondary)>[^<]+?</(primary|secondary)>)+?</indexterm>)+?)[\n]*</para>[\n]*}
-          {$1\n}gs;
+msg ("Removing whitespace before /PARA if not on separate line...");
+$data =~ s{(\S+)[\t ]+</para>}
+          {$1</para>}g;

-# -----
+msg ("Removing PARA around INDEXTERM if no text in PARA...");
+$data =~ s{<para>((?:<indexterm role=\"(?:cp|fn)\">(?:<(primary|secondary)>[^>]+</\2>)+?</indexterm>)+?)\s*</para>}
+          {$1}gs;

-@apx = ("Users", "MySQL Testimonials", "News",
-        "GPL-license", "LGPL-license");
+@apx = ("Users", "MySQL Testimonials", "News", "GPL-license", "LGPL-license");

 foreach $apx (@apx) {
-  print STDERR "Removing appendix $apx...\n";
+    msg ("Removing appendix $apx...");
    $data =~ s{<appendix id=\"$apx\">(.+?)</appendix>}
              {}gs;

-  print STDERR " ... Building list of removed nodes ...\n";
-  foreach(split "\n", $&) {
-    push @nodes, $2 if(/<(\w+) id=\"(.+?)\">/)
-  };
-};
+    # Skip to next appendix regex if the regex did not match anything
+    next unless (defined $&);
+    
+    msg ("...Building list of removed nodes...");
+    
+    # Split the last bracketed regex match into an array
+    # Extract the node names from the tags and push them into an array
+    foreach (split "\n", $&) {
+        push @nodes, $1 if /<\w+ id=\"(.+?)\">/
+    }
+}

 # 2002-02-22 arjen@mysql.com (added fix " /" to end of regex, to make it match)
-print STDERR "Fixing references to removed nodes...\n";
-foreach $node (@nodes) {
-  $web = $node;
-  $web =~ s/[ ]/_/;
-  $web = "http://www.mysql.com/doc/" .
-         (join "/", (split //, $web)[0..1])."/$web.html";
-  print STDERR "$node -> $web\n";
-  $data =~ s{<(\w+) linkend=\"$node\" />}
-            {$web}gs;
-};
+msg ("Fixing references to removed nodes...");
+# Merge the list of node names into a set of regex alternations
+$nodes = join "|", @nodes;
+
+# Find all references to removed nodes and convert them to absolute URLs
+$data =~ s{<\w+ linkend="($nodes)" />}
+          {&xref2link($1)}ges;

 print STDOUT $data;
+exit;
+
+#
+# Definitions for helper sub-routines
+#
+
+sub msg {
+    print STDERR shift, "\n";
+}
+
+sub strip_tag($$) {
+    (my $str, my $tag) = @_;
+    $str =~ s{<$tag>(.+?)</$tag>}{$1}gs;
+    return $str;
+}
+
+sub underscore2hyphen($) {
+    my $str = shift;
+    $str =~ tr/_/-/;
+    return $str;
+}
+
+sub xref2link {
+    my $ref = shift;
+    $ref =~ tr/ /_/;
+    $ref =~ s{^((.)(.).+)$}{$2/$3/$1.html};
+    return "http://www.mysql.com/doc/" . $ref;
+}
+
+# We might need to encode the high-bit characters to ensure proper representation
+# msg ("Converting high-bit characters to entities");
+# $data =~ s/([\200-\400])/&get_entity($1)>/gs;
+# There is no get_entity function yet - no point writing it til we need it :)
--- a/Docs/Support/docbook-split
+++ b/Docs/Support/docbook-split
-#! /usr/local/bin/perl
+#! /usr/bin/perl -w
 # O'Reilly's Perl script to chop mysql.xml into separate ch/apps/index files.
 # The indexes are actually not used, they're created straight from the xrefs.
-
-use strict;
-
 # Breaks the MySQL reference manual into chapters, appendices, and indexes.

-my $input_file;
-my $directory;
-my $chap_num;
-my $app_letter;
-my $start_text;
-my $line;
-my $input_file;
-my $output_name;
-
-$input_file = "mysql.xml";
-$directory="chaps_apps_index";
-$chap_num=1;       # Start chapter numbers at one (there is no preface)
-$app_letter="a";   # Start appendix letters at "a"
-$start_text="";
-$line="";
-
-open (INPUT_FILE, '<' . $input_file) or die "Cannot open $input_file";
-
-if (-d $directory) {
-    my $unlinked = unlink <$directory/*>;
-    printf(Removed "%d files\n", $unlinked);
-}
-else {
-    mkdir $directory or die "Cannot make $directory subdirectory";
-}
+use strict;

-while (1) {
+my $app_letter      = "a";                                  # Start appendix letters at "a"
+my $chap_num        = 1;                                    # Start chapter numbers at one (there is no preface)
+my $directory       = "chaps_apps_index";
+my $ext             = ".xml";
+my $line            = "";
+my $output_name     = "";
+my $start_text      = "";

-    # Terminating statement for loop.
-    exit if not defined $line;
+mkdir $directory unless -d $directory;

-    if ($line =~ /(?:.*)(<chapter.*)/i ) {
+while (defined $line) {
+    if ($line =~ /(<chapter.+)/i ) {
        $start_text = $1;
-        $output_name = &make_chapter_name($chap_num);
-        $chap_num++;
+        $output_name = sprintf("ch%02d%s", $chap_num, $ext);
+        ++$chap_num;
        &process_file("chapter");
    }
-    elsif ($line =~ /(?:.*)(<appendix.*)/i ) {
+    elsif ($line =~ /(<appendix.+)/i ) {
        $start_text = $1 ;
-        $output_name = &make_appendix_name($app_letter);
-        $app_letter++;
+        $output_name = "app$app_letter$ext";
+        ++$app_letter;
        &process_file("appendix");
    }
-    elsif ($line =~ /(?:.*)(<index\s+id=")(.*?)(">.*)/i ) {
+    elsif ($line =~ /(<index\s+id=")(.*?)(">.*)/i ) {
        $start_text = $1 . $2 . $3;
-        $output_name = lc($2) . ".xml";
+        $output_name = lc($2) . $ext;
        &process_file("index");
    }
    else {
-        # Automatically skips junk in between chapters, appendices,
-        # and indexes.
-        $line = <INPUT_FILE>;
+        # Skip junk in between chapters, appendices and indexes.
+        $line = <>;
    }
 }

-sub make_chapter_name {
-    my $num = shift;
-    my $name = "ch" . sprintf("%02d", $num) . ".xml";
-    return $name;
-}
+sub process_file {
+    my $marker  = shift;
+    my $path    = "$directory/$output_name";

-sub make_appendix_name {
-    my $letter = shift;
-    my $name = "app" . sprintf("%s", $letter) . ".xml";
-    return $name;
-}
+    open (OUTPUT_FILE, ">$path") or die "Cannot open $path";
+
+    print STDERR "Creating $path\n";
+
+    # Print out XML PI
+    print OUTPUT_FILE "<?xml version='1.0' encoding='ISO-8859-1'?>\n";
   
-sub process_file {
-    my $marker=shift;
-    open (OUTPUT_FILE, '>' . $directory . "/" . $output_name) or
-        die "Cannot open $output_name";
    # Print whatever happened to appear at the end of the previous chapter.
-    print OUTPUT_FILE $start_text . "\n" if $start_text;
-    while (1) {
-        $line = <INPUT_FILE>;
-        exit if not defined $line;
+    print OUTPUT_FILE "$start_text\n" if $start_text;
+    
+    while (defined $line) {
+        $line = <>;
+
        # Note: Anything after the terminating marker is lost, just like
        # lines in between chapters.
        if ($line =~ /(.*<\/\s*$marker\s*>)/i ) {
-            print OUTPUT_FILE $1 . "\n" if $1;
+            print OUTPUT_FILE "$1\n" if $1;
            close OUTPUT_FILE;
            return;
        }