Commit 633d54b6 authored by unknown's avatar unknown

Nice, relaxing Perl munging. :) Have another change to put into this

changeset - afaik, I can edit changesets with bk. So, rather than wait, I will
commit this now, finish up the other code, try to alter the changeset and then
push all the changes up.


Docs/Support/docbook-fixup.pl:
  Rewrote to use a more native Perl style.
  Increase strictness of error checking.
  Simplified and optimized regular expressions.
  Fixed several problems with conversion. Still have some minor issues to sort out.
  Rewrote to accept input from stdin or from filename arg(s) on the command line.
  Improved speed ~6x.
Docs/Support/docbook-split:
  Rewrote to use a more native Perl style.
  Increase strictness of error checking.
  Simplified and optimized.
  Rewrote to accept input from stdin or from filename arg(s) on the command line.
parent e8291479
#!/usr/bin/perl -w #!/usr/bin/perl -w
# 2002-02-15 zak@mysql.com
# Use -w to make perl print useful warnings about the script being run
sub fix_underscore { # Fix the output of `makeinfo --docbook` version 4.0c
$str = shift; # Convert the broken docbook output to well-formed XML that conforms to the O'Reilly idiom
$str =~ tr/_/-/; # See code for detailed comments
return $str; # Authors: Arjen Lentz and Zak Greant
};
sub strip_emph { use strict;
$str = shift;
$str =~ s{<emphasis>(.+?)</emphasis>}
{$1}gs;
return $str;
};
print STDERR "\n--Post-processing makeinfo output--\n"; my $data = '';
my @apx = ();
my $apx = '';
my @nodes = ();
my $nodes = '';
# 2002-02-15 zak@mysql.com msg ("\n-- Post-processing `makeinfo --docbook` output --");
print STDERR "Discard DTD - ORA can add the appropriate DTD for their flavour of DocBook\n"; msg ("** Written to work with makeinfo version 4.0c **\n");
<STDIN>;
print STDERR "Slurp! In comes the rest of the file. :)\n"; msg ("Discarding DTD - not required by subsequent scripts");
$data = join "", <STDIN>; # <> is a magic filehandle - either reading lines from stdin or from file(s) specified on the command line
<>;
# 2002-02-15 zak@mysql.com msg ("Create an XML PI with ISO-8859-1 character encoding");
print STDERR "Add an XML processing instruction with the right character encoding\n"; $data = "<?xml version='1.0' encoding='ISO-8859-1'?>";
$data = "<?xml version='1.0' encoding='ISO-8859-1'?>" . $data;
# 2002-02-15 zak@mysql.com msg ("Get the rest of the data");
# Less than optimal - should be fixed in makeinfo $data = $data . join "", <>;
print STDERR "Put in missing <bookinfo> and <abstract>\n";
$data =~ s/<book lang="en">/<book lang="en"><bookinfo><abstract>/gs;
# 2002-02-15 zak@mysql.com msg ("Add missing <bookinfo> and <abstract> opening tags");
print STDERR "Convert existing ampersands to escape sequences \n"; # Note the absence of the g (global) pattern modified. This situation can only happen once.
$data =~ s/&(?!\w+;)/&amp;/gs; # ...as soon as we find the first instance, we can stop looking.
$data =~ s/<book lang="en">/<book lang="en"><bookinfo><abstract>/;
# 2002-02-15 zak@mysql.com msg ("Removing mailto: from email addresses...");
# Need to talk to Arjen about what the <n> bits are for $data =~ s/mailto://g;
print STDERR "Rework references of the notation '<n>'\n";
$data =~ s/<(\d)>/[$1]/gs;
# 2002-02-15 zak@mysql.com msg ("Removing INFORMALFIGURE...");
# We might need to encode the high-bit characters to ensure proper representation $data =~ s{<informalfigure>.+?</informalfigure>}
# print STDERR "Converting high-bit characters to entities\n"; {}gs;
# $data =~ s/([\200-\400])/&get_entity($1)>/gs;
# There is no get_entity function yet - no point writing it til we need it :)
print STDERR "Changing @@ to @...\n"; msg ("Convert ampersands to XML escape sequences ");
$data =~ s/@@/@/gs; $data =~ s/&(?!\w+;)/&amp;/g;
print STDERR "Changing '_' to '-' in references...\n"; msg ("Changing @@ to @...");
$data =~ s{id=\"(.+?)\"} $data =~ s/@@/@/g;
{"id=\"".&fix_underscore($1)."\""}gsex;
$data =~ s{linkend=\"(.+?)\"}
{"linkend=\"".&fix_underscore($1)."\""}gsex;
print STDERR "Changing ULINK to SYSTEMITEM...\n"; msg ("Rework references of the notation '<n>'");
$data =~ s{<ulink url=\"(.+?)\"></ulink>} # Need to talk to Arjen about what the <n> bits are for
{<systemitem role=\"url\">$1</systemitem>}gs; $data =~ s/<(\d)>/[$1]/g;
print STDERR "Removing INFORMALFIGURE...\n"; msg ("Changing '_' to '-' in references...");
$data =~ s{<informalfigure>(.+?)</informalfigure>} $data =~ s{((?:id|linkend)=\".+?\")}
{}gs; {&underscore2hyphen($1)}gex;
print STDERR "Adding PARA inside ENTRY...\n"; msg ("Changing ULINK to SYSTEMITEM...");
$data =~ s{<ulink url=\"(.+?)\">\s*</ulink>}
{<systemitem role=\"url\">$1</systemitem>}gs;
msg ("Adding PARA inside ENTRY...");
$data =~ s{<entry>(.*?)</entry>} $data =~ s{<entry>(.*?)</entry>}
{<entry><para>$1</para></entry>}gs; {<entry><para>$1</para></entry>}gs;
print STDERR "Removing mailto: from email addresses...\n"; msg ("Fixing spacing problem with titles...");
$data =~ s{mailto:} $data =~ s{(</\w+>)(\w{2,})}
{}gs; {$1 $2}gs;
print STDERR "Fixing spacing problem with titles...\n"; msg ("Adding closing / to XREF and COLSPEC tags...");
$data =~ s{</(\w+)>(\w{2,})} $data =~ s{<(xref|colspec) (.+?)>}
{</$1> $2}gs; {<$1 $2 />}gs;
# 2002-02-15 arjen@mysql.com # Probably need to strip these
print STDERR "Adding closing / to XREF...\n"; msg ('Adding "See " to XREFs that used to be @xref...');
$data =~ s{<xref (.+?)>} $data =~ s{([.'!)])\s*<xref }
{<xref $1 />}gs;
# 2002-02-22 arjen@mysql.com
print STDERR "Adding \"See \" to XREFs that used to be \@xref...\n";
$data =~ s{([\.\'\!\)])[\n ]*<xref }
{$1 See <xref }gs; {$1 See <xref }gs;
# 2002-02-22 arjen@mysql.com msg ('Adding "see " to (XREFs) that used to be (@pxref)...');
print STDERR "Adding \"see \" to (XREFs) that used to be (\@pxref)...\n"; $data =~ s{([([,;])(\s*)<xref }
$data =~ s{(\(|[[,;])([\n]*[ ]*)<xref }
{$1$2see <xref }gs; {$1$2see <xref }gs;
# 2002-01-30 arjen@mysql.com msg ("Making first row in table THEAD...");
print STDERR "Removing COLSPEC...\n"; $data =~ s{( *)<tbody>(\s*<row>.+?</row>)}
$data =~ s{\n *<colspec colwidth=\"[0-9]+\*\">} {$1<thead>$2\n$1</thead>\n$1<tbody>}gs;
{}gs;
# 2002-01-31 arjen@mysql.com
print STDERR "Making first row in table THEAD...\n";
$data =~ s{([ ]*)<tbody>\n([ ]*<row>(.+?)</row>)}
{$1<thead>\n$2\n$1</thead>\n$1<tbody>}gs;
# 2002-01-31 arjen@mysql.com msg ("Removing EMPHASIS inside THEAD...");
print STDERR "Removing EMPHASIS inside THEAD...\n";
$data =~ s{<thead>(.+?)</thead>} $data =~ s{<thead>(.+?)</thead>}
{"<thead>".&strip_emph($1)."</thead>"}gsex; {"<thead>".&strip_tag($1, 'emphasis')."</thead>"}gsex;
# 2002-01-31 arjen@mysql.com
print STDERR "Removing lf before /PARA in ENTRY...\n";
$data =~ s{(<entry><para>(.+?))\n(</para></entry>)}
{$1$3}gs;
# 2002-01-31 arjen@mysql.com (2002-02-15 added \n stuff) msg ("Removing empty PARA...");
print STDERR "Removing whitespace before /PARA if not on separate line...\n"; $data =~ s{<para>\s*</para>}
$data =~ s{([^\n ])[ ]+</para>} {}gs;
{$1</para>}gs;
# 2002-01-31 arjen@mysql.com msg ("Removing lf before /PARA in ENTRY...");
print STDERR "Removing empty PARA in ENTRY...\n"; $data =~ s{\n(</para></entry>)}
$data =~ s{<entry><para></para></entry>} {$1}gs;
{<entry></entry>}gs;
# 2002-01-31 arjen@mysql.com msg ("Removing whitespace before /PARA if not on separate line...");
print STDERR "Removing PARA around INDEXENTRY if no text in PARA...\n"; $data =~ s{(\S+)[\t ]+</para>}
$data =~ s{<para>((<indexterm role=\"(cp|fn)\">(<(primary|secondary)>[^<]+?</(primary|secondary)>)+?</indexterm>)+?)[\n]*</para>[\n]*} {$1</para>}g;
{$1\n}gs;
# ----- msg ("Removing PARA around INDEXTERM if no text in PARA...");
$data =~ s{<para>((?:<indexterm role=\"(?:cp|fn)\">(?:<(primary|secondary)>[^>]+</\2>)+?</indexterm>)+?)\s*</para>}
{$1}gs;
@apx = ("Users", "MySQL Testimonials", "News", @apx = ("Users", "MySQL Testimonials", "News", "GPL-license", "LGPL-license");
"GPL-license", "LGPL-license");
foreach $apx (@apx) { foreach $apx (@apx) {
print STDERR "Removing appendix $apx...\n"; msg ("Removing appendix $apx...");
$data =~ s{<appendix id=\"$apx\">(.+?)</appendix>} $data =~ s{<appendix id=\"$apx\">(.+?)</appendix>}
{}gs; {}gs;
print STDERR " ... Building list of removed nodes ...\n"; # Skip to next appendix regex if the regex did not match anything
foreach(split "\n", $&) { next unless (defined $&);
push @nodes, $2 if(/<(\w+) id=\"(.+?)\">/)
}; msg ("...Building list of removed nodes...");
};
# Split the last bracketed regex match into an array
# Extract the node names from the tags and push them into an array
foreach (split "\n", $&) {
push @nodes, $1 if /<\w+ id=\"(.+?)\">/
}
}
# 2002-02-22 arjen@mysql.com (added fix " /" to end of regex, to make it match) # 2002-02-22 arjen@mysql.com (added fix " /" to end of regex, to make it match)
print STDERR "Fixing references to removed nodes...\n"; msg ("Fixing references to removed nodes...");
foreach $node (@nodes) { # Merge the list of node names into a set of regex alternations
$web = $node; $nodes = join "|", @nodes;
$web =~ s/[ ]/_/;
$web = "http://www.mysql.com/doc/" . # Find all references to removed nodes and convert them to absolute URLs
(join "/", (split //, $web)[0..1])."/$web.html"; $data =~ s{<\w+ linkend="($nodes)" />}
print STDERR "$node -> $web\n"; {&xref2link($1)}ges;
$data =~ s{<(\w+) linkend=\"$node\" />}
{$web}gs;
};
print STDOUT $data; print STDOUT $data;
exit;
#
# Definitions for helper sub-routines
#
sub msg {
print STDERR shift, "\n";
}
sub strip_tag($$) {
(my $str, my $tag) = @_;
$str =~ s{<$tag>(.+?)</$tag>}{$1}gs;
return $str;
}
sub underscore2hyphen($) {
my $str = shift;
$str =~ tr/_/-/;
return $str;
}
sub xref2link {
my $ref = shift;
$ref =~ tr/ /_/;
$ref =~ s{^((.)(.).+)$}{$2/$3/$1.html};
return "http://www.mysql.com/doc/" . $ref;
}
# We might need to encode the high-bit characters to ensure proper representation
# msg ("Converting high-bit characters to entities");
# $data =~ s/([\200-\400])/&get_entity($1)>/gs;
# There is no get_entity function yet - no point writing it til we need it :)
#! /usr/local/bin/perl #! /usr/bin/perl -w
# O'Reilly's Perl script to chop mysql.xml into separate ch/apps/index files. # O'Reilly's Perl script to chop mysql.xml into separate ch/apps/index files.
# The indexes are actually not used, they're created straight from the xrefs. # The indexes are actually not used, they're created straight from the xrefs.
use strict;
# Breaks the MySQL reference manual into chapters, appendices, and indexes. # Breaks the MySQL reference manual into chapters, appendices, and indexes.
my $input_file; use strict;
my $directory;
my $chap_num;
my $app_letter;
my $start_text;
my $line;
my $input_file;
my $output_name;
$input_file = "mysql.xml";
$directory="chaps_apps_index";
$chap_num=1; # Start chapter numbers at one (there is no preface)
$app_letter="a"; # Start appendix letters at "a"
$start_text="";
$line="";
open (INPUT_FILE, '<' . $input_file) or die "Cannot open $input_file";
if (-d $directory) {
my $unlinked = unlink <$directory/*>;
printf(Removed "%d files\n", $unlinked);
}
else {
mkdir $directory or die "Cannot make $directory subdirectory";
}
while (1) { my $app_letter = "a"; # Start appendix letters at "a"
my $chap_num = 1; # Start chapter numbers at one (there is no preface)
my $directory = "chaps_apps_index";
my $ext = ".xml";
my $line = "";
my $output_name = "";
my $start_text = "";
# Terminating statement for loop. mkdir $directory unless -d $directory;
exit if not defined $line;
if ($line =~ /(?:.*)(<chapter.*)/i ) { while (defined $line) {
if ($line =~ /(<chapter.+)/i ) {
$start_text = $1; $start_text = $1;
$output_name = &make_chapter_name($chap_num); $output_name = sprintf("ch%02d%s", $chap_num, $ext);
$chap_num++; ++$chap_num;
&process_file("chapter"); &process_file("chapter");
} }
elsif ($line =~ /(?:.*)(<appendix.*)/i ) { elsif ($line =~ /(<appendix.+)/i ) {
$start_text = $1 ; $start_text = $1 ;
$output_name = &make_appendix_name($app_letter); $output_name = "app$app_letter$ext";
$app_letter++; ++$app_letter;
&process_file("appendix"); &process_file("appendix");
} }
elsif ($line =~ /(?:.*)(<index\s+id=")(.*?)(">.*)/i ) { elsif ($line =~ /(<index\s+id=")(.*?)(">.*)/i ) {
$start_text = $1 . $2 . $3; $start_text = $1 . $2 . $3;
$output_name = lc($2) . ".xml"; $output_name = lc($2) . $ext;
&process_file("index"); &process_file("index");
} }
else { else {
# Automatically skips junk in between chapters, appendices, # Skip junk in between chapters, appendices and indexes.
# and indexes. $line = <>;
$line = <INPUT_FILE>;
} }
} }
sub make_chapter_name { sub process_file {
my $num = shift; my $marker = shift;
my $name = "ch" . sprintf("%02d", $num) . ".xml"; my $path = "$directory/$output_name";
return $name;
}
sub make_appendix_name { open (OUTPUT_FILE, ">$path") or die "Cannot open $path";
my $letter = shift;
my $name = "app" . sprintf("%s", $letter) . ".xml"; print STDERR "Creating $path\n";
return $name;
} # Print out XML PI
print OUTPUT_FILE "<?xml version='1.0' encoding='ISO-8859-1'?>\n";
sub process_file {
my $marker=shift;
open (OUTPUT_FILE, '>' . $directory . "/" . $output_name) or
die "Cannot open $output_name";
# Print whatever happened to appear at the end of the previous chapter. # Print whatever happened to appear at the end of the previous chapter.
print OUTPUT_FILE $start_text . "\n" if $start_text; print OUTPUT_FILE "$start_text\n" if $start_text;
while (1) {
$line = <INPUT_FILE>; while (defined $line) {
exit if not defined $line; $line = <>;
# Note: Anything after the terminating marker is lost, just like # Note: Anything after the terminating marker is lost, just like
# lines in between chapters. # lines in between chapters.
if ($line =~ /(.*<\/\s*$marker\s*>)/i ) { if ($line =~ /(.*<\/\s*$marker\s*>)/i ) {
print OUTPUT_FILE $1 . "\n" if $1; print OUTPUT_FILE "$1\n" if $1;
close OUTPUT_FILE; close OUTPUT_FILE;
return; return;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment