Commit 802ffb1a authored by unknown's avatar unknown

Nice, relaxing Perl munging. :) Have another change to put into this

changeset - afaik, I can edit changesets with bk. So, rather than wait, I will
commit this now, finish up the other code, try to alter the changeset and then
push all the changes up.


Docs/Support/docbook-fixup.pl:
  Rewrote to use a more native Perl style.
  Increase strictness of error checking.
  Simplified and optimized regular expressions.
  Fixed several problems with conversion. Still have some minor issues to sort out.
  Rewrote to accept input from stdin or from filename arg(s) on the command line.
  Improved speed ~6x.
Docs/Support/docbook-split:
  Rewrote to use a more native Perl style.
  Increase strictness of error checking.
  Simplified and optimized.
  Rewrote to accept input from stdin or from filename arg(s) on the command line.
parent d2456bb4
#!/usr/bin/perl -w
# 2002-02-15 zak@mysql.com
# Use -w to make perl print useful warnings about the script being run
sub fix_underscore {
$str = shift;
$str =~ tr/_/-/;
return $str;
};
# Fix the output of `makeinfo --docbook` version 4.0c
# Convert the broken docbook output to well-formed XML that conforms to the O'Reilly idiom
# See code for detailed comments
# Authors: Arjen Lentz and Zak Greant
sub strip_emph {
$str = shift;
$str =~ s{<emphasis>(.+?)</emphasis>}
{$1}gs;
return $str;
};
use strict;
print STDERR "\n--Post-processing makeinfo output--\n";
my $data = '';
my @apx = ();
my $apx = '';
my @nodes = ();
my $nodes = '';
# 2002-02-15 zak@mysql.com
print STDERR "Discard DTD - ORA can add the appropriate DTD for their flavour of DocBook\n";
<STDIN>;
msg ("\n-- Post-processing `makeinfo --docbook` output --");
msg ("** Written to work with makeinfo version 4.0c **\n");
print STDERR "Slurp! In comes the rest of the file. :)\n";
$data = join "", <STDIN>;
msg ("Discarding DTD - not required by subsequent scripts");
# <> is a magic filehandle - either reading lines from stdin or from file(s) specified on the command line
<>;
# 2002-02-15 zak@mysql.com
print STDERR "Add an XML processing instruction with the right character encoding\n";
$data = "<?xml version='1.0' encoding='ISO-8859-1'?>" . $data;
msg ("Create an XML PI with ISO-8859-1 character encoding");
$data = "<?xml version='1.0' encoding='ISO-8859-1'?>";
# 2002-02-15 zak@mysql.com
# Less than optimal - should be fixed in makeinfo
print STDERR "Put in missing <bookinfo> and <abstract>\n";
$data =~ s/<book lang="en">/<book lang="en"><bookinfo><abstract>/gs;
msg ("Get the rest of the data");
$data = $data . join "", <>;
# 2002-02-15 zak@mysql.com
print STDERR "Convert existing ampersands to escape sequences \n";
$data =~ s/&(?!\w+;)/&amp;/gs;
msg ("Add missing <bookinfo> and <abstract> opening tags");
# Note the absence of the g (global) pattern modified. This situation can only happen once.
# ...as soon as we find the first instance, we can stop looking.
$data =~ s/<book lang="en">/<book lang="en"><bookinfo><abstract>/;
# 2002-02-15 zak@mysql.com
# Need to talk to Arjen about what the <n> bits are for
print STDERR "Rework references of the notation '<n>'\n";
$data =~ s/<(\d)>/[$1]/gs;
msg ("Removing mailto: from email addresses...");
$data =~ s/mailto://g;
# 2002-02-15 zak@mysql.com
# We might need to encode the high-bit characters to ensure proper representation
# print STDERR "Converting high-bit characters to entities\n";
# $data =~ s/([\200-\400])/&get_entity($1)>/gs;
# There is no get_entity function yet - no point writing it til we need it :)
msg ("Removing INFORMALFIGURE...");
$data =~ s{<informalfigure>.+?</informalfigure>}
{}gs;
print STDERR "Changing @@ to @...\n";
$data =~ s/@@/@/gs;
msg ("Convert ampersands to XML escape sequences ");
$data =~ s/&(?!\w+;)/&amp;/g;
print STDERR "Changing '_' to '-' in references...\n";
$data =~ s{id=\"(.+?)\"}
{"id=\"".&fix_underscore($1)."\""}gsex;
$data =~ s{linkend=\"(.+?)\"}
{"linkend=\"".&fix_underscore($1)."\""}gsex;
msg ("Changing @@ to @...");
$data =~ s/@@/@/g;
print STDERR "Changing ULINK to SYSTEMITEM...\n";
$data =~ s{<ulink url=\"(.+?)\"></ulink>}
{<systemitem role=\"url\">$1</systemitem>}gs;
msg ("Rework references of the notation '<n>'");
# Need to talk to Arjen about what the <n> bits are for
$data =~ s/<(\d)>/[$1]/g;
print STDERR "Removing INFORMALFIGURE...\n";
$data =~ s{<informalfigure>(.+?)</informalfigure>}
{}gs;
msg ("Changing '_' to '-' in references...");
$data =~ s{((?:id|linkend)=\".+?\")}
{&underscore2hyphen($1)}gex;
print STDERR "Adding PARA inside ENTRY...\n";
msg ("Changing ULINK to SYSTEMITEM...");
$data =~ s{<ulink url=\"(.+?)\">\s*</ulink>}
{<systemitem role=\"url\">$1</systemitem>}gs;
msg ("Adding PARA inside ENTRY...");
$data =~ s{<entry>(.*?)</entry>}
{<entry><para>$1</para></entry>}gs;
print STDERR "Removing mailto: from email addresses...\n";
$data =~ s{mailto:}
{}gs;
msg ("Fixing spacing problem with titles...");
$data =~ s{(</\w+>)(\w{2,})}
{$1 $2}gs;
print STDERR "Fixing spacing problem with titles...\n";
$data =~ s{</(\w+)>(\w{2,})}
{</$1> $2}gs;
msg ("Adding closing / to XREF and COLSPEC tags...");
$data =~ s{<(xref|colspec) (.+?)>}
{<$1 $2 />}gs;
# 2002-02-15 arjen@mysql.com
print STDERR "Adding closing / to XREF...\n";
$data =~ s{<xref (.+?)>}
{<xref $1 />}gs;
# 2002-02-22 arjen@mysql.com
print STDERR "Adding \"See \" to XREFs that used to be \@xref...\n";
$data =~ s{([\.\'\!\)])[\n ]*<xref }
# Probably need to strip these
msg ('Adding "See " to XREFs that used to be @xref...');
$data =~ s{([.'!)])\s*<xref }
{$1 See <xref }gs;
# 2002-02-22 arjen@mysql.com
print STDERR "Adding \"see \" to (XREFs) that used to be (\@pxref)...\n";
$data =~ s{(\(|[[,;])([\n]*[ ]*)<xref }
msg ('Adding "see " to (XREFs) that used to be (@pxref)...');
$data =~ s{([([,;])(\s*)<xref }
{$1$2see <xref }gs;
# 2002-01-30 arjen@mysql.com
print STDERR "Removing COLSPEC...\n";
$data =~ s{\n *<colspec colwidth=\"[0-9]+\*\">}
{}gs;
# 2002-01-31 arjen@mysql.com
print STDERR "Making first row in table THEAD...\n";
$data =~ s{([ ]*)<tbody>\n([ ]*<row>(.+?)</row>)}
{$1<thead>\n$2\n$1</thead>\n$1<tbody>}gs;
msg ("Making first row in table THEAD...");
$data =~ s{( *)<tbody>(\s*<row>.+?</row>)}
{$1<thead>$2\n$1</thead>\n$1<tbody>}gs;
# 2002-01-31 arjen@mysql.com
print STDERR "Removing EMPHASIS inside THEAD...\n";
msg ("Removing EMPHASIS inside THEAD...");
$data =~ s{<thead>(.+?)</thead>}
{"<thead>".&strip_emph($1)."</thead>"}gsex;
# 2002-01-31 arjen@mysql.com
print STDERR "Removing lf before /PARA in ENTRY...\n";
$data =~ s{(<entry><para>(.+?))\n(</para></entry>)}
{$1$3}gs;
{"<thead>".&strip_tag($1, 'emphasis')."</thead>"}gsex;
# 2002-01-31 arjen@mysql.com (2002-02-15 added \n stuff)
print STDERR "Removing whitespace before /PARA if not on separate line...\n";
$data =~ s{([^\n ])[ ]+</para>}
{$1</para>}gs;
msg ("Removing empty PARA...");
$data =~ s{<para>\s*</para>}
{}gs;
# 2002-01-31 arjen@mysql.com
print STDERR "Removing empty PARA in ENTRY...\n";
$data =~ s{<entry><para></para></entry>}
{<entry></entry>}gs;
msg ("Removing lf before /PARA in ENTRY...");
$data =~ s{\n(</para></entry>)}
{$1}gs;
# 2002-01-31 arjen@mysql.com
print STDERR "Removing PARA around INDEXENTRY if no text in PARA...\n";
$data =~ s{<para>((<indexterm role=\"(cp|fn)\">(<(primary|secondary)>[^<]+?</(primary|secondary)>)+?</indexterm>)+?)[\n]*</para>[\n]*}
{$1\n}gs;
msg ("Removing whitespace before /PARA if not on separate line...");
$data =~ s{(\S+)[\t ]+</para>}
{$1</para>}g;
# -----
msg ("Removing PARA around INDEXTERM if no text in PARA...");
$data =~ s{<para>((?:<indexterm role=\"(?:cp|fn)\">(?:<(primary|secondary)>[^>]+</\2>)+?</indexterm>)+?)\s*</para>}
{$1}gs;
@apx = ("Users", "MySQL Testimonials", "News",
"GPL-license", "LGPL-license");
@apx = ("Users", "MySQL Testimonials", "News", "GPL-license", "LGPL-license");
foreach $apx (@apx) {
print STDERR "Removing appendix $apx...\n";
msg ("Removing appendix $apx...");
$data =~ s{<appendix id=\"$apx\">(.+?)</appendix>}
{}gs;
print STDERR " ... Building list of removed nodes ...\n";
foreach(split "\n", $&) {
push @nodes, $2 if(/<(\w+) id=\"(.+?)\">/)
};
};
# Skip to next appendix regex if the regex did not match anything
next unless (defined $&);
msg ("...Building list of removed nodes...");
# Split the last bracketed regex match into an array
# Extract the node names from the tags and push them into an array
foreach (split "\n", $&) {
push @nodes, $1 if /<\w+ id=\"(.+?)\">/
}
}
# 2002-02-22 arjen@mysql.com (added fix " /" to end of regex, to make it match)
print STDERR "Fixing references to removed nodes...\n";
foreach $node (@nodes) {
$web = $node;
$web =~ s/[ ]/_/;
$web = "http://www.mysql.com/doc/" .
(join "/", (split //, $web)[0..1])."/$web.html";
print STDERR "$node -> $web\n";
$data =~ s{<(\w+) linkend=\"$node\" />}
{$web}gs;
};
msg ("Fixing references to removed nodes...");
# Merge the list of node names into a set of regex alternations
$nodes = join "|", @nodes;
# Find all references to removed nodes and convert them to absolute URLs
$data =~ s{<\w+ linkend="($nodes)" />}
{&xref2link($1)}ges;
print STDOUT $data;
exit;
#
# Definitions for helper sub-routines
#
sub msg {
print STDERR shift, "\n";
}
sub strip_tag($$) {
(my $str, my $tag) = @_;
$str =~ s{<$tag>(.+?)</$tag>}{$1}gs;
return $str;
}
sub underscore2hyphen($) {
my $str = shift;
$str =~ tr/_/-/;
return $str;
}
sub xref2link {
my $ref = shift;
$ref =~ tr/ /_/;
$ref =~ s{^((.)(.).+)$}{$2/$3/$1.html};
return "http://www.mysql.com/doc/" . $ref;
}
# We might need to encode the high-bit characters to ensure proper representation
# msg ("Converting high-bit characters to entities");
# $data =~ s/([\200-\400])/&get_entity($1)>/gs;
# There is no get_entity function yet - no point writing it til we need it :)
#! /usr/local/bin/perl
#! /usr/bin/perl -w
# O'Reilly's Perl script to chop mysql.xml into separate ch/apps/index files.
# The indexes are actually not used, they're created straight from the xrefs.
use strict;
# Breaks the MySQL reference manual into chapters, appendices, and indexes.
my $input_file;
my $directory;
my $chap_num;
my $app_letter;
my $start_text;
my $line;
my $input_file;
my $output_name;
$input_file = "mysql.xml";
$directory="chaps_apps_index";
$chap_num=1; # Start chapter numbers at one (there is no preface)
$app_letter="a"; # Start appendix letters at "a"
$start_text="";
$line="";
open (INPUT_FILE, '<' . $input_file) or die "Cannot open $input_file";
if (-d $directory) {
my $unlinked = unlink <$directory/*>;
printf(Removed "%d files\n", $unlinked);
}
else {
mkdir $directory or die "Cannot make $directory subdirectory";
}
use strict;
while (1) {
my $app_letter = "a"; # Start appendix letters at "a"
my $chap_num = 1; # Start chapter numbers at one (there is no preface)
my $directory = "chaps_apps_index";
my $ext = ".xml";
my $line = "";
my $output_name = "";
my $start_text = "";
# Terminating statement for loop.
exit if not defined $line;
mkdir $directory unless -d $directory;
if ($line =~ /(?:.*)(<chapter.*)/i ) {
while (defined $line) {
if ($line =~ /(<chapter.+)/i ) {
$start_text = $1;
$output_name = &make_chapter_name($chap_num);
$chap_num++;
$output_name = sprintf("ch%02d%s", $chap_num, $ext);
++$chap_num;
&process_file("chapter");
}
elsif ($line =~ /(?:.*)(<appendix.*)/i ) {
elsif ($line =~ /(<appendix.+)/i ) {
$start_text = $1 ;
$output_name = &make_appendix_name($app_letter);
$app_letter++;
$output_name = "app$app_letter$ext";
++$app_letter;
&process_file("appendix");
}
elsif ($line =~ /(?:.*)(<index\s+id=")(.*?)(">.*)/i ) {
elsif ($line =~ /(<index\s+id=")(.*?)(">.*)/i ) {
$start_text = $1 . $2 . $3;
$output_name = lc($2) . ".xml";
$output_name = lc($2) . $ext;
&process_file("index");
}
else {
# Automatically skips junk in between chapters, appendices,
# and indexes.
$line = <INPUT_FILE>;
# Skip junk in between chapters, appendices and indexes.
$line = <>;
}
}
sub make_chapter_name {
my $num = shift;
my $name = "ch" . sprintf("%02d", $num) . ".xml";
return $name;
}
sub process_file {
my $marker = shift;
my $path = "$directory/$output_name";
sub make_appendix_name {
my $letter = shift;
my $name = "app" . sprintf("%s", $letter) . ".xml";
return $name;
}
open (OUTPUT_FILE, ">$path") or die "Cannot open $path";
print STDERR "Creating $path\n";
# Print out XML PI
print OUTPUT_FILE "<?xml version='1.0' encoding='ISO-8859-1'?>\n";
sub process_file {
my $marker=shift;
open (OUTPUT_FILE, '>' . $directory . "/" . $output_name) or
die "Cannot open $output_name";
# Print whatever happened to appear at the end of the previous chapter.
print OUTPUT_FILE $start_text . "\n" if $start_text;
while (1) {
$line = <INPUT_FILE>;
exit if not defined $line;
print OUTPUT_FILE "$start_text\n" if $start_text;
while (defined $line) {
$line = <>;
# Note: Anything after the terminating marker is lost, just like
# lines in between chapters.
if ($line =~ /(.*<\/\s*$marker\s*>)/i ) {
print OUTPUT_FILE $1 . "\n" if $1;
print OUTPUT_FILE "$1\n" if $1;
close OUTPUT_FILE;
return;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment