Commit 7f8b3ad3 authored by zak@linux.local's avatar zak@linux.local

Nice, relaxing Perl munging. :) Have another change to put into this

changeset - afaik, I can edit changesets with bk. So, rather than wait, I will
commit this now, finish up the other code, try to alter the changeset and then
push all the changes up.
parent c630531b
#!/usr/bin/perl -w #!/usr/bin/perl -w
# 2002-02-15 zak@mysql.com
# Use -w to make perl print useful warnings about the script being run
sub fix_underscore { # Fix the output of `makeinfo --docbook` version 4.0c
$str = shift; # Convert the broken docbook output to well-formed XML that conforms to the O'Reilly idiom
$str =~ tr/_/-/; # See code for detailed comments
return $str; # Authors: Arjen Lentz and Zak Greant
};
sub strip_emph { use strict;
$str = shift;
$str =~ s{<emphasis>(.+?)</emphasis>}
{$1}gs;
return $str;
};
print STDERR "\n--Post-processing makeinfo output--\n"; my $data = '';
my @apx = ();
my $apx = '';
my @nodes = ();
my $nodes = '';
# 2002-02-15 zak@mysql.com msg ("\n-- Post-processing `makeinfo --docbook` output --");
print STDERR "Discard DTD - ORA can add the appropriate DTD for their flavour of DocBook\n"; msg ("** Written to work with makeinfo version 4.0c **\n");
<STDIN>;
print STDERR "Slurp! In comes the rest of the file. :)\n"; msg ("Discarding DTD - not required by subsequent scripts");
$data = join "", <STDIN>; # <> is a magic filehandle - either reading lines from stdin or from file(s) specified on the command line
<>;
# 2002-02-15 zak@mysql.com msg ("Create an XML PI with ISO-8859-1 character encoding");
print STDERR "Add an XML processing instruction with the right character encoding\n"; $data = "<?xml version='1.0' encoding='ISO-8859-1'?>";
$data = "<?xml version='1.0' encoding='ISO-8859-1'?>" . $data;
# 2002-02-15 zak@mysql.com msg ("Get the rest of the data");
# Less than optimal - should be fixed in makeinfo $data = $data . join "", <>;
print STDERR "Put in missing <bookinfo> and <abstract>\n";
$data =~ s/<book lang="en">/<book lang="en"><bookinfo><abstract>/gs;
# 2002-02-15 zak@mysql.com msg ("Add missing <bookinfo> and <abstract> opening tags");
print STDERR "Convert existing ampersands to escape sequences \n"; # Note the absence of the g (global) pattern modified. This situation can only happen once.
$data =~ s/&(?!\w+;)/&amp;/gs; # ...as soon as we find the first instance, we can stop looking.
$data =~ s/<book lang="en">/<book lang="en"><bookinfo><abstract>/;
# 2002-02-15 zak@mysql.com msg ("Removing mailto: from email addresses...");
# Need to talk to Arjen about what the <n> bits are for $data =~ s/mailto://g;
print STDERR "Rework references of the notation '<n>'\n";
$data =~ s/<(\d)>/[$1]/gs; msg ("Removing INFORMALFIGURE...");
$data =~ s{<informalfigure>.+?</informalfigure>}
{}gs;
msg ("Convert ampersands to XML escape sequences ");
$data =~ s/&(?!\w+;)/&amp;/g;
# 2002-02-15 zak@mysql.com msg ("Changing @@ to @...");
# We might need to encode the high-bit characters to ensure proper representation $data =~ s/@@/@/g;
# print STDERR "Converting high-bit characters to entities\n";
# $data =~ s/([\200-\400])/&get_entity($1)>/gs;
# There is no get_entity function yet - no point writing it til we need it :)
print STDERR "Changing @@ to @...\n"; msg ("Rework references of the notation '<n>'");
$data =~ s/@@/@/gs; # Need to talk to Arjen about what the <n> bits are for
$data =~ s/<(\d)>/[$1]/g;
print STDERR "Changing '_' to '-' in references...\n"; msg ("Changing '_' to '-' in references...");
$data =~ s{id=\"(.+?)\"} $data =~ s{((?:id|linkend)=\".+?\")}
{"id=\"".&fix_underscore($1)."\""}gsex; {&underscore2hyphen($1)}gex;
$data =~ s{linkend=\"(.+?)\"}
{"linkend=\"".&fix_underscore($1)."\""}gsex;
print STDERR "Changing ULINK to SYSTEMITEM...\n"; msg ("Changing ULINK to SYSTEMITEM...");
$data =~ s{<ulink url=\"(.+?)\"></ulink>} $data =~ s{<ulink url=\"(.+?)\">\s*</ulink>}
{<systemitem role=\"url\">$1</systemitem>}gs; {<systemitem role=\"url\">$1</systemitem>}gs;
print STDERR "Removing INFORMALFIGURE...\n"; msg ("Adding PARA inside ENTRY...");
$data =~ s{<informalfigure>(.+?)</informalfigure>}
{}gs;
print STDERR "Adding PARA inside ENTRY...\n";
$data =~ s{<entry>(.*?)</entry>} $data =~ s{<entry>(.*?)</entry>}
{<entry><para>$1</para></entry>}gs; {<entry><para>$1</para></entry>}gs;
print STDERR "Removing mailto: from email addresses...\n"; msg ("Fixing spacing problem with titles...");
$data =~ s{mailto:} $data =~ s{(</\w+>)(\w{2,})}
{}gs; {$1 $2}gs;
print STDERR "Fixing spacing problem with titles...\n";
$data =~ s{</(\w+)>(\w{2,})}
{</$1> $2}gs;
# 2002-02-15 arjen@mysql.com msg ("Adding closing / to XREF and COLSPEC tags...");
print STDERR "Adding closing / to XREF...\n"; $data =~ s{<(xref|colspec) (.+?)>}
$data =~ s{<xref (.+?)>} {<$1 $2 />}gs;
{<xref $1 />}gs;
# 2002-02-22 arjen@mysql.com # Probably need to strip these
print STDERR "Adding \"See \" to XREFs that used to be \@xref...\n"; msg ('Adding "See " to XREFs that used to be @xref...');
$data =~ s{([\.\'\!\)])[\n ]*<xref } $data =~ s{([.'!)])\s*<xref }
{$1 See <xref }gs; {$1 See <xref }gs;
# 2002-02-22 arjen@mysql.com msg ('Adding "see " to (XREFs) that used to be (@pxref)...');
print STDERR "Adding \"see \" to (XREFs) that used to be (\@pxref)...\n"; $data =~ s{([([,;])(\s*)<xref }
$data =~ s{(\(|[[,;])([\n]*[ ]*)<xref }
{$1$2see <xref }gs; {$1$2see <xref }gs;
# 2002-01-30 arjen@mysql.com msg ("Making first row in table THEAD...");
print STDERR "Removing COLSPEC...\n"; $data =~ s{( *)<tbody>(\s*<row>.+?</row>)}
$data =~ s{\n *<colspec colwidth=\"[0-9]+\*\">} {$1<thead>$2\n$1</thead>\n$1<tbody>}gs;
{}gs;
# 2002-01-31 arjen@mysql.com
print STDERR "Making first row in table THEAD...\n";
$data =~ s{([ ]*)<tbody>\n([ ]*<row>(.+?)</row>)}
{$1<thead>\n$2\n$1</thead>\n$1<tbody>}gs;
# 2002-01-31 arjen@mysql.com msg ("Removing EMPHASIS inside THEAD...");
print STDERR "Removing EMPHASIS inside THEAD...\n";
$data =~ s{<thead>(.+?)</thead>} $data =~ s{<thead>(.+?)</thead>}
{"<thead>".&strip_emph($1)."</thead>"}gsex; {"<thead>".&strip_tag($1, 'emphasis')."</thead>"}gsex;
# 2002-01-31 arjen@mysql.com msg ("Removing empty PARA...");
print STDERR "Removing lf before /PARA in ENTRY...\n"; $data =~ s{<para>\s*</para>}
$data =~ s{(<entry><para>(.+?))\n(</para></entry>)} {}gs;
{$1$3}gs;
# 2002-01-31 arjen@mysql.com (2002-02-15 added \n stuff)
print STDERR "Removing whitespace before /PARA if not on separate line...\n";
$data =~ s{([^\n ])[ ]+</para>}
{$1</para>}gs;
# 2002-01-31 arjen@mysql.com msg ("Removing lf before /PARA in ENTRY...");
print STDERR "Removing empty PARA in ENTRY...\n"; $data =~ s{\n(</para></entry>)}
$data =~ s{<entry><para></para></entry>} {$1}gs;
{<entry></entry>}gs;
# 2002-01-31 arjen@mysql.com msg ("Removing whitespace before /PARA if not on separate line...");
print STDERR "Removing PARA around INDEXENTRY if no text in PARA...\n"; $data =~ s{(\S+)[\t ]+</para>}
$data =~ s{<para>((<indexterm role=\"(cp|fn)\">(<(primary|secondary)>[^<]+?</(primary|secondary)>)+?</indexterm>)+?)[\n]*</para>[\n]*} {$1</para>}g;
{$1\n}gs;
# ----- msg ("Removing PARA around INDEXTERM if no text in PARA...");
$data =~ s{<para>((?:<indexterm role=\"(?:cp|fn)\">(?:<(primary|secondary)>[^>]+</\2>)+?</indexterm>)+?)\s*</para>}
{$1}gs;
@apx = ("Users", "MySQL Testimonials", "News", @apx = ("Users", "MySQL Testimonials", "News", "GPL-license", "LGPL-license");
"GPL-license", "LGPL-license");
foreach $apx (@apx) { foreach $apx (@apx) {
print STDERR "Removing appendix $apx...\n"; msg ("Removing appendix $apx...");
$data =~ s{<appendix id=\"$apx\">(.+?)</appendix>} $data =~ s{<appendix id=\"$apx\">(.+?)</appendix>}
{}gs; {}gs;
print STDERR " ... Building list of removed nodes ...\n"; # Skip to next appendix regex if the regex did not match anything
foreach(split "\n", $&) { next unless (defined $&);
push @nodes, $2 if(/<(\w+) id=\"(.+?)\">/)
}; msg ("...Building list of removed nodes...");
};
# Split the last bracketed regex match into an array
# Extract the node names from the tags and push them into an array
foreach (split "\n", $&) {
push @nodes, $1 if /<\w+ id=\"(.+?)\">/
}
}
# 2002-02-22 arjen@mysql.com (added fix " /" to end of regex, to make it match) # 2002-02-22 arjen@mysql.com (added fix " /" to end of regex, to make it match)
print STDERR "Fixing references to removed nodes...\n"; msg ("Fixing references to removed nodes...");
foreach $node (@nodes) { # Merge the list of node names into a set of regex alternations
$web = $node; $nodes = join "|", @nodes;
$web =~ s/[ ]/_/;
$web = "http://www.mysql.com/doc/" . # Find all references to removed nodes and convert them to absolute URLs
(join "/", (split //, $web)[0..1])."/$web.html"; $data =~ s{<\w+ linkend="($nodes)" />}
print STDERR "$node -> $web\n"; {&xref2link($1)}ges;
$data =~ s{<(\w+) linkend=\"$node\" />}
{$web}gs;
};
print STDOUT $data; print STDOUT $data;
exit;
#
# Definitions for helper sub-routines
#
sub msg {
print STDERR shift, "\n";
}
sub strip_tag($$) {
(my $str, my $tag) = @_;
$str =~ s{<$tag>(.+?)</$tag>}{$1}gs;
return $str;
}
sub underscore2hyphen($) {
my $str = shift;
$str =~ tr/_/-/;
return $str;
}
sub xref2link {
my $ref = shift;
$ref =~ tr/ /_/;
$ref =~ s{^((.)(.).+)$}{$2/$3/$1.html};
return "http://www.mysql.com/doc/" . $ref;
}
# We might need to encode the high-bit characters to ensure proper representation
# msg ("Converting high-bit characters to entities");
# $data =~ s/([\200-\400])/&get_entity($1)>/gs;
# There is no get_entity function yet - no point writing it til we need it :)
#! /usr/local/bin/perl #! /usr/bin/perl -w
# O'Reilly's Perl script to chop mysql.xml into separate ch/apps/index files. # O'Reilly's Perl script to chop mysql.xml into separate ch/apps/index files.
# The indexes are actually not used, they're created straight from the xrefs. # The indexes are actually not used, they're created straight from the xrefs.
use strict;
# Breaks the MySQL reference manual into chapters, appendices, and indexes. # Breaks the MySQL reference manual into chapters, appendices, and indexes.
my $input_file; use strict;
my $directory;
my $chap_num;
my $app_letter;
my $start_text;
my $line;
my $input_file;
my $output_name;
$input_file = "mysql.xml";
$directory="chaps_apps_index";
$chap_num=1; # Start chapter numbers at one (there is no preface)
$app_letter="a"; # Start appendix letters at "a"
$start_text="";
$line="";
open (INPUT_FILE, '<' . $input_file) or die "Cannot open $input_file";
if (-d $directory) {
my $unlinked = unlink <$directory/*>;
printf(Removed "%d files\n", $unlinked);
}
else {
mkdir $directory or die "Cannot make $directory subdirectory";
}
while (1) { my $app_letter = "a"; # Start appendix letters at "a"
my $chap_num = 1; # Start chapter numbers at one (there is no preface)
my $directory = "chaps_apps_index";
my $ext = ".xml";
my $line = "";
my $output_name = "";
my $start_text = "";
# Terminating statement for loop. mkdir $directory unless -d $directory;
exit if not defined $line;
if ($line =~ /(?:.*)(<chapter.*)/i ) { while (defined $line) {
if ($line =~ /(<chapter.+)/i ) {
$start_text = $1; $start_text = $1;
$output_name = &make_chapter_name($chap_num); $output_name = sprintf("ch%02d%s", $chap_num, $ext);
$chap_num++; ++$chap_num;
&process_file("chapter"); &process_file("chapter");
} }
elsif ($line =~ /(?:.*)(<appendix.*)/i ) { elsif ($line =~ /(<appendix.+)/i ) {
$start_text = $1 ; $start_text = $1 ;
$output_name = &make_appendix_name($app_letter); $output_name = "app$app_letter$ext";
$app_letter++; ++$app_letter;
&process_file("appendix"); &process_file("appendix");
} }
elsif ($line =~ /(?:.*)(<index\s+id=")(.*?)(">.*)/i ) { elsif ($line =~ /(<index\s+id=")(.*?)(">.*)/i ) {
$start_text = $1 . $2 . $3; $start_text = $1 . $2 . $3;
$output_name = lc($2) . ".xml"; $output_name = lc($2) . $ext;
&process_file("index"); &process_file("index");
} }
else { else {
# Automatically skips junk in between chapters, appendices, # Skip junk in between chapters, appendices and indexes.
# and indexes. $line = <>;
$line = <INPUT_FILE>;
} }
} }
sub make_chapter_name { sub process_file {
my $num = shift; my $marker = shift;
my $name = "ch" . sprintf("%02d", $num) . ".xml"; my $path = "$directory/$output_name";
return $name;
}
sub make_appendix_name { open (OUTPUT_FILE, ">$path") or die "Cannot open $path";
my $letter = shift;
my $name = "app" . sprintf("%s", $letter) . ".xml";
return $name;
}
sub process_file { print STDERR "Creating $path\n";
my $marker=shift;
open (OUTPUT_FILE, '>' . $directory . "/" . $output_name) or # Print out XML PI
die "Cannot open $output_name"; print OUTPUT_FILE "<?xml version='1.0' encoding='ISO-8859-1'?>\n";
# Print whatever happened to appear at the end of the previous chapter. # Print whatever happened to appear at the end of the previous chapter.
print OUTPUT_FILE $start_text . "\n" if $start_text; print OUTPUT_FILE "$start_text\n" if $start_text;
while (1) {
$line = <INPUT_FILE>; while (defined $line) {
exit if not defined $line; $line = <>;
# Note: Anything after the terminating marker is lost, just like # Note: Anything after the terminating marker is lost, just like
# lines in between chapters. # lines in between chapters.
if ($line =~ /(.*<\/\s*$marker\s*>)/i ) { if ($line =~ /(.*<\/\s*$marker\s*>)/i ) {
print OUTPUT_FILE $1 . "\n" if $1; print OUTPUT_FILE "$1\n" if $1;
close OUTPUT_FILE; close OUTPUT_FILE;
return; return;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment