Commit 427a0adb authored by Rob Pike's avatar Rob Pike

further simplification of the case fold calculation.

hard to beat at this point, i think.

R=rsc
DELTA=38  (8 added, 21 deleted, 9 changed)
OCL=34092
CL=34096
parent 6fceadbb
......@@ -20,13 +20,9 @@ type Range struct {
// different case for that character. They may be negative. If zero, it
// means the character is in the corresponding case. There is a special
// case representing sequences of alternating corresponding Upper and Lower
// pairs. It appears with the usual Lo and Hi values and a Delta of
// {0, UpperLower, 0}
// The constant UpperLower has (meaningful) value 1. The lower case
// letters in such sequences are assumed; were they present they would
// have a Delta of
// {LowerUpper, 0, LowerUpper}
// where LowerUpper has value -1.
// pairs. It appears with a fixed Delta of
// {UpperLower, UpperLower, UpperLower}
// The constant UpperLower has an otherwise impossible delta value.
type CaseRange struct {
Lo int;
Hi int;
......@@ -46,9 +42,8 @@ type d [MaxCase]int32 // to make the CaseRanges text shorter
// this CaseRange represents a sequence of the form (say)
// Upper Lower Upper Lower.
const (
MaxChar = 0x10FFFF;
UpperLower = MaxChar + 2; // cannot be a valid delta
LowerUpper = MaxChar + 3;
MaxChar = 0x10FFFF; // Maximum valid Unicode character value.
UpperLower = MaxChar + 1; // (Cannot be a valid delta.)
)
// Is tests whether rune is in the specified table of ranges.
......@@ -118,22 +113,6 @@ func IsLetter(rune int) bool {
return Is(Letter, rune);
}
// In an Upper-Lower sequence, which always starts with an UpperCase letter,
// the real deltas always look like:
// 0 1 0
// -1 0 -1
// This is a single-dimensioned array addressed by the case shifted up one bit
// (the column of this table) or'ed with the low bit of the position in
// the sequence (the row of the table).
var ulDelta = [8]int{
(UpperCase<<1) | 0: 0,
(UpperCase<<1) | 1: -1,
(LowerCase<<1) | 0: 1,
(LowerCase<<1) | 1: 0,
(TitleCase<<1) | 0: 0,
(TitleCase<<1) | 1: -1,
}
// To maps the rune to the specified case, UpperCase, LowerCase, or TitleCase
func To(_case int, rune int) int {
if _case < 0 || MaxCase <= _case {
......@@ -148,9 +127,17 @@ func To(_case int, rune int) int {
if r.Lo <= rune && rune <= r.Hi {
delta := int(r.Delta[_case]);
if delta > MaxChar {
// Somewhere inside an UpperLower sequence. Use
// the precomputed delta table to get our offset.
delta = ulDelta[((_case<<1) | ((rune-r.Lo)&1))];
// In an Upper-Lower sequence, which always starts with
// an UpperCase letter, the real deltas always look like:
// {0, 1, 0} UpperCase (Lower is next)
// {-1, 0, -1} LowerCase (Upper, Title are previous)
// The characters at even offsets from the beginning of the
// sequence are upper case; the ones at odd offsets are lower.
// The correct mapping can be done by clearing or setting the low
// bit in the sequence offset.
// The constants UpperCase and TitleCase are even while LowerCase
// is odd so we take the low bit from _case.
return r.Lo + ((rune - r.Lo)&^1 | _case&1);
}
return rune + delta;
}
......
......@@ -798,7 +798,7 @@ func printCaseRange(lo, hi *caseState) {
fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
lo.point, hi.point)
case hi.point > lo.point && lo.isLowerUpper():
die.Log("LowerUpper sequence: should not happen: U+%04X\n", lo.point);
die.Log("LowerUpper sequence: should not happen: U+%04X. If it's real, need to fix To()", lo.point);
fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
lo.point, hi.point)
default:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment