Commit 3e52dadf authored by Russ Cox

regexp: use rune

Public API of syntax tree changes.

R=golang-dev, r, gri
CC=golang-dev
https://golang.org/cl/5302046
parent 81b01481
......@@ -90,15 +90,15 @@ func (m *machine) match(i input, pos int) bool {
m.matchcap[i] = -1
}
runq, nextq := &m.q0, &m.q1
rune, rune1 := endOfText, endOfText
r, r1 := endOfText, endOfText
width, width1 := 0, 0
rune, width = i.step(pos)
if rune != endOfText {
rune1, width1 = i.step(pos + width)
r, width = i.step(pos)
if r != endOfText {
r1, width1 = i.step(pos + width)
}
var flag syntax.EmptyOp
if pos == 0 {
flag = syntax.EmptyOpContext(-1, rune)
flag = syntax.EmptyOpContext(-1, r)
} else {
flag = i.context(pos)
}
......@@ -112,15 +112,15 @@ func (m *machine) match(i input, pos int) bool {
// Have match; finished exploring alternatives.
break
}
if len(m.re.prefix) > 0 && rune1 != m.re.prefixRune && i.canCheckPrefix() {
if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
// Match requires literal prefix; fast search for it.
advance := i.index(m.re, pos)
if advance < 0 {
break
}
pos += advance
rune, width = i.step(pos)
rune1, width1 = i.step(pos + width)
r, width = i.step(pos)
r1, width1 = i.step(pos + width)
}
}
if !m.matched {
......@@ -129,8 +129,8 @@ func (m *machine) match(i input, pos int) bool {
}
m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil)
}
flag = syntax.EmptyOpContext(rune, rune1)
m.step(runq, nextq, pos, pos+width, rune, flag)
flag = syntax.EmptyOpContext(r, r1)
m.step(runq, nextq, pos, pos+width, r, flag)
if width == 0 {
break
}
......@@ -140,9 +140,9 @@ func (m *machine) match(i input, pos int) bool {
break
}
pos += width
rune, width = rune1, width1
if rune != endOfText {
rune1, width1 = i.step(pos + width)
r, width = r1, width1
if r != endOfText {
r1, width1 = i.step(pos + width)
}
runq, nextq = nextq, runq
}
......@@ -166,7 +166,7 @@ func (m *machine) clear(q *queue) {
// The step processes the rune c (which may be endOfText),
// which starts at position pos and ends at nextPos.
// nextCond gives the setting for the empty-width flags after c.
func (m *machine) step(runq, nextq *queue, pos, nextPos, c int, nextCond syntax.EmptyOp) {
func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) {
longest := m.re.longest
for j := 0; j < len(runq.dense); j++ {
d := &runq.dense[j]
......
......@@ -83,7 +83,7 @@ type Regexp struct {
prefix string // required prefix in unanchored matches
prefixBytes []byte // prefix, as a []byte
prefixComplete bool // prefix is the entire regexp
prefixRune int // first rune in prefix
prefixRune rune // first rune in prefix
cond syntax.EmptyOp // empty-width conditions required at start of match
numSubexp int
longest bool
......@@ -224,13 +224,13 @@ func (re *Regexp) NumSubexp() int {
return re.numSubexp
}
const endOfText = -1
const endOfText rune = -1
// input abstracts different representations of the input text. It provides
// one-character lookahead.
type input interface {
step(pos int) (rune int, width int) // advance one rune
canCheckPrefix() bool // can we look ahead without losing info?
step(pos int) (r rune, width int) // advance one rune
canCheckPrefix() bool // can we look ahead without losing info?
hasPrefix(re *Regexp) bool
index(re *Regexp, pos int) int
context(pos int) syntax.EmptyOp
......@@ -245,11 +245,11 @@ func newInputString(str string) *inputString {
return &inputString{str: str}
}
func (i *inputString) step(pos int) (int, int) {
func (i *inputString) step(pos int) (rune, int) {
if pos < len(i.str) {
c := i.str[pos]
if c < utf8.RuneSelf {
return int(c), 1
return rune(c), 1
}
return utf8.DecodeRuneInString(i.str[pos:])
}
......@@ -269,7 +269,7 @@ func (i *inputString) index(re *Regexp, pos int) int {
}
func (i *inputString) context(pos int) syntax.EmptyOp {
r1, r2 := -1, -1
r1, r2 := endOfText, endOfText
if pos > 0 && pos <= len(i.str) {
r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
}
......@@ -288,11 +288,11 @@ func newInputBytes(str []byte) *inputBytes {
return &inputBytes{str: str}
}
func (i *inputBytes) step(pos int) (int, int) {
func (i *inputBytes) step(pos int) (rune, int) {
if pos < len(i.str) {
c := i.str[pos]
if c < utf8.RuneSelf {
return int(c), 1
return rune(c), 1
}
return utf8.DecodeRune(i.str[pos:])
}
......@@ -312,7 +312,7 @@ func (i *inputBytes) index(re *Regexp, pos int) int {
}
func (i *inputBytes) context(pos int) syntax.EmptyOp {
r1, r2 := -1, -1
r1, r2 := endOfText, endOfText
if pos > 0 && pos <= len(i.str) {
r1, _ = utf8.DecodeLastRune(i.str[:pos])
}
......@@ -333,7 +333,7 @@ func newInputReader(r io.RuneReader) *inputReader {
return &inputReader{r: r}
}
func (i *inputReader) step(pos int) (int, int) {
func (i *inputReader) step(pos int) (rune, int) {
if !i.atEOT && pos != i.pos {
return endOfText, 0
......
......@@ -91,8 +91,8 @@ func (c *compiler) init() {
c.inst(InstFail)
}
var anyRuneNotNL = []int{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
var anyRune = []int{0, unicode.MaxRune}
var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
var anyRune = []rune{0, unicode.MaxRune}
func (c *compiler) compile(re *Regexp) frag {
switch re.Op {
......@@ -262,12 +262,12 @@ func (c *compiler) empty(op EmptyOp) frag {
return f
}
func (c *compiler) rune(rune []int, flags Flags) frag {
func (c *compiler) rune(r []rune, flags Flags) frag {
f := c.inst(InstRune)
i := &c.p.Inst[f.i]
i.Rune = rune
i.Rune = r
flags &= FoldCase // only relevant flag is FoldCase
if len(rune) != 1 || unicode.SimpleFold(rune[0]) == rune[0] {
if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
// and sometimes not even that
flags &^= FoldCase
}
......@@ -276,11 +276,11 @@ func (c *compiler) rune(rune []int, flags Flags) frag {
// Special cases for exec machine.
switch {
case flags&FoldCase == 0 && (len(rune) == 1 || len(rune) == 2 && rune[0] == rune[1]):
case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
i.Op = InstRune1
case len(rune) == 2 && rune[0] == 0 && rune[1] == unicode.MaxRune:
case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
i.Op = InstRuneAny
case len(rune) == 4 && rune[0] == 0 && rune[1] == '\n'-1 && rune[2] == '\n'+1 && rune[3] == unicode.MaxRune:
case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
i.Op = InstRuneAnyNotNL
}
......
......@@ -57,7 +57,7 @@ sub ComputeClass($) {
sub PrintClass($$@) {
my ($cname, $name, @ranges) = @_;
print "var code$cname = []int{ /* $name */\n";
print "var code$cname = []rune{ /* $name */\n";
for (my $i=0; $i<@ranges; $i++) {
my @a = @{$ranges[$i]};
printf "\t0x%x, 0x%x,\n", $a[0], $a[1];
......
This diff is collapsed.
......@@ -371,10 +371,10 @@ func dumpRegexp(b *bytes.Buffer, re *Regexp) {
b.WriteByte('}')
}
func mkCharClass(f func(int) bool) string {
func mkCharClass(f func(rune) bool) string {
re := &Regexp{Op: OpCharClass}
lo := -1
for i := 0; i <= unicode.MaxRune; i++ {
lo := rune(-1)
for i := rune(0); i <= unicode.MaxRune; i++ {
if f(i) {
if lo < 0 {
lo = i
......@@ -392,12 +392,12 @@ func mkCharClass(f func(int) bool) string {
return dump(re)
}
func isUpperFold(rune int) bool {
if unicode.IsUpper(rune) {
func isUpperFold(r rune) bool {
if unicode.IsUpper(r) {
return true
}
c := unicode.SimpleFold(rune)
for c != rune {
c := unicode.SimpleFold(r)
for c != r {
if unicode.IsUpper(c) {
return true
}
......@@ -407,8 +407,8 @@ func isUpperFold(rune int) bool {
}
func TestFoldConstants(t *testing.T) {
last := -1
for i := 0; i <= unicode.MaxRune; i++ {
last := rune(-1)
for i := rune(0); i <= unicode.MaxRune; i++ {
if unicode.SimpleFold(i) == i {
continue
}
......@@ -427,8 +427,8 @@ func TestAppendRangeCollapse(t *testing.T) {
// into the earlier ones (it looks back two ranges), so that
// the slice never grows very large.
// Note that we are not calling cleanClass.
var r []int
for i := 'A'; i <= 'Z'; i++ {
var r []rune
for i := rune('A'); i <= 'Z'; i++ {
r = appendRange(r, i, i)
r = appendRange(r, i+'a'-'A', i+'a'-'A')
}
......
......@@ -3,17 +3,17 @@
package syntax
var code1 = []int{ /* \d */
var code1 = []rune{ /* \d */
0x30, 0x39,
}
var code2 = []int{ /* \s */
var code2 = []rune{ /* \s */
0x9, 0xa,
0xc, 0xd,
0x20, 0x20,
}
var code3 = []int{ /* \w */
var code3 = []rune{ /* \w */
0x30, 0x39,
0x41, 0x5a,
0x5f, 0x5f,
......@@ -28,71 +28,71 @@ var perlGroup = map[string]charGroup{
`\w`: {+1, code3},
`\W`: {-1, code3},
}
var code4 = []int{ /* [:alnum:] */
var code4 = []rune{ /* [:alnum:] */
0x30, 0x39,
0x41, 0x5a,
0x61, 0x7a,
}
var code5 = []int{ /* [:alpha:] */
var code5 = []rune{ /* [:alpha:] */
0x41, 0x5a,
0x61, 0x7a,
}
var code6 = []int{ /* [:ascii:] */
var code6 = []rune{ /* [:ascii:] */
0x0, 0x7f,
}
var code7 = []int{ /* [:blank:] */
var code7 = []rune{ /* [:blank:] */
0x9, 0x9,
0x20, 0x20,
}
var code8 = []int{ /* [:cntrl:] */
var code8 = []rune{ /* [:cntrl:] */
0x0, 0x1f,
0x7f, 0x7f,
}
var code9 = []int{ /* [:digit:] */
var code9 = []rune{ /* [:digit:] */
0x30, 0x39,
}
var code10 = []int{ /* [:graph:] */
var code10 = []rune{ /* [:graph:] */
0x21, 0x7e,
}
var code11 = []int{ /* [:lower:] */
var code11 = []rune{ /* [:lower:] */
0x61, 0x7a,
}
var code12 = []int{ /* [:print:] */
var code12 = []rune{ /* [:print:] */
0x20, 0x7e,
}
var code13 = []int{ /* [:punct:] */
var code13 = []rune{ /* [:punct:] */
0x21, 0x2f,
0x3a, 0x40,
0x5b, 0x60,
0x7b, 0x7e,
}
var code14 = []int{ /* [:space:] */
var code14 = []rune{ /* [:space:] */
0x9, 0xd,
0x20, 0x20,
}
var code15 = []int{ /* [:upper:] */
var code15 = []rune{ /* [:upper:] */
0x41, 0x5a,
}
var code16 = []int{ /* [:word:] */
var code16 = []rune{ /* [:word:] */
0x30, 0x39,
0x41, 0x5a,
0x5f, 0x5f,
0x61, 0x7a,
}
var code17 = []int{ /* [:xdigit:] */
var code17 = []rune{ /* [:xdigit:] */
0x30, 0x39,
0x41, 0x46,
0x61, 0x66,
......
......@@ -51,7 +51,7 @@ const (
// at the beginning of the text.
// Passing r2 == -1 indicates that the position is
// at the end of the text.
func EmptyOpContext(r1, r2 int) EmptyOp {
func EmptyOpContext(r1, r2 rune) EmptyOp {
var op EmptyOp
if r1 < 0 {
op |= EmptyBeginText | EmptyBeginLine
......@@ -76,7 +76,7 @@ func EmptyOpContext(r1, r2 int) EmptyOp {
// IsWordChar reports whether r is considered a ``word character''
// during the evaluation of the \b and \B zero-width assertions.
// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
func IsWordChar(r int) bool {
func IsWordChar(r rune) bool {
return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
}
......@@ -85,7 +85,7 @@ type Inst struct {
Op InstOp
Out uint32 // all but InstMatch, InstFail
Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
Rune []int
Rune []rune
}
func (p *Prog) String() string {
......@@ -161,7 +161,7 @@ Loop:
// MatchRune returns true if the instruction matches (and consumes) r.
// It should only be called when i.Op == InstRune.
func (i *Inst) MatchRune(r int) bool {
func (i *Inst) MatchRune(r rune) bool {
rune := i.Rune
// Special case: single-rune slice is from literal string, not char class.
......@@ -210,17 +210,17 @@ func (i *Inst) MatchRune(r int) bool {
// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
// Since we act on runes, it would be easy to support Unicode here.
func wordRune(rune int) bool {
return rune == '_' ||
('A' <= rune && rune <= 'Z') ||
('a' <= rune && rune <= 'z') ||
('0' <= rune && rune <= '9')
func wordRune(r rune) bool {
return r == '_' ||
('A' <= r && r <= 'Z') ||
('a' <= r && r <= 'z') ||
('0' <= r && r <= '9')
}
// MatchEmptyWidth returns true if the instruction matches
// an empty string between the runes before and after.
// It should only be called when i.Op == InstEmptyWidth.
func (i *Inst) MatchEmptyWidth(before int, after int) bool {
func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
switch EmptyOp(i.Arg) {
case EmptyBeginLine:
return before == '\n' || before == -1
......
......@@ -22,8 +22,8 @@ type Regexp struct {
Flags Flags
Sub []*Regexp // subexpressions, if any
Sub0 [1]*Regexp // storage for short Sub
Rune []int // matched runes, for OpLiteral, OpCharClass
Rune0 [2]int // storage for short Rune
Rune []rune // matched runes, for OpLiteral, OpCharClass
Rune0 [2]rune // storage for short Rune
Min, Max int // min, max for OpRepeat
Cap int // capturing index, for OpCapture
Name string // capturing name, for OpCapture
......@@ -252,7 +252,7 @@ func (re *Regexp) String() string {
const meta = `\.+*?()|[]{}^$`
func escape(b *bytes.Buffer, r int, force bool) {
func escape(b *bytes.Buffer, r rune, force bool) {
if unicode.IsPrint(r) {
if strings.IndexRune(meta, r) >= 0 || force {
b.WriteRune('\\')
......@@ -277,7 +277,7 @@ func escape(b *bytes.Buffer, r int, force bool) {
default:
if r < 0x100 {
b.WriteString(`\x`)
s := strconv.Itob(r, 16)
s := strconv.Itob(int(r), 16)
if len(s) == 1 {
b.WriteRune('0')
}
......@@ -285,7 +285,7 @@ func escape(b *bytes.Buffer, r int, force bool) {
break
}
b.WriteString(`\x{`)
b.WriteString(strconv.Itob(r, 16))
b.WriteString(strconv.Itob(int(r), 16))
b.WriteString(`}`)
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment