Commit 42e0cc60 authored by Robert Griesemer

go/scanner: accept new Go2 number literals

This CL introduces go/scanner support for the new binary and octal integer
literals, hexadecimal floats, and digit separators for all number literals.
The new code is closely mirroring the respective code for number literals in
cmd/compile/internal/syntax/scanner.go.

R=Go1.13

Updates #12711.
Updates #19308.
Updates #28493.
Updates #29008.

Change-Id: I5315c6aaa7cfc41a618296be20e3acd5114d6b3c
Reviewed-on: https://go-review.googlesource.com/c/159997
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
parent ceb849dd
...@@ -150,6 +150,10 @@ func (s *Scanner) error(offs int, msg string) { ...@@ -150,6 +150,10 @@ func (s *Scanner) error(offs int, msg string) {
s.ErrorCount++ s.ErrorCount++
} }
// errorf builds an error message from format and args with fmt.Sprintf
// and reports it at offset offs through s.error.
func (s *Scanner) errorf(offs int, format string, args ...interface{}) {
	msg := fmt.Sprintf(format, args...)
	s.error(offs, msg)
}
func (s *Scanner) scanComment() string { func (s *Scanner) scanComment() string {
// initial '/' already consumed; s.ch == '/' || s.ch == '*' // initial '/' already consumed; s.ch == '/' || s.ch == '*'
offs := s.offset - 1 // position of initial '/' offs := s.offset - 1 // position of initial '/'
...@@ -336,11 +340,11 @@ func (s *Scanner) findLineEnd() bool { ...@@ -336,11 +340,11 @@ func (s *Scanner) findLineEnd() bool {
} }
func isLetter(ch rune) bool { func isLetter(ch rune) bool {
return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
} }
func isDigit(ch rune) bool { func isDigit(ch rune) bool {
return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
} }
func (s *Scanner) scanIdentifier() string { func (s *Scanner) scanIdentifier() string {
...@@ -355,95 +359,188 @@ func digitVal(ch rune) int { ...@@ -355,95 +359,188 @@ func digitVal(ch rune) int {
switch { switch {
case '0' <= ch && ch <= '9': case '0' <= ch && ch <= '9':
return int(ch - '0') return int(ch - '0')
case 'a' <= ch && ch <= 'f': case 'a' <= lower(ch) && lower(ch) <= 'f':
return int(ch - 'a' + 10) return int(lower(ch) - 'a' + 10)
case 'A' <= ch && ch <= 'F':
return int(ch - 'A' + 10)
} }
return 16 // larger than any legal digit val return 16 // larger than any legal digit val
} }
func (s *Scanner) scanMantissa(base int) { func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
for digitVal(s.ch) < base { func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
// isHex reports whether ch is a hexadecimal digit: '0'-'9', 'a'-'f', or 'A'-'F'.
func isHex(ch rune) bool {
	if '0' <= ch && ch <= '9' {
		return true
	}
	// lower folds 'A'-'F' onto 'a'-'f', so a single range test covers both cases.
	l := lower(ch)
	return 'a' <= l && l <= 'f'
}
// digits accepts the sequence { digit | '_' }.
// If base <= 10, digits accepts any decimal digit but records
// the offset (relative to the source start) of a digit >= base
// in *invalid, if *invalid < 0.
// Only the first such digit is recorded, because recording makes
// *invalid non-negative. invalid is dereferenced only when a decimal
// digit >= base is seen, so it may be nil when base == 10.
// If base > 10, digits accepts hexadecimal digits (see isHex) and
// never touches invalid.
// digits returns a bitset describing whether the sequence contained
// digits (bit 0 is set), or separators '_' (bit 1 is set).
func (s *Scanner) digits(base int, invalid *int) (digsep int) {
if base <= 10 {
// max is the first rune past the largest digit that is legal in this base.
max := rune('0' + base)
for isDecimal(s.ch) || s.ch == '_' {
ds := 1
if s.ch == '_' {
ds = 2
} else if s.ch >= max && *invalid < 0 {
*invalid = int(s.offset) // record invalid rune offset
}
digsep |= ds
s.next()
}
} else {
for isHex(s.ch) || s.ch == '_' {
ds := 1
if s.ch == '_' {
ds = 2
}
digsep |= ds
s.next() s.next()
} }
}
return
} }
func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { func (s *Scanner) scanNumber() (token.Token, string) {
// digitVal(s.ch) < 10
offs := s.offset offs := s.offset
tok := token.INT tok := token.ILLEGAL
if seenDecimalPoint { base := 10 // number base
offs-- prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
tok = token.FLOAT digsep := 0 // bit 0: digit present, bit 1: '_' present
s.scanMantissa(10) invalid := -1 // index of invalid digit in literal, or < 0
goto exponent
}
// integer part
if s.ch != '.' {
tok = token.INT
if s.ch == '0' { if s.ch == '0' {
// int or float
offs := s.offset
s.next() s.next()
if s.ch == 'x' || s.ch == 'X' { switch lower(s.ch) {
// hexadecimal int case 'x':
s.next() s.next()
s.scanMantissa(16) base, prefix = 16, 'x'
if s.offset-offs <= 2 { case 'o':
// only scanned "0x" or "0X" s.next()
s.error(offs, "illegal hexadecimal number") base, prefix = 8, 'o'
} case 'b':
} else { s.next()
// octal int or float base, prefix = 2, 'b'
seenDecimalDigit := false default:
s.scanMantissa(8) base, prefix = 8, '0'
if s.ch == '8' || s.ch == '9' { digsep = 1 // leading 0
// illegal octal int or float
seenDecimalDigit = true
s.scanMantissa(10)
}
if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
goto fraction
} }
// octal int
if seenDecimalDigit {
s.error(offs, "illegal octal number")
} }
digsep |= s.digits(base, &invalid)
} }
goto exit
}
// decimal int or float
s.scanMantissa(10)
fraction: // fractional part
if s.ch == '.' { if s.ch == '.' {
tok = token.FLOAT tok = token.FLOAT
if prefix == 'o' || prefix == 'b' {
s.error(s.offset, "invalid radix point in "+litname(prefix))
}
s.next() s.next()
s.scanMantissa(10) digsep |= s.digits(base, &invalid)
} }
exponent: if digsep&1 == 0 {
if s.ch == 'e' || s.ch == 'E' { s.error(s.offset, litname(prefix)+" has no digits")
tok = token.FLOAT }
// exponent
if e := lower(s.ch); e == 'e' || e == 'p' {
switch {
case e == 'e' && prefix != 0 && prefix != '0':
s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
case e == 'p' && prefix != 'x':
s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
}
s.next() s.next()
if s.ch == '-' || s.ch == '+' { tok = token.FLOAT
if s.ch == '+' || s.ch == '-' {
s.next() s.next()
} }
if digitVal(s.ch) < 10 { ds := s.digits(10, nil)
s.scanMantissa(10) digsep |= ds
} else { if ds&1 == 0 {
s.error(offs, "illegal floating-point exponent") s.error(s.offset, "exponent has no digits")
} }
} else if prefix == 'x' && tok == token.FLOAT {
s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
} }
// suffix 'i'
if s.ch == 'i' { if s.ch == 'i' {
tok = token.IMAG tok = token.IMAG
if prefix != 0 && prefix != '0' {
s.error(s.offset, "invalid suffix 'i' on "+litname(prefix))
}
s.next() s.next()
} }
exit: lit := string(s.src[offs:s.offset])
return tok, string(s.src[offs:s.offset]) if tok == token.INT && invalid >= 0 {
s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
}
if digsep&2 != 0 {
if i := invalidSep(lit); i >= 0 {
s.error(offs+i, "'_' must separate successive digits")
}
}
return tok, lit
}
// litname returns the name of the literal kind selected by prefix, for
// use in error messages: 'x' is hexadecimal, 'o' and '0' are octal,
// 'b' is binary, and every other value is decimal.
func litname(prefix rune) string {
	name := "decimal literal"
	switch prefix {
	case 'b':
		name = "binary literal"
	case '0', 'o':
		name = "octal literal"
	case 'x':
		name = "hexadecimal literal"
	}
	return name
}
// invalidSep returns the index of the first invalid separator in x, or -1.
// A '_' is valid only if it is directly preceded by a digit (a 0x, 0o,
// or 0b prefix counts as a digit) and directly followed by a digit.
// Letters a-f/A-F count as digits only if x has a 0x/0X prefix.
func invalidSep(x string) int {
x1 := ' ' // prefix char, we only care if it's 'x'
d := '.' // digit, one of '_', '0' (a digit), or '.' (anything else)
i := 0
// a prefix counts as a digit
if len(x) >= 2 && x[0] == '0' {
x1 = lower(rune(x[1]))
if x1 == 'x' || x1 == 'o' || x1 == 'b' {
d = '0'
i = 2
}
}
// mantissa and exponent
for ; i < len(x); i++ {
p := d // previous digit
d = rune(x[i])
switch {
case d == '_':
// a separator must directly follow a digit
if p != '0' {
return i
}
case isDecimal(d) || x1 == 'x' && isHex(d):
d = '0'
default:
// a non-digit right after a separator makes that separator invalid
if p == '_' {
return i - 1
}
d = '.'
}
}
// a trailing separator has no digit after it
if d == '_' {
return len(x) - 1
}
return -1
} }
// scanEscape parses an escape sequence where rune is the accepted // scanEscape parses an escape sequence where rune is the accepted
...@@ -708,9 +805,9 @@ scanAgain: ...@@ -708,9 +805,9 @@ scanAgain:
insertSemi = true insertSemi = true
tok = token.IDENT tok = token.IDENT
} }
case '0' <= ch && ch <= '9': case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
insertSemi = true insertSemi = true
tok, lit = s.scanNumber(false) tok, lit = s.scanNumber()
default: default:
s.next() // always make progress s.next() // always make progress
switch ch { switch ch {
...@@ -741,17 +838,13 @@ scanAgain: ...@@ -741,17 +838,13 @@ scanAgain:
case ':': case ':':
tok = s.switch2(token.COLON, token.DEFINE) tok = s.switch2(token.COLON, token.DEFINE)
case '.': case '.':
if '0' <= s.ch && s.ch <= '9' { // fractions starting with a '.' are handled by outer switch
insertSemi = true
tok, lit = s.scanNumber(true)
} else {
tok = token.PERIOD tok = token.PERIOD
if s.ch == '.' && s.peek() == '.' { if s.ch == '.' && s.peek() == '.' {
s.next() s.next()
s.next() // consume last '.' s.next() // consume last '.'
tok = token.ELLIPSIS tok = token.ELLIPSIS
} }
}
case ',': case ',':
tok = token.COMMA tok = token.COMMA
case ';': case ';':
...@@ -835,7 +928,7 @@ scanAgain: ...@@ -835,7 +928,7 @@ scanAgain:
default: default:
// next reports unexpected BOMs - don't repeat // next reports unexpected BOMs - don't repeat
if ch != bom { if ch != bom {
s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
} }
insertSemi = s.insertSemi // preserve insertSemi info insertSemi = s.insertSemi // preserve insertSemi info
tok = token.ILLEGAL tok = token.ILLEGAL
......
...@@ -10,6 +10,7 @@ import ( ...@@ -10,6 +10,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strings"
"testing" "testing"
) )
...@@ -802,11 +803,10 @@ var errors = []struct { ...@@ -802,11 +803,10 @@ var errors = []struct {
{"078.", token.FLOAT, 0, "078.", ""}, {"078.", token.FLOAT, 0, "078.", ""},
{"07801234567.", token.FLOAT, 0, "07801234567.", ""}, {"07801234567.", token.FLOAT, 0, "07801234567.", ""},
{"078e0", token.FLOAT, 0, "078e0", ""}, {"078e0", token.FLOAT, 0, "078e0", ""},
{"0E", token.FLOAT, 0, "0E", "illegal floating-point exponent"}, // issue 17621 {"0E", token.FLOAT, 2, "0E", "exponent has no digits"}, // issue 17621
{"078", token.INT, 0, "078", "illegal octal number"}, {"078", token.INT, 2, "078", "invalid digit '8' in octal literal"},
{"07800000009", token.INT, 0, "07800000009", "illegal octal number"}, {"07090000008", token.INT, 3, "07090000008", "invalid digit '9' in octal literal"},
{"0x", token.INT, 0, "0x", "illegal hexadecimal number"}, {"0x", token.INT, 2, "0x", "hexadecimal literal has no digits"},
{"0X", token.INT, 0, "0X", "illegal hexadecimal number"},
{"\"abc\x00def\"", token.STRING, 4, "\"abc\x00def\"", "illegal character NUL"}, {"\"abc\x00def\"", token.STRING, 4, "\"abc\x00def\"", "illegal character NUL"},
{"\"abc\x80def\"", token.STRING, 4, "\"abc\x80def\"", "illegal UTF-8 encoding"}, {"\"abc\x80def\"", token.STRING, 4, "\"abc\x80def\"", "illegal UTF-8 encoding"},
{"\ufeff\ufeff", token.ILLEGAL, 3, "\ufeff\ufeff", "illegal byte order mark"}, // only first BOM is ignored {"\ufeff\ufeff", token.ILLEGAL, 3, "\ufeff\ufeff", "illegal byte order mark"}, // only first BOM is ignored
...@@ -912,3 +912,199 @@ func BenchmarkScanFile(b *testing.B) { ...@@ -912,3 +912,199 @@ func BenchmarkScanFile(b *testing.B) {
} }
} }
} }
// TestNumbers scans each test source and checks the resulting tokens.
// For every case, tokens is the space-separated list of literals expected
// from successive Scan calls on src. The token kind (tok) and the error
// message (err) are checked for the first token only; an empty err means
// no error is expected. After the listed tokens, the scanner must report
// EOF, optionally preceded by a single SEMICOLON.
func TestNumbers(t *testing.T) {
for _, test := range []struct {
tok token.Token
src, tokens, err string
}{
// binaries
{token.INT, "0b0", "0b0", ""},
{token.INT, "0b1010", "0b1010", ""},
{token.INT, "0B1110", "0B1110", ""},
{token.INT, "0b", "0b", "binary literal has no digits"},
{token.INT, "0b0190", "0b0190", "invalid digit '9' in binary literal"},
{token.INT, "0b01a0", "0b01 a0", ""}, // only accept 0-9
// binary floats and imaginaries (invalid)
{token.FLOAT, "0b.", "0b.", "invalid radix point in binary literal"},
{token.FLOAT, "0b.1", "0b.1", "invalid radix point in binary literal"},
{token.FLOAT, "0b1.0", "0b1.0", "invalid radix point in binary literal"},
{token.FLOAT, "0b1e10", "0b1e10", "'e' exponent requires decimal mantissa"},
{token.FLOAT, "0b1P-1", "0b1P-1", "'P' exponent requires hexadecimal mantissa"},
{token.IMAG, "0b10i", "0b10i", "invalid suffix 'i' on binary literal"},
// octals
{token.INT, "0o0", "0o0", ""},
{token.INT, "0o1234", "0o1234", ""},
{token.INT, "0O1234", "0O1234", ""},
{token.INT, "0o", "0o", "octal literal has no digits"},
{token.INT, "0o8123", "0o8123", "invalid digit '8' in octal literal"},
{token.INT, "0o1293", "0o1293", "invalid digit '9' in octal literal"},
{token.INT, "0o12a3", "0o12 a3", ""}, // only accept 0-9
// octal floats and imaginaries (invalid)
{token.FLOAT, "0o.", "0o.", "invalid radix point in octal literal"},
{token.FLOAT, "0o.2", "0o.2", "invalid radix point in octal literal"},
{token.FLOAT, "0o1.2", "0o1.2", "invalid radix point in octal literal"},
{token.FLOAT, "0o1E+2", "0o1E+2", "'E' exponent requires decimal mantissa"},
{token.FLOAT, "0o1p10", "0o1p10", "'p' exponent requires hexadecimal mantissa"},
{token.IMAG, "0o10i", "0o10i", "invalid suffix 'i' on octal literal"},
// 0-octals
{token.INT, "0", "0", ""},
{token.INT, "0123", "0123", ""},
{token.INT, "08123", "08123", "invalid digit '8' in octal literal"},
{token.INT, "01293", "01293", "invalid digit '9' in octal literal"},
{token.INT, "0F.", "0 F .", ""}, // only accept 0-9
{token.INT, "0123F.", "0123 F .", ""},
{token.INT, "0123456x", "0123456 x", ""},
// decimals
{token.INT, "1", "1", ""},
{token.INT, "1234", "1234", ""},
{token.INT, "1f", "1 f", ""}, // only accept 0-9
// decimal floats
{token.FLOAT, "0.", "0.", ""},
{token.FLOAT, "123.", "123.", ""},
{token.FLOAT, "0123.", "0123.", ""},
{token.FLOAT, ".0", ".0", ""},
{token.FLOAT, ".123", ".123", ""},
{token.FLOAT, ".0123", ".0123", ""},
{token.FLOAT, "0.0", "0.0", ""},
{token.FLOAT, "123.123", "123.123", ""},
{token.FLOAT, "0123.0123", "0123.0123", ""},
{token.FLOAT, "0e0", "0e0", ""},
{token.FLOAT, "123e+0", "123e+0", ""},
{token.FLOAT, "0123E-1", "0123E-1", ""},
{token.FLOAT, "0.e+1", "0.e+1", ""},
{token.FLOAT, "123.E-10", "123.E-10", ""},
{token.FLOAT, "0123.e123", "0123.e123", ""},
{token.FLOAT, ".0e-1", ".0e-1", ""},
{token.FLOAT, ".123E+10", ".123E+10", ""},
{token.FLOAT, ".0123E123", ".0123E123", ""},
{token.FLOAT, "0.0e1", "0.0e1", ""},
{token.FLOAT, "123.123E-10", "123.123E-10", ""},
{token.FLOAT, "0123.0123e+456", "0123.0123e+456", ""},
{token.FLOAT, "0e", "0e", "exponent has no digits"},
{token.FLOAT, "0E+", "0E+", "exponent has no digits"},
{token.FLOAT, "1e+f", "1e+ f", "exponent has no digits"},
{token.FLOAT, "0p0", "0p0", "'p' exponent requires hexadecimal mantissa"},
{token.FLOAT, "1.0P-1", "1.0P-1", "'P' exponent requires hexadecimal mantissa"},
// decimal imaginaries
{token.IMAG, "0.i", "0.i", ""},
{token.IMAG, ".123i", ".123i", ""},
{token.IMAG, "123.123i", "123.123i", ""},
{token.IMAG, "123e+0i", "123e+0i", ""},
{token.IMAG, "123.E-10i", "123.E-10i", ""},
{token.IMAG, ".123E+10i", ".123E+10i", ""},
// hexadecimals
{token.INT, "0x0", "0x0", ""},
{token.INT, "0x1234", "0x1234", ""},
{token.INT, "0xcafef00d", "0xcafef00d", ""},
{token.INT, "0XCAFEF00D", "0XCAFEF00D", ""},
{token.INT, "0x", "0x", "hexadecimal literal has no digits"},
{token.INT, "0x1g", "0x1 g", ""},
// hexadecimal floats
{token.FLOAT, "0x0p0", "0x0p0", ""},
{token.FLOAT, "0x12efp-123", "0x12efp-123", ""},
{token.FLOAT, "0xABCD.p+0", "0xABCD.p+0", ""},
{token.FLOAT, "0x.0189P-0", "0x.0189P-0", ""},
{token.FLOAT, "0x1.ffffp+1023", "0x1.ffffp+1023", ""},
{token.FLOAT, "0x.", "0x.", "hexadecimal literal has no digits"},
{token.FLOAT, "0x0.", "0x0.", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x.0", "0x.0", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x1.1", "0x1.1", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x1.1e0", "0x1.1e0", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x1.2gp1a", "0x1.2 gp1a", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x0p", "0x0p", "exponent has no digits"},
{token.FLOAT, "0xeP-", "0xeP-", "exponent has no digits"},
{token.FLOAT, "0x1234PAB", "0x1234P AB", "exponent has no digits"},
{token.FLOAT, "0x1.2p1a", "0x1.2p1 a", ""},
// hexadecimal imaginaries (invalid)
{token.IMAG, "0xf00i", "0xf00i", "invalid suffix 'i' on hexadecimal literal"},
{token.IMAG, "0xf00.bap+12i", "0xf00.bap+12i", "invalid suffix 'i' on hexadecimal literal"},
// separators
{token.INT, "0b_1000_0001", "0b_1000_0001", ""},
{token.INT, "0o_600", "0o_600", ""},
{token.INT, "0_466", "0_466", ""},
{token.INT, "1_000", "1_000", ""},
{token.FLOAT, "1_000.000_1", "1_000.000_1", ""},
{token.IMAG, "10e+1_2_3i", "10e+1_2_3i", ""},
{token.INT, "0x_f00d", "0x_f00d", ""},
{token.FLOAT, "0x_f00d.0p1_2", "0x_f00d.0p1_2", ""},
{token.INT, "0b__1000", "0b__1000", "'_' must separate successive digits"},
{token.INT, "0o60___0", "0o60___0", "'_' must separate successive digits"},
{token.INT, "0466_", "0466_", "'_' must separate successive digits"},
{token.FLOAT, "1_.", "1_.", "'_' must separate successive digits"},
{token.FLOAT, "0._1", "0._1", "'_' must separate successive digits"},
{token.FLOAT, "2.7_e0", "2.7_e0", "'_' must separate successive digits"},
{token.IMAG, "10e+12_i", "10e+12_i", "'_' must separate successive digits"},
{token.INT, "0x___0", "0x___0", "'_' must separate successive digits"},
{token.FLOAT, "0x1.0_p0", "0x1.0_p0", "'_' must separate successive digits"},
} {
var s Scanner
var err string
// The error handler keeps only the first message reported since err was last reset.
s.Init(fset.AddFile("", fset.Base(), len(test.src)), []byte(test.src), func(_ token.Position, msg string) {
if err == "" {
err = msg
}
}, 0)
for i, want := range strings.Split(test.tokens, " ") {
// reset so that err reflects only the Scan call below
err = ""
_, tok, lit := s.Scan()
// compute lit for tokens where lit is not defined
switch tok {
case token.PERIOD:
lit = "."
case token.ADD:
lit = "+"
case token.SUB:
lit = "-"
}
// token kind and error are checked for the first token only
if i == 0 {
if tok != test.tok {
t.Errorf("%q: got token %s; want %s", test.src, tok, test.tok)
}
if err != test.err {
t.Errorf("%q: got error %q; want %q", test.src, err, test.err)
}
}
if lit != want {
t.Errorf("%q: got literal %q (%s); want %s", test.src, lit, tok, want)
}
}
// make sure we read all
_, tok, _ := s.Scan()
// tolerate one SEMICOLON before EOF (presumably the automatically inserted one)
if tok == token.SEMICOLON {
_, tok, _ = s.Scan()
}
if tok != token.EOF {
t.Errorf("%q: got %s; want EOF", test.src, tok)
}
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment