Commit 42e0cc60 authored by Robert Griesemer's avatar Robert Griesemer

go/scanner: accept new Go2 number literals

This CL introduces go/scanner support for the new binary and octal integer
literals, hexadecimal floats, and digit separators for all number literals.
The new code is closely mirroring the respective code for number literals in
cmd/compile/internal/syntax/scanner.go.

R=Go1.13

Updates #12711.
Updates #19308.
Updates #28493.
Updates #29008.

Change-Id: I5315c6aaa7cfc41a618296be20e3acd5114d6b3c
Reviewed-on: https://go-review.googlesource.com/c/159997Reviewed-by: default avatarIan Lance Taylor <iant@golang.org>
Reviewed-by: default avatarRuss Cox <rsc@golang.org>
parent ceb849dd
......@@ -150,6 +150,10 @@ func (s *Scanner) error(offs int, msg string) {
s.ErrorCount++
}
func (s *Scanner) errorf(offs int, format string, args ...interface{}) {
s.error(offs, fmt.Sprintf(format, args...))
}
func (s *Scanner) scanComment() string {
// initial '/' already consumed; s.ch == '/' || s.ch == '*'
offs := s.offset - 1 // position of initial '/'
......@@ -336,11 +340,11 @@ func (s *Scanner) findLineEnd() bool {
}
func isLetter(ch rune) bool {
return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
}
func isDigit(ch rune) bool {
return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
}
func (s *Scanner) scanIdentifier() string {
......@@ -355,95 +359,188 @@ func digitVal(ch rune) int {
switch {
case '0' <= ch && ch <= '9':
return int(ch - '0')
case 'a' <= ch && ch <= 'f':
return int(ch - 'a' + 10)
case 'A' <= ch && ch <= 'F':
return int(ch - 'A' + 10)
case 'a' <= lower(ch) && lower(ch) <= 'f':
return int(lower(ch) - 'a' + 10)
}
return 16 // larger than any legal digit val
}
func (s *Scanner) scanMantissa(base int) {
for digitVal(s.ch) < base {
s.next()
func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
// digits accepts the sequence { digit | '_' }.
// If base <= 10, digits accepts any decimal digit but records
// the offset (relative to the source start) of a digit >= base
// in *invalid, if *invalid < 0.
// digits returns a bitset describing whether the sequence contained
// digits (bit 0 is set), or separators '_' (bit 1 is set).
func (s *Scanner) digits(base int, invalid *int) (digsep int) {
if base <= 10 {
max := rune('0' + base)
for isDecimal(s.ch) || s.ch == '_' {
ds := 1
if s.ch == '_' {
ds = 2
} else if s.ch >= max && *invalid < 0 {
*invalid = int(s.offset) // record invalid rune offset
}
digsep |= ds
s.next()
}
} else {
for isHex(s.ch) || s.ch == '_' {
ds := 1
if s.ch == '_' {
ds = 2
}
digsep |= ds
s.next()
}
}
return
}
func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
// digitVal(s.ch) < 10
func (s *Scanner) scanNumber() (token.Token, string) {
offs := s.offset
tok := token.INT
tok := token.ILLEGAL
if seenDecimalPoint {
offs--
tok = token.FLOAT
s.scanMantissa(10)
goto exponent
}
base := 10 // number base
prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
digsep := 0 // bit 0: digit present, bit 1: '_' present
invalid := -1 // index of invalid digit in literal, or < 0
if s.ch == '0' {
// int or float
offs := s.offset
s.next()
if s.ch == 'x' || s.ch == 'X' {
// hexadecimal int
// integer part
if s.ch != '.' {
tok = token.INT
if s.ch == '0' {
s.next()
s.scanMantissa(16)
if s.offset-offs <= 2 {
// only scanned "0x" or "0X"
s.error(offs, "illegal hexadecimal number")
}
} else {
// octal int or float
seenDecimalDigit := false
s.scanMantissa(8)
if s.ch == '8' || s.ch == '9' {
// illegal octal int or float
seenDecimalDigit = true
s.scanMantissa(10)
}
if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
goto fraction
}
// octal int
if seenDecimalDigit {
s.error(offs, "illegal octal number")
switch lower(s.ch) {
case 'x':
s.next()
base, prefix = 16, 'x'
case 'o':
s.next()
base, prefix = 8, 'o'
case 'b':
s.next()
base, prefix = 2, 'b'
default:
base, prefix = 8, '0'
digsep = 1 // leading 0
}
}
goto exit
digsep |= s.digits(base, &invalid)
}
// decimal int or float
s.scanMantissa(10)
fraction:
// fractional part
if s.ch == '.' {
tok = token.FLOAT
if prefix == 'o' || prefix == 'b' {
s.error(s.offset, "invalid radix point in "+litname(prefix))
}
s.next()
s.scanMantissa(10)
digsep |= s.digits(base, &invalid)
}
exponent:
if s.ch == 'e' || s.ch == 'E' {
tok = token.FLOAT
if digsep&1 == 0 {
s.error(s.offset, litname(prefix)+" has no digits")
}
// exponent
if e := lower(s.ch); e == 'e' || e == 'p' {
switch {
case e == 'e' && prefix != 0 && prefix != '0':
s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
case e == 'p' && prefix != 'x':
s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
}
s.next()
if s.ch == '-' || s.ch == '+' {
tok = token.FLOAT
if s.ch == '+' || s.ch == '-' {
s.next()
}
if digitVal(s.ch) < 10 {
s.scanMantissa(10)
} else {
s.error(offs, "illegal floating-point exponent")
ds := s.digits(10, nil)
digsep |= ds
if ds&1 == 0 {
s.error(s.offset, "exponent has no digits")
}
} else if prefix == 'x' && tok == token.FLOAT {
s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
}
// suffix 'i'
if s.ch == 'i' {
tok = token.IMAG
if prefix != 0 && prefix != '0' {
s.error(s.offset, "invalid suffix 'i' on "+litname(prefix))
}
s.next()
}
exit:
return tok, string(s.src[offs:s.offset])
lit := string(s.src[offs:s.offset])
if tok == token.INT && invalid >= 0 {
s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
}
if digsep&2 != 0 {
if i := invalidSep(lit); i >= 0 {
s.error(offs+i, "'_' must separate successive digits")
}
}
return tok, lit
}
func litname(prefix rune) string {
switch prefix {
case 'x':
return "hexadecimal literal"
case 'o', '0':
return "octal literal"
case 'b':
return "binary literal"
}
return "decimal literal"
}
// invalidSep returns the index of the first invalid separator in x, or -1.
func invalidSep(x string) int {
x1 := ' ' // prefix char, we only care if it's 'x'
d := '.' // digit, one of '_', '0' (a digit), or '.' (anything else)
i := 0
// a prefix counts as a digit
if len(x) >= 2 && x[0] == '0' {
x1 = lower(rune(x[1]))
if x1 == 'x' || x1 == 'o' || x1 == 'b' {
d = '0'
i = 2
}
}
// mantissa and exponent
for ; i < len(x); i++ {
p := d // previous digit
d = rune(x[i])
switch {
case d == '_':
if p != '0' {
return i
}
case isDecimal(d) || x1 == 'x' && isHex(d):
d = '0'
default:
if p == '_' {
return i - 1
}
d = '.'
}
}
if d == '_' {
return len(x) - 1
}
return -1
}
// scanEscape parses an escape sequence where rune is the accepted
......@@ -708,9 +805,9 @@ scanAgain:
insertSemi = true
tok = token.IDENT
}
case '0' <= ch && ch <= '9':
case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
insertSemi = true
tok, lit = s.scanNumber(false)
tok, lit = s.scanNumber()
default:
s.next() // always make progress
switch ch {
......@@ -741,16 +838,12 @@ scanAgain:
case ':':
tok = s.switch2(token.COLON, token.DEFINE)
case '.':
if '0' <= s.ch && s.ch <= '9' {
insertSemi = true
tok, lit = s.scanNumber(true)
} else {
tok = token.PERIOD
if s.ch == '.' && s.peek() == '.' {
s.next()
s.next() // consume last '.'
tok = token.ELLIPSIS
}
// fractions starting with a '.' are handled by outer switch
tok = token.PERIOD
if s.ch == '.' && s.peek() == '.' {
s.next()
s.next() // consume last '.'
tok = token.ELLIPSIS
}
case ',':
tok = token.COMMA
......@@ -835,7 +928,7 @@ scanAgain:
default:
// next reports unexpected BOMs - don't repeat
if ch != bom {
s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
}
insertSemi = s.insertSemi // preserve insertSemi info
tok = token.ILLEGAL
......
......@@ -10,6 +10,7 @@ import (
"os"
"path/filepath"
"runtime"
"strings"
"testing"
)
......@@ -802,11 +803,10 @@ var errors = []struct {
{"078.", token.FLOAT, 0, "078.", ""},
{"07801234567.", token.FLOAT, 0, "07801234567.", ""},
{"078e0", token.FLOAT, 0, "078e0", ""},
{"0E", token.FLOAT, 0, "0E", "illegal floating-point exponent"}, // issue 17621
{"078", token.INT, 0, "078", "illegal octal number"},
{"07800000009", token.INT, 0, "07800000009", "illegal octal number"},
{"0x", token.INT, 0, "0x", "illegal hexadecimal number"},
{"0X", token.INT, 0, "0X", "illegal hexadecimal number"},
{"0E", token.FLOAT, 2, "0E", "exponent has no digits"}, // issue 17621
{"078", token.INT, 2, "078", "invalid digit '8' in octal literal"},
{"07090000008", token.INT, 3, "07090000008", "invalid digit '9' in octal literal"},
{"0x", token.INT, 2, "0x", "hexadecimal literal has no digits"},
{"\"abc\x00def\"", token.STRING, 4, "\"abc\x00def\"", "illegal character NUL"},
{"\"abc\x80def\"", token.STRING, 4, "\"abc\x80def\"", "illegal UTF-8 encoding"},
{"\ufeff\ufeff", token.ILLEGAL, 3, "\ufeff\ufeff", "illegal byte order mark"}, // only first BOM is ignored
......@@ -912,3 +912,199 @@ func BenchmarkScanFile(b *testing.B) {
}
}
}
func TestNumbers(t *testing.T) {
for _, test := range []struct {
tok token.Token
src, tokens, err string
}{
// binaries
{token.INT, "0b0", "0b0", ""},
{token.INT, "0b1010", "0b1010", ""},
{token.INT, "0B1110", "0B1110", ""},
{token.INT, "0b", "0b", "binary literal has no digits"},
{token.INT, "0b0190", "0b0190", "invalid digit '9' in binary literal"},
{token.INT, "0b01a0", "0b01 a0", ""}, // only accept 0-9
// binary floats and imaginaries (invalid)
{token.FLOAT, "0b.", "0b.", "invalid radix point in binary literal"},
{token.FLOAT, "0b.1", "0b.1", "invalid radix point in binary literal"},
{token.FLOAT, "0b1.0", "0b1.0", "invalid radix point in binary literal"},
{token.FLOAT, "0b1e10", "0b1e10", "'e' exponent requires decimal mantissa"},
{token.FLOAT, "0b1P-1", "0b1P-1", "'P' exponent requires hexadecimal mantissa"},
{token.IMAG, "0b10i", "0b10i", "invalid suffix 'i' on binary literal"},
// octals
{token.INT, "0o0", "0o0", ""},
{token.INT, "0o1234", "0o1234", ""},
{token.INT, "0O1234", "0O1234", ""},
{token.INT, "0o", "0o", "octal literal has no digits"},
{token.INT, "0o8123", "0o8123", "invalid digit '8' in octal literal"},
{token.INT, "0o1293", "0o1293", "invalid digit '9' in octal literal"},
{token.INT, "0o12a3", "0o12 a3", ""}, // only accept 0-9
// octal floats and imaginaries (invalid)
{token.FLOAT, "0o.", "0o.", "invalid radix point in octal literal"},
{token.FLOAT, "0o.2", "0o.2", "invalid radix point in octal literal"},
{token.FLOAT, "0o1.2", "0o1.2", "invalid radix point in octal literal"},
{token.FLOAT, "0o1E+2", "0o1E+2", "'E' exponent requires decimal mantissa"},
{token.FLOAT, "0o1p10", "0o1p10", "'p' exponent requires hexadecimal mantissa"},
{token.IMAG, "0o10i", "0o10i", "invalid suffix 'i' on octal literal"},
// 0-octals
{token.INT, "0", "0", ""},
{token.INT, "0123", "0123", ""},
{token.INT, "08123", "08123", "invalid digit '8' in octal literal"},
{token.INT, "01293", "01293", "invalid digit '9' in octal literal"},
{token.INT, "0F.", "0 F .", ""}, // only accept 0-9
{token.INT, "0123F.", "0123 F .", ""},
{token.INT, "0123456x", "0123456 x", ""},
// decimals
{token.INT, "1", "1", ""},
{token.INT, "1234", "1234", ""},
{token.INT, "1f", "1 f", ""}, // only accept 0-9
// decimal floats
{token.FLOAT, "0.", "0.", ""},
{token.FLOAT, "123.", "123.", ""},
{token.FLOAT, "0123.", "0123.", ""},
{token.FLOAT, ".0", ".0", ""},
{token.FLOAT, ".123", ".123", ""},
{token.FLOAT, ".0123", ".0123", ""},
{token.FLOAT, "0.0", "0.0", ""},
{token.FLOAT, "123.123", "123.123", ""},
{token.FLOAT, "0123.0123", "0123.0123", ""},
{token.FLOAT, "0e0", "0e0", ""},
{token.FLOAT, "123e+0", "123e+0", ""},
{token.FLOAT, "0123E-1", "0123E-1", ""},
{token.FLOAT, "0.e+1", "0.e+1", ""},
{token.FLOAT, "123.E-10", "123.E-10", ""},
{token.FLOAT, "0123.e123", "0123.e123", ""},
{token.FLOAT, ".0e-1", ".0e-1", ""},
{token.FLOAT, ".123E+10", ".123E+10", ""},
{token.FLOAT, ".0123E123", ".0123E123", ""},
{token.FLOAT, "0.0e1", "0.0e1", ""},
{token.FLOAT, "123.123E-10", "123.123E-10", ""},
{token.FLOAT, "0123.0123e+456", "0123.0123e+456", ""},
{token.FLOAT, "0e", "0e", "exponent has no digits"},
{token.FLOAT, "0E+", "0E+", "exponent has no digits"},
{token.FLOAT, "1e+f", "1e+ f", "exponent has no digits"},
{token.FLOAT, "0p0", "0p0", "'p' exponent requires hexadecimal mantissa"},
{token.FLOAT, "1.0P-1", "1.0P-1", "'P' exponent requires hexadecimal mantissa"},
// decimal imaginaries
{token.IMAG, "0.i", "0.i", ""},
{token.IMAG, ".123i", ".123i", ""},
{token.IMAG, "123.123i", "123.123i", ""},
{token.IMAG, "123e+0i", "123e+0i", ""},
{token.IMAG, "123.E-10i", "123.E-10i", ""},
{token.IMAG, ".123E+10i", ".123E+10i", ""},
// hexadecimals
{token.INT, "0x0", "0x0", ""},
{token.INT, "0x1234", "0x1234", ""},
{token.INT, "0xcafef00d", "0xcafef00d", ""},
{token.INT, "0XCAFEF00D", "0XCAFEF00D", ""},
{token.INT, "0x", "0x", "hexadecimal literal has no digits"},
{token.INT, "0x1g", "0x1 g", ""},
// hexadecimal floats
{token.FLOAT, "0x0p0", "0x0p0", ""},
{token.FLOAT, "0x12efp-123", "0x12efp-123", ""},
{token.FLOAT, "0xABCD.p+0", "0xABCD.p+0", ""},
{token.FLOAT, "0x.0189P-0", "0x.0189P-0", ""},
{token.FLOAT, "0x1.ffffp+1023", "0x1.ffffp+1023", ""},
{token.FLOAT, "0x.", "0x.", "hexadecimal literal has no digits"},
{token.FLOAT, "0x0.", "0x0.", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x.0", "0x.0", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x1.1", "0x1.1", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x1.1e0", "0x1.1e0", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x1.2gp1a", "0x1.2 gp1a", "hexadecimal mantissa requires a 'p' exponent"},
{token.FLOAT, "0x0p", "0x0p", "exponent has no digits"},
{token.FLOAT, "0xeP-", "0xeP-", "exponent has no digits"},
{token.FLOAT, "0x1234PAB", "0x1234P AB", "exponent has no digits"},
{token.FLOAT, "0x1.2p1a", "0x1.2p1 a", ""},
// hexadecimal imaginaries (invalid)
{token.IMAG, "0xf00i", "0xf00i", "invalid suffix 'i' on hexadecimal literal"},
{token.IMAG, "0xf00.bap+12i", "0xf00.bap+12i", "invalid suffix 'i' on hexadecimal literal"},
// separators
{token.INT, "0b_1000_0001", "0b_1000_0001", ""},
{token.INT, "0o_600", "0o_600", ""},
{token.INT, "0_466", "0_466", ""},
{token.INT, "1_000", "1_000", ""},
{token.FLOAT, "1_000.000_1", "1_000.000_1", ""},
{token.IMAG, "10e+1_2_3i", "10e+1_2_3i", ""},
{token.INT, "0x_f00d", "0x_f00d", ""},
{token.FLOAT, "0x_f00d.0p1_2", "0x_f00d.0p1_2", ""},
{token.INT, "0b__1000", "0b__1000", "'_' must separate successive digits"},
{token.INT, "0o60___0", "0o60___0", "'_' must separate successive digits"},
{token.INT, "0466_", "0466_", "'_' must separate successive digits"},
{token.FLOAT, "1_.", "1_.", "'_' must separate successive digits"},
{token.FLOAT, "0._1", "0._1", "'_' must separate successive digits"},
{token.FLOAT, "2.7_e0", "2.7_e0", "'_' must separate successive digits"},
{token.IMAG, "10e+12_i", "10e+12_i", "'_' must separate successive digits"},
{token.INT, "0x___0", "0x___0", "'_' must separate successive digits"},
{token.FLOAT, "0x1.0_p0", "0x1.0_p0", "'_' must separate successive digits"},
} {
var s Scanner
var err string
s.Init(fset.AddFile("", fset.Base(), len(test.src)), []byte(test.src), func(_ token.Position, msg string) {
if err == "" {
err = msg
}
}, 0)
for i, want := range strings.Split(test.tokens, " ") {
err = ""
_, tok, lit := s.Scan()
// compute lit where for tokens where lit is not defined
switch tok {
case token.PERIOD:
lit = "."
case token.ADD:
lit = "+"
case token.SUB:
lit = "-"
}
if i == 0 {
if tok != test.tok {
t.Errorf("%q: got token %s; want %s", test.src, tok, test.tok)
}
if err != test.err {
t.Errorf("%q: got error %q; want %q", test.src, err, test.err)
}
}
if lit != want {
t.Errorf("%q: got literal %q (%s); want %s", test.src, lit, tok, want)
}
}
// make sure we read all
_, tok, _ := s.Scan()
if tok == token.SEMICOLON {
_, tok, _ = s.Scan()
}
if tok != token.EOF {
t.Errorf("%q: got %s; want EOF", test.src, tok)
}
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment