text/scanner: accept new Go2 number literals

This CL introduces text/scanner support for the new binary and octal integer literals, hexadecimal floats, and digit separators for all number literals. The new code closely mirrors the respective code for number literals in cmd/compile/internal/syntax/scanner.go.

Uniformly use the term "invalid" rather than "illegal" in error messages to match the respective error messages in the other scanners directly.

R=Go1.13

Updates #12711.
Updates #19308.
Updates #28493.
Updates #29008.

Change-Id: I2f291de13ba5afc0e530cd8326e6bf4c3858ebac
Reviewed-on: https://go-review.googlesource.com/c/161199
Run-TryBot: Robert Griesemer <gri@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>

text/scanner: accept new Go2 number literals
This CL introduces text/scanner support for the new binary and octal integer literals, hexadecimal floats, and digit separators for all number literals. The new code closely mirrors the respective code for number literals in cmd/compile/internal/syntax/scanner.go.

Uniformly use the term "invalid" rather than "illegal" in error messages to match the respective error messages in the other scanners directly.

R=Go1.13

Updates #12711.
Updates #19308.
Updates #28493.
Updates #29008.

Change-Id: I2f291de13ba5afc0e530cd8326e6bf4c3858ebac
Reviewed-on: https://go-review.googlesource.com/c/161199
Run-TryBot: Robert Griesemer <gri@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
710417bc · Robert Griesemer · 33ac8544 · 710417bc · 710417bc
Commit 710417bc authored Feb 05, 2019 by Robert Griesemer
Expand all Show whitespace changes
Inline Side-by-side

Showing with 371 additions and 109 deletions

src/text/scanner/scanner.go src/text/scanner/scanner.go +174 -78

src/text/scanner/scanner_test.go src/text/scanner/scanner_test.go +197 -31

No files found.
--- a/src/text/scanner/scanner.go
+++ b/src/text/scanner/scanner.go
@@ -266,7 +266,7 @@ func (s *Scanner) next() rune {
 				s.srcPos += width
 				s.lastCharLen = width
 				s.column++
-				s.error("illegal UTF-8 encoding")
+				s.error("invalid UTF-8 encoding")
 				return ch
 			}
 		}
@@ -281,7 +281,7 @@ func (s *Scanner) next() rune {
 	switch ch {
 	case 0:
 		// for compatibility with other tools
-		s.error("illegal character NUL")
+		s.error("invalid character NUL")
 	case '\n':
 		s.line++
 		s.lastLineLen = s.column
@@ -335,6 +335,10 @@ func (s *Scanner) error(msg string) {
 	fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
 }
+func (s *Scanner) errorf(format string, args ...interface{}) {
+	s.error(fmt.Sprintf(format, args...))
+}
 func (s *Scanner) isIdentRune(ch rune, i int) bool {
 	if s.IsIdentRune != nil {
 		return s.IsIdentRune(ch, i)
@@ -351,95 +355,189 @@ func (s *Scanner) scanIdentifier() rune {
 	return ch
 }
-func digitVal(ch rune) int {
+func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
-	switch {
+func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
-	case '0' <= ch && ch <= '9':
+func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
-		return int(ch - '0')
-	case 'a' <= ch && ch <= 'f':
+// digits accepts the sequence { digit | '_' } starting with ch0.
-		return int(ch - 'a' + 10)
+// If base <= 10, digits accepts any decimal digit but records
-	case 'A' <= ch && ch <= 'F':
+// the first invalid digit >= base in *invalid if *invalid == 0.
-		return int(ch - 'A' + 10)
+// digits returns the first rune that is not part of the sequence
+// anymore, and a bitset describing whether the sequence contained
+// digits (bit 0 is set), or separators '_' (bit 1 is set).
+func (s *Scanner) digits(ch0 rune, base int, invalid *rune) (ch rune, digsep int) {
+	ch = ch0
+	if base <= 10 {
+		max := rune('0' + base)
+		for isDecimal(ch) || ch == '_' {
+			ds := 1
+			if ch == '_' {
+				ds = 2
+			} else if ch >= max && *invalid == 0 {
+				*invalid = ch
+			}
+			digsep |= ds
+			ch = s.next()
 		}
-	return 16 // larger than any legal digit val
+	} else {
+		for isHex(ch) || ch == '_' {
+			ds := 1
+			if ch == '_' {
+				ds = 2
+			}
+			digsep |= ds
+			ch = s.next()
+		}
+	}
+	return
 }
-func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
+func (s *Scanner) scanNumber(ch rune, integerPart bool) (rune, rune) {
+	base := 10         // number base
-func (s *Scanner) scanMantissa(ch rune) rune {
+	prefix := rune(0)  // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
-	for isDecimal(ch) {
+	digsep := 0        // bit 0: digit present, bit 1: '_' present
+	invalid := rune(0) // invalid digit in literal, or 0
+	// integer part
+	var tok rune
+	var ds int
+	if integerPart {
+		tok = Int
+		if ch == '0' {
 			ch = s.next()
+			switch lower(ch) {
+			case 'x':
+				ch = s.next()
+				base, prefix = 16, 'x'
+			case 'o':
+				ch = s.next()
+				base, prefix = 8, 'o'
+			case 'b':
+				ch = s.next()
+				base, prefix = 2, 'b'
+			default:
+				base, prefix = 8, '0'
+				digsep = 1 // leading 0
+			}
+		}
+		ch, ds = s.digits(ch, base, &invalid)
+		digsep |= ds
 	}
-	return ch
-}
-func (s *Scanner) scanFraction(ch rune) rune {
+	// fractional part
+	if !integerPart || ch == '.' {
+		tok = Float
+		if prefix == 'o' || prefix == 'b' {
+			s.error("invalid radix point in " + litname(prefix))
+		}
 		if ch == '.' {
-		ch = s.scanMantissa(s.next())
+			ch = s.next()
+		}
+		ch, ds = s.digits(ch, base, &invalid)
+		digsep |= ds
+	}
+	if digsep&1 == 0 {
+		s.error(litname(prefix) + " has no digits")
 	}
-	return ch
-}
-func (s *Scanner) scanExponent(ch rune) rune {
+	// exponent
-	if ch == 'e' || ch == 'E' {
+	if e := lower(ch); e == 'e' || e == 'p' {
+		switch {
+		case e == 'e' && prefix != 0 && prefix != '0':
+			s.errorf("%q exponent requires decimal mantissa", ch)
+		case e == 'p' && prefix != 'x':
+			s.errorf("%q exponent requires hexadecimal mantissa", ch)
+		}
 		ch = s.next()
-		if ch == '-' || ch == '+' {
+		tok = Float
+		if ch == '+' || ch == '-' {
 			ch = s.next()
 		}
-		if !isDecimal(ch) {
+		ch, ds = s.digits(ch, 10, nil)
-			s.error("illegal exponent")
+		digsep |= ds
+		if ds&1 == 0 {
+			s.error("exponent has no digits")
 		}
-		ch = s.scanMantissa(ch)
+	} else if prefix == 'x' && tok == Float {
+		s.error("hexadecimal mantissa requires a 'p' exponent")
 	}
-	return ch
+	if tok == Int && invalid != 0 {
+		s.errorf("invalid digit %q in %s", invalid, litname(prefix))
+	}
+	if digsep&2 != 0 {
+		s.tokEnd = s.srcPos - s.lastCharLen // make sure token text is terminated
+		if i := invalidSep(s.TokenText()); i >= 0 {
+			s.error("'_' must separate successive digits")
+		}
+	}
+	return tok, ch
 }
-func (s *Scanner) scanNumber(ch rune) (rune, rune) {
+func litname(prefix rune) string {
-	// isDecimal(ch)
+	switch prefix {
-	if ch == '0' {
+	default:
-		// int or float
+		return "decimal literal"
-		ch = s.next()
+	case 'x':
-		if ch == 'x' || ch == 'X' {
+		return "hexadecimal literal"
-			// hexadecimal int
+	case 'o', '0':
-			ch = s.next()
+		return "octal literal"
-			hasMantissa := false
+	case 'b':
-			for digitVal(ch) < 16 {
+		return "binary literal"
-				ch = s.next()
-				hasMantissa = true
 	}
-			if !hasMantissa {
+}
-				s.error("illegal hexadecimal number")
+// invalidSep returns the index of the first invalid separator in x, or -1.
+func invalidSep(x string) int {
+	x1 := ' ' // prefix char, we only care if it's 'x'
+	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
+	i := 0
+	// a prefix counts as a digit
+	if len(x) >= 2 && x[0] == '0' {
+		x1 = lower(rune(x[1]))
+		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
+			d = '0'
+			i = 2
 		}
-		} else {
-			// octal int or float
-			has8or9 := false
-			for isDecimal(ch) {
-				if ch > '7' {
-					has8or9 = true
 	}
-				ch = s.next()
+	// mantissa and exponent
+	for ; i < len(x); i++ {
+		p := d // previous digit
+		d = rune(x[i])
+		switch {
+		case d == '_':
+			if p != '0' {
+				return i
 			}
-			if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
+		case isDecimal(d) || x1 == 'x' && isHex(d):
-				// float
+			d = '0'
-				ch = s.scanFraction(ch)
+		default:
-				ch = s.scanExponent(ch)
+			if p == '_' {
-				return Float, ch
+				return i - 1
 			}
-			// octal int
+			d = '.'
-			if has8or9 {
-				s.error("illegal octal number")
 		}
 	}
-		return Int, ch
+	if d == '_' {
+		return len(x) - 1
 	}
-	// decimal int or float
-	ch = s.scanMantissa(ch)
+	return -1
-	if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
+}
-		// float
-		ch = s.scanFraction(ch)
+func digitVal(ch rune) int {
-		ch = s.scanExponent(ch)
+	switch {
-		return Float, ch
+	case '0' <= ch && ch <= '9':
+		return int(ch - '0')
+	case 'a' <= lower(ch) && lower(ch) <= 'f':
+		return int(lower(ch) - 'a' + 10)
 	}
-	return Int, ch
+	return 16 // larger than any legal digit val
 }
 func (s *Scanner) scanDigits(ch rune, base, n int) rune {
@@ -448,7 +546,7 @@ func (s *Scanner) scanDigits(ch rune, base, n int) rune {
 		n--
 	}
 	if n > 0 {
-		s.error("illegal char escape")
+		s.error("invalid char escape")
 	}
 	return ch
 }
@@ -468,7 +566,7 @@ func (s *Scanner) scanEscape(quote rune) rune {
 	case 'U':
 		ch = s.scanDigits(s.next(), 16, 8)
 	default:
-		s.error("illegal char escape")
+		s.error("invalid char escape")
 	}
 	return ch
 }
@@ -503,7 +601,7 @@ func (s *Scanner) scanRawString() {
 func (s *Scanner) scanChar() {
 	if s.scanString('\'') != 1 {
-		s.error("illegal char literal")
+		s.error("invalid char literal")
 	}
 }
@@ -584,7 +682,7 @@ redo:
 		}
 	case isDecimal(ch):
 		if s.Mode&(ScanInts|ScanFloats) != 0 {
-			tok, ch = s.scanNumber(ch)
+			tok, ch = s.scanNumber(ch, true)
 		} else {
 			ch = s.next()
 		}
@@ -607,9 +705,7 @@ redo:
 		case '.':
 			ch = s.next()
 			if isDecimal(ch) && s.Mode&ScanFloats != 0 {
-				tok = Float
+				tok, ch = s.scanNumber(ch, false)
-				ch = s.scanMantissa(ch)
-				ch = s.scanExponent(ch)
 			}
 		case '/':
 			ch = s.next()

--- a/src/text/scanner/scanner_test.go
+++ b/src/text/scanner/scanner_test.go