Commit 3fc327b3 authored by Robert Griesemer

go/scanner: 17% faster scanning

- Changed the Scan API semantics slightly:
The token literal string is only returned
if the token is a literal, comment, semicolon,
or illegal character. In all other cases, the
token literal value is determined by the token
value.

Clients that need a literal string even for tokens
where Scan now returns an empty one can always use
the following piece of code:

pos, tok, lit := scanner.Scan()
if lit == "" {
   lit = tok.String()
}

- Changed token.Lookup API to use a string instead
of a []byte argument.

- Both these changes were long-standing TODOs.

- Added BenchmarkScan.

This change permits a faster implementation of Scan
with far fewer string creations:

benchmark                old ns/op    new ns/op    delta
scanner.BenchmarkScan        74404        61457  -17.40%

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/5532076
parent d9b82baa
...@@ -1374,7 +1374,7 @@ func (c *typeConv) Struct(dt *dwarf.StructType) (expr *ast.StructType, csyntax s ...@@ -1374,7 +1374,7 @@ func (c *typeConv) Struct(dt *dwarf.StructType) (expr *ast.StructType, csyntax s
if !*godefs && !*cdefs { if !*godefs && !*cdefs {
for cid, goid := range ident { for cid, goid := range ident {
if token.Lookup([]byte(goid)).IsKeyword() { if token.Lookup(goid).IsKeyword() {
// Avoid keyword // Avoid keyword
goid = "_" + goid goid = "_" + goid
......
...@@ -157,7 +157,7 @@ func (S *Scanner) interpretLineComment(text []byte) { ...@@ -157,7 +157,7 @@ func (S *Scanner) interpretLineComment(text []byte) {
} }
} }
func (S *Scanner) scanComment() { func (S *Scanner) scanComment() string {
// initial '/' already consumed; S.ch == '/' || S.ch == '*' // initial '/' already consumed; S.ch == '/' || S.ch == '*'
offs := S.offset - 1 // position of initial '/' offs := S.offset - 1 // position of initial '/'
...@@ -171,7 +171,7 @@ func (S *Scanner) scanComment() { ...@@ -171,7 +171,7 @@ func (S *Scanner) scanComment() {
// comment starts at the beginning of the current line // comment starts at the beginning of the current line
S.interpretLineComment(S.src[offs:S.offset]) S.interpretLineComment(S.src[offs:S.offset])
} }
return goto exit
} }
/*-style comment */ /*-style comment */
...@@ -181,11 +181,14 @@ func (S *Scanner) scanComment() { ...@@ -181,11 +181,14 @@ func (S *Scanner) scanComment() {
S.next() S.next()
if ch == '*' && S.ch == '/' { if ch == '*' && S.ch == '/' {
S.next() S.next()
return goto exit
} }
} }
S.error(offs, "comment not terminated") S.error(offs, "comment not terminated")
exit:
return string(S.src[offs:S.offset])
} }
func (S *Scanner) findLineEnd() bool { func (S *Scanner) findLineEnd() bool {
...@@ -240,12 +243,12 @@ func isDigit(ch rune) bool { ...@@ -240,12 +243,12 @@ func isDigit(ch rune) bool {
return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch) return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
} }
func (S *Scanner) scanIdentifier() token.Token { func (S *Scanner) scanIdentifier() string {
offs := S.offset offs := S.offset
for isLetter(S.ch) || isDigit(S.ch) { for isLetter(S.ch) || isDigit(S.ch) {
S.next() S.next()
} }
return token.Lookup(S.src[offs:S.offset]) return string(S.src[offs:S.offset])
} }
func digitVal(ch rune) int { func digitVal(ch rune) int {
...@@ -266,11 +269,13 @@ func (S *Scanner) scanMantissa(base int) { ...@@ -266,11 +269,13 @@ func (S *Scanner) scanMantissa(base int) {
} }
} }
func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token { func (S *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
// digitVal(S.ch) < 10 // digitVal(S.ch) < 10
offs := S.offset
tok := token.INT tok := token.INT
if seenDecimalPoint { if seenDecimalPoint {
offs--
tok = token.FLOAT tok = token.FLOAT
S.scanMantissa(10) S.scanMantissa(10)
goto exponent goto exponent
...@@ -334,7 +339,7 @@ exponent: ...@@ -334,7 +339,7 @@ exponent:
} }
exit: exit:
return tok return tok, string(S.src[offs:S.offset])
} }
func (S *Scanner) scanEscape(quote rune) { func (S *Scanner) scanEscape(quote rune) {
...@@ -381,7 +386,7 @@ func (S *Scanner) scanEscape(quote rune) { ...@@ -381,7 +386,7 @@ func (S *Scanner) scanEscape(quote rune) {
} }
} }
func (S *Scanner) scanChar() { func (S *Scanner) scanChar() string {
// '\'' opening already consumed // '\'' opening already consumed
offs := S.offset - 1 offs := S.offset - 1
...@@ -405,9 +410,11 @@ func (S *Scanner) scanChar() { ...@@ -405,9 +410,11 @@ func (S *Scanner) scanChar() {
if n != 1 { if n != 1 {
S.error(offs, "illegal character literal") S.error(offs, "illegal character literal")
} }
return string(S.src[offs:S.offset])
} }
func (S *Scanner) scanString() { func (S *Scanner) scanString() string {
// '"' opening already consumed // '"' opening already consumed
offs := S.offset - 1 offs := S.offset - 1
...@@ -424,12 +431,27 @@ func (S *Scanner) scanString() { ...@@ -424,12 +431,27 @@ func (S *Scanner) scanString() {
} }
S.next() S.next()
return string(S.src[offs:S.offset])
}
func stripCR(b []byte) []byte {
c := make([]byte, len(b))
i := 0
for _, ch := range b {
if ch != '\r' {
c[i] = ch
i++
}
}
return c[:i]
} }
func (S *Scanner) scanRawString() (hasCR bool) { func (S *Scanner) scanRawString() string {
// '`' opening already consumed // '`' opening already consumed
offs := S.offset - 1 offs := S.offset - 1
hasCR := false
for S.ch != '`' { for S.ch != '`' {
ch := S.ch ch := S.ch
S.next() S.next()
...@@ -443,7 +465,13 @@ func (S *Scanner) scanRawString() (hasCR bool) { ...@@ -443,7 +465,13 @@ func (S *Scanner) scanRawString() (hasCR bool) {
} }
S.next() S.next()
return
lit := S.src[offs:S.offset]
if hasCR {
lit = stripCR(lit)
}
return string(lit)
} }
func (S *Scanner) skipWhitespace() { func (S *Scanner) skipWhitespace() {
...@@ -494,27 +522,24 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Tok ...@@ -494,27 +522,24 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Tok
return tok0 return tok0
} }
func stripCR(b []byte) []byte { // Scan scans the next token and returns the token position, the token,
c := make([]byte, len(b)) // and its literal string if applicable. The source end is indicated by
i := 0 // token.EOF.
for _, ch := range b { //
if ch != '\r' { // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
c[i] = ch // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
i++ // has the corresponding value.
}
}
return c[:i]
}
// Scan scans the next token and returns the token position,
// the token, and the literal string corresponding to the
// token. The source end is indicated by token.EOF.
// //
// If the returned token is token.SEMICOLON, the corresponding // If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source, // literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or // and "\n" if the semicolon was inserted because of a newline or
// at EOF. // at EOF.
// //
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if // For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even // possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens, // if the resulting token sequence contains no illegal tokens,
...@@ -526,34 +551,33 @@ func stripCR(b []byte) []byte { ...@@ -526,34 +551,33 @@ func stripCR(b []byte) []byte {
// set with Init. Token positions are relative to that file // set with Init. Token positions are relative to that file
// and thus relative to the file set. // and thus relative to the file set.
// //
func (S *Scanner) Scan() (token.Pos, token.Token, string) { func (S *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain: scanAgain:
S.skipWhitespace() S.skipWhitespace()
// current token start // current token start
insertSemi := false pos = S.file.Pos(S.offset)
offs := S.offset
tok := token.ILLEGAL
hasCR := false
// determine token value // determine token value
insertSemi := false
switch ch := S.ch; { switch ch := S.ch; {
case isLetter(ch): case isLetter(ch):
tok = S.scanIdentifier() lit = S.scanIdentifier()
tok = token.Lookup(lit)
switch tok { switch tok {
case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
insertSemi = true insertSemi = true
} }
case digitVal(ch) < 10: case digitVal(ch) < 10:
insertSemi = true insertSemi = true
tok = S.scanNumber(false) tok, lit = S.scanNumber(false)
default: default:
S.next() // always make progress S.next() // always make progress
switch ch { switch ch {
case -1: case -1:
if S.insertSemi { if S.insertSemi {
S.insertSemi = false // EOF consumed S.insertSemi = false // EOF consumed
return S.file.Pos(offs), token.SEMICOLON, "\n" return pos, token.SEMICOLON, "\n"
} }
tok = token.EOF tok = token.EOF
case '\n': case '\n':
...@@ -561,25 +585,25 @@ scanAgain: ...@@ -561,25 +585,25 @@ scanAgain:
// set in the first place and exited early // set in the first place and exited early
// from S.skipWhitespace() // from S.skipWhitespace()
S.insertSemi = false // newline consumed S.insertSemi = false // newline consumed
return S.file.Pos(offs), token.SEMICOLON, "\n" return pos, token.SEMICOLON, "\n"
case '"': case '"':
insertSemi = true insertSemi = true
tok = token.STRING tok = token.STRING
S.scanString() lit = S.scanString()
case '\'': case '\'':
insertSemi = true insertSemi = true
tok = token.CHAR tok = token.CHAR
S.scanChar() lit = S.scanChar()
case '`': case '`':
insertSemi = true insertSemi = true
tok = token.STRING tok = token.STRING
hasCR = S.scanRawString() lit = S.scanRawString()
case ':': case ':':
tok = S.switch2(token.COLON, token.DEFINE) tok = S.switch2(token.COLON, token.DEFINE)
case '.': case '.':
if digitVal(S.ch) < 10 { if digitVal(S.ch) < 10 {
insertSemi = true insertSemi = true
tok = S.scanNumber(true) tok, lit = S.scanNumber(true)
} else if S.ch == '.' { } else if S.ch == '.' {
S.next() S.next()
if S.ch == '.' { if S.ch == '.' {
...@@ -593,6 +617,7 @@ scanAgain: ...@@ -593,6 +617,7 @@ scanAgain:
tok = token.COMMA tok = token.COMMA
case ';': case ';':
tok = token.SEMICOLON tok = token.SEMICOLON
lit = ";"
case '(': case '(':
tok = token.LPAREN tok = token.LPAREN
case ')': case ')':
...@@ -626,12 +651,12 @@ scanAgain: ...@@ -626,12 +651,12 @@ scanAgain:
if S.insertSemi && S.findLineEnd() { if S.insertSemi && S.findLineEnd() {
// reset position to the beginning of the comment // reset position to the beginning of the comment
S.ch = '/' S.ch = '/'
S.offset = offs S.offset = S.file.Offset(pos)
S.rdOffset = offs + 1 S.rdOffset = S.offset + 1
S.insertSemi = false // newline consumed S.insertSemi = false // newline consumed
return S.file.Pos(offs), token.SEMICOLON, "\n" return pos, token.SEMICOLON, "\n"
} }
S.scanComment() lit = S.scanComment()
if S.mode&ScanComments == 0 { if S.mode&ScanComments == 0 {
// skip comment // skip comment
S.insertSemi = false // newline consumed S.insertSemi = false // newline consumed
...@@ -668,21 +693,15 @@ scanAgain: ...@@ -668,21 +693,15 @@ scanAgain:
case '|': case '|':
tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
default: default:
S.error(offs, fmt.Sprintf("illegal character %#U", ch)) S.error(S.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
insertSemi = S.insertSemi // preserve insertSemi info insertSemi = S.insertSemi // preserve insertSemi info
tok = token.ILLEGAL
lit = string(ch)
} }
} }
if S.mode&dontInsertSemis == 0 { if S.mode&dontInsertSemis == 0 {
S.insertSemi = insertSemi S.insertSemi = insertSemi
} }
// TODO(gri): The scanner API should change such that the literal string return
// is only valid if an actual literal was scanned. This will
// permit a more efficient implementation.
lit := S.src[offs:S.offset]
if hasCR {
lit = stripCR(lit)
}
return S.file.Pos(offs), tok, string(lit)
} }
...@@ -177,6 +177,15 @@ var tokens = [...]elt{ ...@@ -177,6 +177,15 @@ var tokens = [...]elt{
const whitespace = " \t \n\n\n" // to separate tokens const whitespace = " \t \n\n\n" // to separate tokens
// source is the scanner test input: the literal of every entry in tokens,
// each followed by the whitespace separator. It is assembled once, when the
// package is initialized.
var source = func() []byte {
	var buf []byte
	for i := range tokens {
		// literal first, then the separator that keeps tokens apart
		buf = append(append(buf, tokens[i].lit...), whitespace...)
	}
	return buf
}()
type testErrorHandler struct { type testErrorHandler struct {
t *testing.T t *testing.T
} }
...@@ -214,20 +223,20 @@ func checkPos(t *testing.T, lit string, p token.Pos, expected token.Position) { ...@@ -214,20 +223,20 @@ func checkPos(t *testing.T, lit string, p token.Pos, expected token.Position) {
// Verify that calling Scan() provides the correct results. // Verify that calling Scan() provides the correct results.
func TestScan(t *testing.T) { func TestScan(t *testing.T) {
// make source // make source
var src string src_linecount := newlineCount(string(source))
for _, e := range tokens {
src += e.lit + whitespace
}
src_linecount := newlineCount(src)
whitespace_linecount := newlineCount(whitespace) whitespace_linecount := newlineCount(whitespace)
// verify scan // verify scan
var s Scanner var s Scanner
s.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), &testErrorHandler{t}, ScanComments|dontInsertSemis) s.Init(fset.AddFile("", fset.Base(), len(source)), source, &testErrorHandler{t}, ScanComments|dontInsertSemis)
index := 0 index := 0
epos := token.Position{"", 0, 1, 1} // expected position epos := token.Position{"", 0, 1, 1} // expected position
for { for {
pos, tok, lit := s.Scan() pos, tok, lit := s.Scan()
if lit == "" {
// no literal value for non-literal tokens
lit = tok.String()
}
e := elt{token.EOF, "", special} e := elt{token.EOF, "", special}
if index < len(tokens) { if index < len(tokens) {
e = tokens[index] e = tokens[index]
...@@ -659,3 +668,20 @@ func TestScanErrors(t *testing.T) { ...@@ -659,3 +668,20 @@ func TestScanErrors(t *testing.T) {
checkError(t, e.src, e.tok, e.pos, e.err) checkError(t, e.src, e.tok, e.pos, e.err)
} }
} }
// BenchmarkScan times scanning the shared test source from its first token
// through token.EOF, repeated b.N times. Creating the file set and file is
// done with the timer stopped so that only Init and Scan are measured.
func BenchmarkScan(b *testing.B) {
	b.StopTimer()
	fset := token.NewFileSet()
	file := fset.AddFile("", fset.Base(), len(source))
	var s Scanner
	b.StartTimer()

	for iter := 0; iter < b.N; iter++ {
		// re-initialize so every iteration scans the whole source again
		s.Init(file, source, nil, ScanComments)
		for {
			if _, tok, _ := s.Scan(); tok == token.EOF {
				break
			}
		}
	}
}
...@@ -283,10 +283,8 @@ func init() { ...@@ -283,10 +283,8 @@ func init() {
// Lookup maps an identifier to its keyword token or IDENT (if not a keyword). // Lookup maps an identifier to its keyword token or IDENT (if not a keyword).
// //
func Lookup(ident []byte) Token { func Lookup(ident string) Token {
// TODO Maps with []byte key are illegal because []byte does not if tok, is_keyword := keywords[ident]; is_keyword {
// support == . Should find a more efficient solution eventually.
if tok, is_keyword := keywords[string(ident)]; is_keyword {
return tok return tok
} }
return IDENT return IDENT
...@@ -295,16 +293,16 @@ func Lookup(ident []byte) Token { ...@@ -295,16 +293,16 @@ func Lookup(ident []byte) Token {
// Predicates // Predicates
// IsLiteral returns true for tokens corresponding to identifiers // IsLiteral returns true for tokens corresponding to identifiers
// and basic type literals; returns false otherwise. // and basic type literals; it returns false otherwise.
// //
func (tok Token) IsLiteral() bool { return literal_beg < tok && tok < literal_end } func (tok Token) IsLiteral() bool { return literal_beg < tok && tok < literal_end }
// IsOperator returns true for tokens corresponding to operators and // IsOperator returns true for tokens corresponding to operators and
// delimiters; returns false otherwise. // delimiters; it returns false otherwise.
// //
func (tok Token) IsOperator() bool { return operator_beg < tok && tok < operator_end } func (tok Token) IsOperator() bool { return operator_beg < tok && tok < operator_end }
// IsKeyword returns true for tokens corresponding to keywords; // IsKeyword returns true for tokens corresponding to keywords;
// returns false otherwise. // it returns false otherwise.
// //
func (tok Token) IsKeyword() bool { return keyword_beg < tok && tok < keyword_end } func (tok Token) IsKeyword() bool { return keyword_beg < tok && tok < keyword_end }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment