Commit 968732b6 authored by Robert Griesemer

go/scanner: reject BOMs that are not at the beginning

For compliance with gc. See also issue 5265.
Not Go1.1 critical, but harmless.

R=r
CC=golang-dev
https://golang.org/cl/8736043
parent d4d06358
...@@ -48,6 +48,8 @@ type Scanner struct { ...@@ -48,6 +48,8 @@ type Scanner struct {
ErrorCount int // number of errors encountered ErrorCount int // number of errors encountered
} }
const bom = 0xFEFF // byte order mark, only permitted as very first character
// Read the next Unicode char into s.ch. // Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file. // s.ch < 0 means end-of-file.
// //
...@@ -67,6 +69,8 @@ func (s *Scanner) next() { ...@@ -67,6 +69,8 @@ func (s *Scanner) next() {
r, w = utf8.DecodeRune(s.src[s.rdOffset:]) r, w = utf8.DecodeRune(s.src[s.rdOffset:])
if r == utf8.RuneError && w == 1 { if r == utf8.RuneError && w == 1 {
s.error(s.offset, "illegal UTF-8 encoding") s.error(s.offset, "illegal UTF-8 encoding")
} else if r == bom && s.offset > 0 {
s.error(s.offset, "illegal byte order mark")
} }
} }
s.rdOffset += w s.rdOffset += w
...@@ -125,8 +129,8 @@ func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode ...@@ -125,8 +129,8 @@ func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode
s.ErrorCount = 0 s.ErrorCount = 0
s.next() s.next()
if s.ch == '\uFEFF' { if s.ch == bom {
s.next() // ignore BOM s.next() // ignore BOM at file beginning
} }
} }
...@@ -713,7 +717,10 @@ scanAgain: ...@@ -713,7 +717,10 @@ scanAgain:
case '|': case '|':
tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
default: default:
// next reports unexpected BOMs - don't repeat
if ch != bom {
s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
}
insertSemi = s.insertSemi // preserve insertSemi info insertSemi = s.insertSemi // preserve insertSemi info
tok = token.ILLEGAL tok = token.ILLEGAL
lit = string(ch) lit = string(ch)
......
...@@ -695,7 +695,10 @@ var errors = []struct { ...@@ -695,7 +695,10 @@ var errors = []struct {
{"0X", token.INT, 0, "illegal hexadecimal number"}, {"0X", token.INT, 0, "illegal hexadecimal number"},
{"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"}, {"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"},
{"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"}, {"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"},
{"\ufeff\ufeff", token.ILLEGAL, 3, "illegal character U+FEFF"}, // only first BOM is ignored {"\ufeff\ufeff", token.ILLEGAL, 3, "illegal byte order mark"}, // only first BOM is ignored
{"//\ufeff", token.COMMENT, 2, "illegal byte order mark"}, // only first BOM is ignored
{"'\ufeff" + `'`, token.CHAR, 1, "illegal byte order mark"}, // only first BOM is ignored
{`"` + "abc\ufeffdef" + `"`, token.STRING, 4, "illegal byte order mark"}, // only first BOM is ignored
} }
func TestScanErrors(t *testing.T) { func TestScanErrors(t *testing.T) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment