Commit e5f3dc8b authored by Nigel Tao

html: refactor the tokenizer; parse "</>" correctly.

Previously, Next would call either nextText or nextTag, but nextTag
could also call nextText. Both nextText and nextTag were responsible
for detecting "</a" end tags and "<!" comments. This change simplifies
the call chain and puts that responsibility in a single place.

R=andybalholm
CC=golang-dev
https://golang.org/cl/5263050
parent d2b73730
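
For context (not part of the change itself), a minimal sketch of the behavior the re-enabled tests below expect: "</>" produces no token at all, so "a</>b" tokenizes as two text tokens. The sketch assumes the package's present-day import path, golang.org/x/net/html; at the time of this CL the tokenizer lived in the standard-library "html" package.

	// Sketch only: exercising the "</>" handling this CL enables.
	package main

	import (
		"fmt"
		"strings"

		"golang.org/x/net/html"
	)

	func main() {
		z := html.NewTokenizer(strings.NewReader("a</>b"))
		for {
			tt := z.Next()
			if tt == html.ErrorToken {
				break // io.EOF, or a genuine read error
			}
			// Expected: two text tokens, "a" and "b"; the "</>" in the middle
			// is dropped, matching the "not a tag #3" golden output "a$b".
			fmt.Printf("%v %q\n", tt, z.Token().Data)
		}
	}
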
@@ -225,11 +225,9 @@ func (z *Tokenizer) skipWhiteSpace() {
 	}
 }
 
-// nextComment reads the next token starting with "<!--".
-// The opening "<!--" has already been consumed.
-// Pre-condition: z.tt == CommentToken && z.err == nil &&
-//	z.raw.start + 4 <= z.raw.end.
-func (z *Tokenizer) nextComment() {
+// readComment reads the next comment token starting with "<!--". The opening
+// "<!--" has already been consumed.
+func (z *Tokenizer) readComment() {
 	z.data.start = z.raw.end
 	defer func() {
 		if z.data.end < z.data.start {
@@ -269,10 +267,8 @@ func (z *Tokenizer) nextComment() {
 	}
 }
 
-// nextBogusComment reads text until the next ">" and treats it as a comment.
-// Pre-condition: z.err == nil && z.raw.end is before the first comment byte.
-func (z *Tokenizer) nextBogusComment() {
-	z.tt = CommentToken
+// readUntilCloseAngle reads until the next ">".
+func (z *Tokenizer) readUntilCloseAngle() {
 	z.data.start = z.raw.end
 	for {
 		c := z.readByte()
@@ -287,24 +283,22 @@ func (z *Tokenizer) nextBogusComment() {
 	}
 }
 
-// nextMarkupDeclaration reads the next token starting with "<!".
-// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
-// The opening "<!" has already been consumed.
-// Pre-condition: z.err == nil && z.raw.start + 2 <= z.raw.end.
-func (z *Tokenizer) nextMarkupDeclaration() {
-	z.tt = CommentToken
+// readMarkupDeclaration reads the next token starting with "<!". It might be
+// a "<!--comment-->", a "<!DOCTYPE foo>", or "<!a bogus comment". The opening
+// "<!" has already been consumed.
+func (z *Tokenizer) readMarkupDeclaration() TokenType {
 	z.data.start = z.raw.end
 	var c [2]byte
 	for i := 0; i < 2; i++ {
 		c[i] = z.readByte()
 		if z.err != nil {
 			z.data.end = z.raw.end
-			return
+			return CommentToken
 		}
 	}
 	if c[0] == '-' && c[1] == '-' {
-		z.nextComment()
-		return
+		z.readComment()
+		return CommentToken
 	}
 	z.raw.end -= 2
 	const s = "DOCTYPE"
@@ -312,81 +306,33 @@ func (z *Tokenizer) nextMarkupDeclaration() {
 		c := z.readByte()
 		if z.err != nil {
 			z.data.end = z.raw.end
-			return
+			return CommentToken
 		}
 		if c != s[i] && c != s[i]+('a'-'A') {
 			// Back up to read the fragment of "DOCTYPE" again.
 			z.raw.end = z.data.start
-			z.nextBogusComment()
-			return
+			z.readUntilCloseAngle()
+			return CommentToken
 		}
 	}
-	z.tt = DoctypeToken
 	if z.skipWhiteSpace(); z.err != nil {
 		z.data.start = z.raw.end
 		z.data.end = z.raw.end
-		return
+		return DoctypeToken
 	}
-	z.data.start = z.raw.end
-	for {
-		c := z.readByte()
-		if z.err != nil {
-			z.data.end = z.raw.end
-			return
-		}
-		if c == '>' {
-			z.data.end = z.raw.end - len(">")
-			return
-		}
-	}
+	z.readUntilCloseAngle()
+	return DoctypeToken
 }
 
-// nextTag reads the next token starting with "<". It might be a "<startTag>",
-// an "</endTag>", a "<!markup declaration>", or "<malformed text".
-// The opening "<" has already been consumed.
-// Pre-condition: z.tt == TextToken && z.err == nil &&
-//	z.raw.start + 1 <= z.raw.end.
-func (z *Tokenizer) nextTag() {
-	c := z.readByte()
-	if z.err != nil {
-		z.data = z.raw
-		return
-	}
-	switch {
-	case c == '/':
-		// Check that the "</" is followed by something in A-Za-z.
-		c = z.readByte()
-		if z.err != nil {
-			z.data = z.raw
-			return
-		}
-		z.raw.end--
-		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
-			z.nextBogusComment()
-			return
-		}
-		z.tt = EndTagToken
-		z.data.start += len("</")
-	// Lower-cased characters are more common in tag names, so we check for them first.
-	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
-		z.tt = StartTagToken
-		z.data.start += len("<")
-	case c == '!':
-		z.nextMarkupDeclaration()
-		return
-	case c == '?':
-		z.raw.end--
-		z.nextBogusComment()
-		return
-	default:
-		z.nextText()
-		return
-	}
+// readStartTag reads the next start tag token. The opening "<a" has already
+// been consumed, where 'a' means anything in [A-Za-z].
+func (z *Tokenizer) readStartTag() TokenType {
+	z.attr = z.attr[:0]
+	z.nAttrReturned = 0
 	// Read the tag name and attribute key/value pairs.
 	z.readTagName()
 	if z.skipWhiteSpace(); z.err != nil {
-		z.tt = ErrorToken
-		return
+		return ErrorToken
 	}
 	for {
 		c := z.readByte()
@@ -404,14 +350,31 @@ func (z *Tokenizer) nextTag() {
 			break
 		}
 	}
-	// Check for a self-closing token.
-	if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
-		z.tt = SelfClosingTagToken
+	if z.err == nil && z.buf[z.raw.end-2] == '/' {
+		return SelfClosingTagToken
 	}
+	return StartTagToken
 }
 
-// readTagName sets z.data to the "p" in "<p k=v>".
+// readEndTag reads the next end tag token. The opening "</a" has already
+// been consumed, where 'a' means anything in [A-Za-z].
+func (z *Tokenizer) readEndTag() {
+	z.attr = z.attr[:0]
+	z.nAttrReturned = 0
+	z.readTagName()
+	for {
+		c := z.readByte()
+		if z.err != nil || c == '>' {
+			return
+		}
+	}
+}
+
+// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
+// is positioned such that the first byte of the tag name (the "d" in "<div")
+// has already been consumed.
 func (z *Tokenizer) readTagName() {
+	z.data.start = z.raw.end - 1
 	for {
 		c := z.readByte()
 		if z.err != nil {
@@ -430,7 +393,7 @@ func (z *Tokenizer) readTagName() {
 	}
 }
 
-// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<p k=v>".
+// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
 // Precondition: z.err == nil.
 func (z *Tokenizer) readTagAttrKey() {
 	z.pendingAttr[0].start = z.raw.end
@@ -452,7 +415,7 @@ func (z *Tokenizer) readTagAttrKey() {
 	}
 }
 
-// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<p k=v>".
+// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
 func (z *Tokenizer) readTagAttrVal() {
 	z.pendingAttr[1].start = z.raw.end
 	z.pendingAttr[1].end = z.raw.end
@@ -514,69 +477,100 @@ func (z *Tokenizer) readTagAttrVal() {
 	}
 }
 
-// nextText reads all text up until a start tag "<a", end tag "</a", comment
-// "<!" or XML processing instruction "<?".
-// Pre-condition: z.tt == TextToken && z.err == nil &&
-//	z.raw.start + 1 <= z.raw.end.
-func (z *Tokenizer) nextText() {
+// next scans the next token and returns its type.
+func (z *Tokenizer) next() TokenType {
+	if z.err != nil {
+		return ErrorToken
+	}
+	z.raw.start = z.raw.end
+	z.data.start = z.raw.end
+	z.data.end = z.raw.end
+loop:
 	for {
 		c := z.readByte()
 		if z.err != nil {
-			break
+			break loop
 		}
 		if c != '<' {
-			continue
+			continue loop
 		}
+
+		// Check if the '<' we have just read is part of a tag, comment
+		// or doctype. If not, it's part of the accumulated text token.
 		c = z.readByte()
 		if z.err != nil {
-			break
+			break loop
 		}
-		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
-			z.raw.end -= 2
-			break
-		}
-		if c != '/' {
+		var tokenType TokenType
+		switch {
+		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
+			tokenType = StartTagToken
+		case c == '/':
+			tokenType = EndTagToken
+		case c == '!' || c == '?':
+			// We use CommentToken to mean any of "<!--actual comments-->",
+			// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
+			tokenType = CommentToken
+		default:
 			continue
 		}
-		c = z.readByte()
-		if z.err != nil {
-			break
+
+		// We have a non-text token, but we might have accumulated some text
+		// before that. If so, we return the text first, and return the non-
+		// text token on the subsequent call to Next.
+		if x := z.raw.end - len("<a"); z.raw.start < x {
+			z.raw.end = x
+			z.data.end = x
+			return TextToken
 		}
-		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
-			z.raw.end -= 3
-			break
+		switch tokenType {
+		case StartTagToken:
+			return z.readStartTag()
+		case EndTagToken:
+			c = z.readByte()
+			if z.err != nil {
+				break loop
+			}
+			if c == '>' {
+				// "</>" does not generate a token at all.
+				// Reset the tokenizer state and start again.
+				z.raw.start = z.raw.end
+				z.data.start = z.raw.end
+				z.data.end = z.raw.end
+				continue loop
+			}
+			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+				z.readEndTag()
+				return EndTagToken
+			}
+			z.raw.end--
+			z.readUntilCloseAngle()
+			return CommentToken
+		case CommentToken:
+			if c == '!' {
+				return z.readMarkupDeclaration()
+			}
+			z.raw.end--
+			z.readUntilCloseAngle()
+			return CommentToken
 		}
 	}
-	z.data = z.raw
+	if z.raw.start < z.raw.end {
+		z.data.end = z.raw.end
+		return TextToken
+	}
+	return ErrorToken
 }
 
 // Next scans the next token and returns its type.
 func (z *Tokenizer) Next() TokenType {
 	for {
-		if z.err != nil {
-			z.tt = ErrorToken
-			return z.tt
-		}
-		z.raw.start = z.raw.end
-		z.data.start = z.raw.end
-		z.data.end = z.raw.end
-		z.attr = z.attr[:0]
-		z.nAttrReturned = 0
-		c := z.readByte()
-		if z.err != nil {
-			z.tt = ErrorToken
-			return z.tt
-		}
-		// We assume that the next token is text unless proven otherwise.
-		z.tt = TextToken
-		if c != '<' {
-			z.nextText()
-		} else {
-			z.nextTag()
-			if z.tt == CommentToken && !z.ReturnComments {
-				continue
-			}
+		z.tt = z.next()
+		// TODO: remove the ReturnComments option. A tokenizer should
+		// always return comment tags.
+		if z.tt == CommentToken && !z.ReturnComments {
+			continue
 		}
 		return z.tt
 	}
 }
@@ -606,12 +600,14 @@ func (z *Tokenizer) Text() []byte {
 // `<IMG SRC="foo">`) and whether the tag has attributes.
 // The contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
-	switch z.tt {
-	case StartTagToken, EndTagToken, SelfClosingTagToken:
-		s := z.buf[z.data.start:z.data.end]
-		z.data.start = z.raw.end
-		z.data.end = z.raw.end
-		return lower(s), z.nAttrReturned < len(z.attr)
+	if z.data.start < z.data.end {
+		switch z.tt {
+		case StartTagToken, EndTagToken, SelfClosingTagToken:
+			s := z.buf[z.data.start:z.data.end]
+			z.data.start = z.raw.end
+			z.data.end = z.raw.end
+			return lower(s), z.nAttrReturned < len(z.attr)
+		}
 	}
 	return nil, false
 }
@@ -622,7 +618,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
 func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
 	if z.nAttrReturned < len(z.attr) {
 		switch z.tt {
-		case StartTagToken, EndTagToken, SelfClosingTagToken:
+		case StartTagToken, SelfClosingTagToken:
 			x := z.attr[z.nAttrReturned]
 			z.nAttrReturned++
 			key = z.buf[x[0].start:x[0].end]
@@ -640,7 +636,7 @@ func (z *Tokenizer) Token() Token {
 	switch z.tt {
 	case TextToken, CommentToken, DoctypeToken:
 		t.Data = string(z.Text())
-	case StartTagToken, EndTagToken, SelfClosingTagToken:
+	case StartTagToken, SelfClosingTagToken:
 		var attr []Attribute
 		name, moreAttr := z.TagName()
 		for moreAttr {
@@ -650,6 +646,9 @@ func (z *Tokenizer) Token() Token {
 		}
 		t.Data = string(name)
 		t.Attr = attr
+	case EndTagToken:
+		name, _ := z.TagName()
+		t.Data = string(name)
 	}
 	return t
 }
@@ -57,19 +57,16 @@ var tokenTests = []tokenTest{
 		"</",
 		"&lt;/",
 	},
-	/*
-		// TODO: re-enable these tests when we tokenize them correctly.
-		{
-			"not a tag #2",
-			"</>",
-			"",
-		},
-		{
-			"not a tag #3",
-			"a</>b",
-			"a$b",
-		},
-	*/
+	{
+		"not a tag #2",
+		"</>",
+		"",
+	},
+	{
+		"not a tag #3",
+		"a</>b",
+		"a$b",
+	},
 	{
 		"not a tag #4",
 		"</ >",
@@ -77,21 +74,31 @@
 	},
 	{
 		"not a tag #5",
+		"</.",
+		"<!--.-->",
+	},
+	{
+		"not a tag #6",
+		"</.>",
+		"<!--.-->",
+	},
+	{
+		"not a tag #7",
 		"a < b",
 		"a &lt; b",
 	},
 	{
-		"not a tag #6",
+		"not a tag #8",
 		"<.>",
 		"&lt;.&gt;",
 	},
 	{
-		"not a tag #7",
+		"not a tag #9",
 		"a<<<b>>>c",
 		"a&lt;&lt;$<b>$&gt;&gt;c",
 	},
 	{
-		"not a tag #8",
+		"not a tag #10",
 		"if x<0 and y < 0 then x*y>0",
 		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
 	},
@@ -345,7 +352,7 @@ var tokenTests = []tokenTest{
 func TestTokenizer(t *testing.T) {
 loop:
 	for _, tt := range tokenTests {
-		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
+		z := NewTokenizer(strings.NewReader(tt.html))
 		z.ReturnComments = true
 		if tt.golden != "" {
 			for i, s := range strings.Split(tt.golden, "$") {