Commit e5f3dc8b authored by Nigel Tao

html: refactor the tokenizer; parse "</>" correctly.

Previously, Next would call either nextText or nextTag, but nextTag
could also call nextText. Both nextText and nextTag were responsible
for detecting "</a" end tags and "<!" comments. This change simplifies
the call chain and puts that responsibility in a single place.

R=andybalholm
CC=golang-dev
https://golang.org/cl/5263050
parent d2b73730
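
Below is a minimal sketch of the behavior this change fixes, assuming the Tokenizer API shown in the diff (NewTokenizer, Next, Text). The import path is an assumption: at the time of this CL the package was simply "html" in the main tree, while the tokenizer later moved to golang.org/x/net/html. Tokenizing "a</>b" now yields exactly two text tokens, "a" and "b", because "</>" generates no token at all (see the new "not a tag #3" test case).

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html" // assumed import path; this CL predates the package's move out of the main tree
)

func main() {
	z := html.NewTokenizer(strings.NewReader("a</>b"))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break // io.EOF: the input is exhausted
		}
		// After this change, "</>" resets the tokenizer and starts again,
		// so the loop prints only the text tokens "a" and "b".
		fmt.Printf("%v %q\n", tt, z.Text())
	}
}
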
@@ -225,11 +225,9 @@ func (z *Tokenizer) skipWhiteSpace() {
}
}
// nextComment reads the next token starting with "<!--".
// The opening "<!--" has already been consumed.
// Pre-condition: z.tt == CommentToken && z.err == nil &&
// z.raw.start + 4 <= z.raw.end.
func (z *Tokenizer) nextComment() {
// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
z.data.start = z.raw.end
defer func() {
if z.data.end < z.data.start {
@@ -269,10 +267,8 @@ func (z *Tokenizer) nextComment() {
}
}
// nextBogusComment reads text until the next ">" and treats it as a comment.
// Pre-condition: z.err == nil && z.raw.end is before the first comment byte.
func (z *Tokenizer) nextBogusComment() {
z.tt = CommentToken
// readUntilCloseAngle reads until the next ">".
func (z *Tokenizer) readUntilCloseAngle() {
z.data.start = z.raw.end
for {
c := z.readByte()
@@ -287,24 +283,22 @@ func (z *Tokenizer) nextBogusComment() {
}
}
// nextMarkupDeclaration reads the next token starting with "<!".
// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
// The opening "<!" has already been consumed.
// Pre-condition: z.err == nil && z.raw.start + 2 <= z.raw.end.
func (z *Tokenizer) nextMarkupDeclaration() {
z.tt = CommentToken
// readMarkupDeclaration reads the next token starting with "<!". It might be
// a "<!--comment-->", a "<!DOCTYPE foo>", or "<!a bogus comment". The opening
// "<!" has already been consumed.
func (z *Tokenizer) readMarkupDeclaration() TokenType {
z.data.start = z.raw.end
var c [2]byte
for i := 0; i < 2; i++ {
c[i] = z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
return CommentToken
}
}
if c[0] == '-' && c[1] == '-' {
z.nextComment()
return
z.readComment()
return CommentToken
}
z.raw.end -= 2
const s = "DOCTYPE"
@@ -312,81 +306,33 @@ func (z *Tokenizer) nextMarkupDeclaration() {
c := z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
return CommentToken
}
if c != s[i] && c != s[i]+('a'-'A') {
// Back up to read the fragment of "DOCTYPE" again.
z.raw.end = z.data.start
z.nextBogusComment()
return
z.readUntilCloseAngle()
return CommentToken
}
}
z.tt = DoctypeToken
if z.skipWhiteSpace(); z.err != nil {
z.data.start = z.raw.end
z.data.end = z.raw.end
return
}
z.data.start = z.raw.end
for {
c := z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
}
if c == '>' {
z.data.end = z.raw.end - len(">")
return
}
return DoctypeToken
}
z.readUntilCloseAngle()
return DoctypeToken
}
// nextTag reads the next token starting with "<". It might be a "<startTag>",
// an "</endTag>", a "<!markup declaration>", or "<malformed text".
// The opening "<" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 1 <= z.raw.end.
func (z *Tokenizer) nextTag() {
c := z.readByte()
if z.err != nil {
z.data = z.raw
return
}
switch {
case c == '/':
// Check that the "</" is followed by something in A-Za-z.
c = z.readByte()
if z.err != nil {
z.data = z.raw
return
}
z.raw.end--
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
z.nextBogusComment()
return
}
z.tt = EndTagToken
z.data.start += len("</")
// Lower-cased characters are more common in tag names, so we check for them first.
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
z.tt = StartTagToken
z.data.start += len("<")
case c == '!':
z.nextMarkupDeclaration()
return
case c == '?':
z.raw.end--
z.nextBogusComment()
return
default:
z.nextText()
return
}
// readStartTag reads the next start tag token. The opening "<a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readStartTag() TokenType {
z.attr = z.attr[:0]
z.nAttrReturned = 0
// Read the tag name and attribute key/value pairs.
z.readTagName()
if z.skipWhiteSpace(); z.err != nil {
z.tt = ErrorToken
return
return ErrorToken
}
for {
c := z.readByte()
@@ -404,14 +350,31 @@ func (z *Tokenizer) nextTag() {
break
}
}
// Check for a self-closing token.
if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
z.tt = SelfClosingTagToken
if z.err == nil && z.buf[z.raw.end-2] == '/' {
return SelfClosingTagToken
}
return StartTagToken
}
// readTagName sets z.data to the "p" in "<p k=v>".
// readEndTag reads the next end tag token. The opening "</a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readEndTag() {
z.attr = z.attr[:0]
z.nAttrReturned = 0
z.readTagName()
for {
c := z.readByte()
if z.err != nil || c == '>' {
return
}
}
}
// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
// is positioned such that the first byte of the tag name (the "d" in "<div")
// has already been consumed.
func (z *Tokenizer) readTagName() {
z.data.start = z.raw.end - 1
for {
c := z.readByte()
if z.err != nil {
@@ -430,7 +393,7 @@ func (z *Tokenizer) readTagName() {
}
}
// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<p k=v>".
// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
// Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() {
z.pendingAttr[0].start = z.raw.end
@@ -452,7 +415,7 @@ func (z *Tokenizer) readTagAttrKey() {
}
}
// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<p k=v>".
// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
func (z *Tokenizer) readTagAttrVal() {
z.pendingAttr[1].start = z.raw.end
z.pendingAttr[1].end = z.raw.end
@@ -514,69 +477,100 @@ func (z *Tokenizer) readTagAttrVal() {
}
}
// nextText reads all text up until a start tag "<a", end tag "</a", comment
// "<!" or XML processing instruction "<?".
// Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 1 <= z.raw.end.
func (z *Tokenizer) nextText() {
// next scans the next token and returns its type.
func (z *Tokenizer) next() TokenType {
if z.err != nil {
return ErrorToken
}
z.raw.start = z.raw.end
z.data.start = z.raw.end
z.data.end = z.raw.end
loop:
for {
c := z.readByte()
if z.err != nil {
break
break loop
}
if c != '<' {
continue
continue loop
}
// Check if the '<' we have just read is part of a tag, comment
// or doctype. If not, it's part of the accumulated text token.
c = z.readByte()
if z.err != nil {
break
}
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
z.raw.end -= 2
break
}
if c != '/' {
break loop
}
var tokenType TokenType
switch {
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
tokenType = StartTagToken
case c == '/':
tokenType = EndTagToken
case c == '!' || c == '?':
// We use CommentToken to mean any of "<!--actual comments-->",
// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
tokenType = CommentToken
default:
continue
}
c = z.readByte()
if z.err != nil {
break
// We have a non-text token, but we might have accumulated some text
// before that. If so, we return the text first, and return the non-
// text token on the subsequent call to Next.
if x := z.raw.end - len("<a"); z.raw.start < x {
z.raw.end = x
z.data.end = x
return TextToken
}
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
z.raw.end -= 3
break
switch tokenType {
case StartTagToken:
return z.readStartTag()
case EndTagToken:
c = z.readByte()
if z.err != nil {
break loop
}
if c == '>' {
// "</>" does not generate a token at all.
// Reset the tokenizer state and start again.
z.raw.start = z.raw.end
z.data.start = z.raw.end
z.data.end = z.raw.end
continue loop
}
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
z.readEndTag()
return EndTagToken
}
z.raw.end--
z.readUntilCloseAngle()
return CommentToken
case CommentToken:
if c == '!' {
return z.readMarkupDeclaration()
}
z.raw.end--
z.readUntilCloseAngle()
return CommentToken
}
}
z.data = z.raw
if z.raw.start < z.raw.end {
z.data.end = z.raw.end
return TextToken
}
return ErrorToken
}
// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
for {
if z.err != nil {
z.tt = ErrorToken
return z.tt
}
z.raw.start = z.raw.end
z.data.start = z.raw.end
z.data.end = z.raw.end
z.attr = z.attr[:0]
z.nAttrReturned = 0
c := z.readByte()
if z.err != nil {
z.tt = ErrorToken
return z.tt
}
// We assume that the next token is text unless proven otherwise.
z.tt = TextToken
if c != '<' {
z.nextText()
} else {
z.nextTag()
if z.tt == CommentToken && !z.ReturnComments {
continue
}
z.tt = z.next()
// TODO: remove the ReturnComments option. A tokenizer should
// always return comment tags.
if z.tt == CommentToken && !z.ReturnComments {
continue
}
return z.tt
}
@@ -606,12 +600,14 @@ func (z *Tokenizer) Text() []byte {
// `<IMG SRC="foo">`) and whether the tag has attributes.
// The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
switch z.tt {
case StartTagToken, EndTagToken, SelfClosingTagToken:
s := z.buf[z.data.start:z.data.end]
z.data.start = z.raw.end
z.data.end = z.raw.end
return lower(s), z.nAttrReturned < len(z.attr)
if z.data.start < z.data.end {
switch z.tt {
case StartTagToken, EndTagToken, SelfClosingTagToken:
s := z.buf[z.data.start:z.data.end]
z.data.start = z.raw.end
z.data.end = z.raw.end
return lower(s), z.nAttrReturned < len(z.attr)
}
}
return nil, false
}
@@ -622,7 +618,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
if z.nAttrReturned < len(z.attr) {
switch z.tt {
case StartTagToken, EndTagToken, SelfClosingTagToken:
case StartTagToken, SelfClosingTagToken:
x := z.attr[z.nAttrReturned]
z.nAttrReturned++
key = z.buf[x[0].start:x[0].end]
@@ -640,7 +636,7 @@ func (z *Tokenizer) Token() Token {
switch z.tt {
case TextToken, CommentToken, DoctypeToken:
t.Data = string(z.Text())
case StartTagToken, EndTagToken, SelfClosingTagToken:
case StartTagToken, SelfClosingTagToken:
var attr []Attribute
name, moreAttr := z.TagName()
for moreAttr {
@@ -650,6 +646,9 @@ func (z *Tokenizer) Token() Token {
}
t.Data = string(name)
t.Attr = attr
case EndTagToken:
name, _ := z.TagName()
t.Data = string(name)
}
return t
}
@@ -57,19 +57,16 @@ var tokenTests = []tokenTest{
"</",
"&lt;/",
},
/*
// TODO: re-enable these tests when we tokenize them correctly.
{
"not a tag #2",
"</>",
"",
},
{
"not a tag #3",
"a</>b",
"a$b",
},
*/
{
"not a tag #2",
"</>",
"",
},
{
"not a tag #3",
"a</>b",
"a$b",
},
{
"not a tag #4",
"</ >",
@@ -77,21 +74,31 @@ var tokenTests = []tokenTest{
},
{
"not a tag #5",
"</.",
"<!--.-->",
},
{
"not a tag #6",
"</.>",
"<!--.-->",
},
{
"not a tag #7",
"a < b",
"a &lt; b",
},
{
"not a tag #6",
"not a tag #8",
"<.>",
"&lt;.&gt;",
},
{
"not a tag #7",
"not a tag #9",
"a<<<b>>>c",
"a&lt;&lt;$<b>$&gt;&gt;c",
},
{
"not a tag #8",
"not a tag #10",
"if x<0 and y < 0 then x*y>0",
"if x&lt;0 and y &lt; 0 then x*y&gt;0",
},
@@ -345,7 +352,7 @@ var tokenTests = []tokenTest{
func TestTokenizer(t *testing.T) {
loop:
for _, tt := range tokenTests {
z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
z := NewTokenizer(strings.NewReader(tt.html))
z.ReturnComments = true
if tt.golden != "" {
for i, s := range strings.Split(tt.golden, "$") {