Commit b770c9e9 authored by Andrew Balholm's avatar Andrew Balholm Committed by Nigel Tao

html: improve parsing of comments and "bogus comments"

R=nigeltao
CC=golang-dev
https://golang.org/cl/5279044
parent 5079129d
...@@ -100,9 +100,9 @@ func (t Token) String() string { ...@@ -100,9 +100,9 @@ func (t Token) String() string {
case SelfClosingTagToken: case SelfClosingTagToken:
return "<" + t.tagString() + "/>" return "<" + t.tagString() + "/>"
case CommentToken: case CommentToken:
return "<!--" + EscapeString(t.Data) + "-->" return "<!--" + t.Data + "-->"
case DoctypeToken: case DoctypeToken:
return "<!DOCTYPE " + EscapeString(t.Data) + ">" return "<!DOCTYPE " + t.Data + ">"
} }
return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
} }
...@@ -227,30 +227,62 @@ func (z *Tokenizer) skipWhiteSpace() { ...@@ -227,30 +227,62 @@ func (z *Tokenizer) skipWhiteSpace() {
// nextComment reads the next token starting with "<!--". // nextComment reads the next token starting with "<!--".
// The opening "<!--" has already been consumed. // The opening "<!--" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && // Pre-condition: z.tt == CommentToken && z.err == nil &&
// z.raw.start + 4 <= z.raw.end. // z.raw.start + 4 <= z.raw.end.
func (z *Tokenizer) nextComment() { func (z *Tokenizer) nextComment() {
// <!--> is a valid comment. z.data.start = z.raw.end
defer func() {
if z.data.end < z.data.start {
// It's a comment with no data, like <!-->.
z.data.end = z.data.start
}
}()
for dashCount := 2; ; { for dashCount := 2; ; {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.data = z.raw z.data.end = z.raw.end
return return
} }
switch c { switch c {
case '-': case '-':
dashCount++ dashCount++
continue
case '>': case '>':
if dashCount >= 2 { if dashCount >= 2 {
z.tt = CommentToken z.data.end = z.raw.end - len("-->")
// TODO: adjust z.data to be only the "x" in "<!--x-->".
// Note that "<!>" is also a valid HTML5 comment.
z.data = z.raw
return return
} }
dashCount = 0 case '!':
default: if dashCount >= 2 {
dashCount = 0 c = z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
}
if c == '>' {
z.data.end = z.raw.end - len("--!>")
return
}
}
}
dashCount = 0
}
}
// nextBogusComment reads text until the next ">" and treats it as a comment.
// Pre-condition: z.err == nil && z.raw.end is before the first comment byte.
func (z *Tokenizer) nextBogusComment() {
z.tt = CommentToken
z.data.start = z.raw.end
for {
c := z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
}
if c == '>' {
z.data.end = z.raw.end - len(">")
return
} }
} }
} }
...@@ -258,13 +290,15 @@ func (z *Tokenizer) nextComment() { ...@@ -258,13 +290,15 @@ func (z *Tokenizer) nextComment() {
// nextMarkupDeclaration reads the next token starting with "<!". // nextMarkupDeclaration reads the next token starting with "<!".
// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text". // It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
// The opening "<!" has already been consumed. // The opening "<!" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && // Pre-condition: z.err == nil && z.raw.start + 2 <= z.raw.end.
// z.raw.start + 2 <= z.raw.end.
func (z *Tokenizer) nextMarkupDeclaration() { func (z *Tokenizer) nextMarkupDeclaration() {
z.tt = CommentToken
z.data.start = z.raw.end
var c [2]byte var c [2]byte
for i := 0; i < 2; i++ { for i := 0; i < 2; i++ {
c[i] = z.readByte() c[i] = z.readByte()
if z.err != nil { if z.err != nil {
z.data.end = z.raw.end
return return
} }
} }
...@@ -273,27 +307,35 @@ func (z *Tokenizer) nextMarkupDeclaration() { ...@@ -273,27 +307,35 @@ func (z *Tokenizer) nextMarkupDeclaration() {
return return
} }
z.raw.end -= 2 z.raw.end -= 2
const s = "DOCTYPE " const s = "DOCTYPE"
for i := 0; ; i++ { for i := 0; i < len(s); i++ {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.data = z.raw z.data.end = z.raw.end
return return
} }
// Capitalize c. if c != s[i] && c != s[i]+('a'-'A') {
if 'a' <= c && c <= 'z' { // Back up to read the fragment of "DOCTYPE" again.
c = 'A' + (c - 'a') z.raw.end = z.data.start
z.nextBogusComment()
return
} }
if i < len(s) && c != s[i] { }
z.nextText() z.tt = DoctypeToken
if z.skipWhiteSpace(); z.err != nil {
z.data.start = z.raw.end
z.data.end = z.raw.end
return
}
z.data.start = z.raw.end
for {
c := z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return return
} }
if c == '>' { if c == '>' {
if i >= len(s) { z.data.end = z.raw.end - len(">")
z.tt = DoctypeToken
z.data.start = z.raw.start + len("<!DOCTYPE ")
z.data.end = z.raw.end - len(">")
}
return return
} }
} }
...@@ -311,8 +353,18 @@ func (z *Tokenizer) nextTag() { ...@@ -311,8 +353,18 @@ func (z *Tokenizer) nextTag() {
return return
} }
switch { switch {
// TODO: check that the "</" is followed by something in A-Za-z.
case c == '/': case c == '/':
// Check that the "</" is followed by something in A-Za-z.
c = z.readByte()
if z.err != nil {
z.data = z.raw
return
}
z.raw.end--
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
z.nextBogusComment()
return
}
z.tt = EndTagToken z.tt = EndTagToken
z.data.start += len("</") z.data.start += len("</")
// Lower-cased characters are more common in tag names, so we check for them first. // Lower-cased characters are more common in tag names, so we check for them first.
...@@ -323,7 +375,8 @@ func (z *Tokenizer) nextTag() { ...@@ -323,7 +375,8 @@ func (z *Tokenizer) nextTag() {
z.nextMarkupDeclaration() z.nextMarkupDeclaration()
return return
case c == '?': case c == '?':
z.tt, z.err = ErrorToken, os.NewError("html: TODO: implement XML processing instructions") z.raw.end--
z.nextBogusComment()
return return
default: default:
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags") z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
......
...@@ -87,51 +87,88 @@ var tokenTests = []tokenTest{ ...@@ -87,51 +87,88 @@ var tokenTests = []tokenTest{
`<p id="0"</p>`, `<p id="0"</p>`,
`<p id="0" <="" p="">`, `<p id="0" <="" p="">`,
}, },
// DOCTYPE tests.
{
"Proper DOCTYPE",
"<!DOCTYPE html>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with no space",
"<!doctypehtml>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with two spaces",
"<!doctype html>",
"<!DOCTYPE html>",
},
{
"looks like DOCTYPE but isn't",
"<!DOCUMENT html>",
"<!--DOCUMENT html-->",
},
{
"DOCTYPE at EOF",
"<!DOCtype",
"<!DOCTYPE >",
},
// XML processing instructions.
{
"XML processing instruction",
"<?xml?>",
"<!--?xml?-->",
},
// Comments. // Comments.
{ {
"comment0", "comment0",
"abc<b><!-- skipme --></b>def", "abc<b><!-- skipme --></b>def",
"abc$<b>$</b>$def", "abc$<b>$<!-- skipme -->$</b>$def",
}, },
{ {
"comment1", "comment1",
"a<!-->z", "a<!-->z",
"a$z", "a$<!---->$z",
}, },
{ {
"comment2", "comment2",
"a<!--->z", "a<!--->z",
"a$z", "a$<!---->$z",
}, },
{ {
"comment3", "comment3",
"a<!--x>-->z", "a<!--x>-->z",
"a$z", "a$<!--x>-->$z",
}, },
{ {
"comment4", "comment4",
"a<!--x->-->z", "a<!--x->-->z",
"a$z", "a$<!--x->-->$z",
}, },
{ {
"comment5", "comment5",
"a<!>z", "a<!>z",
"a$&lt;!&gt;z", "a$<!---->$z",
}, },
{ {
"comment6", "comment6",
"a<!->z", "a<!->z",
"a$&lt;!-&gt;z", "a$<!----->$z",
}, },
{ {
"comment7", "comment7",
"a<!---<>z", "a<!---<>z",
"a$&lt;!---&lt;&gt;z", "a$<!---<>z-->",
}, },
{ {
"comment8", "comment8",
"a<!--z", "a<!--z",
"a$&lt;!--z", "a$<!--z-->",
},
{
"comment9",
"a<!--x--!>z",
"a$<!--x-->$z",
}, },
// An attribute with a backslash. // An attribute with a backslash.
{ {
...@@ -229,6 +266,7 @@ func TestTokenizer(t *testing.T) { ...@@ -229,6 +266,7 @@ func TestTokenizer(t *testing.T) {
loop: loop:
for _, tt := range tokenTests { for _, tt := range tokenTests {
z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
z.ReturnComments = true
for i, s := range strings.Split(tt.golden, "$") { for i, s := range strings.Split(tt.golden, "$") {
if z.Next() == ErrorToken { if z.Next() == ErrorToken {
t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment