Commit 37afff29 authored by Nigel Tao

html: parse malformed tags missing a '>', such as `<p id=0</p>`.

The additional token_test.go cases match html5lib behavior.

Fixes #2124.

R=gri
CC=golang-dev
https://golang.org/cl/4844055
parent 1ac7a697
...@@ -276,13 +276,12 @@ func (z *Tokenizer) nextTag() { ...@@ -276,13 +276,12 @@ func (z *Tokenizer) nextTag() {
if z.err != nil { if z.err != nil {
return return
} }
var tt TokenType
switch { switch {
case c == '/': case c == '/':
tt = EndTagToken z.tt = EndTagToken
// Lower-cased characters are more common in tag names, so we check for them first. // Lower-cased characters are more common in tag names, so we check for them first.
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
tt = StartTagToken z.tt = StartTagToken
case c == '!': case c == '!':
z.nextMarkupDeclaration() z.nextMarkupDeclaration()
return return
...@@ -305,8 +304,7 @@ func (z *Tokenizer) nextTag() { ...@@ -305,8 +304,7 @@ func (z *Tokenizer) nextTag() {
return return
} }
case '>': case '>':
z.tt = tt if z.buf[z.p1-2] == '/' && z.tt == StartTagToken {
if z.buf[z.p1-2] == '/' && tt == StartTagToken {
z.tt = SelfClosingTagToken z.tt = SelfClosingTagToken
} }
return return
...@@ -379,37 +377,53 @@ func (z *Tokenizer) trim(i int) int { ...@@ -379,37 +377,53 @@ func (z *Tokenizer) trim(i int) int {
return k return k
} }
// word finds the largest alphabetic [0-9A-Za-z]* word at the start // tagName finds the tag name at the start of z.buf[i:] and returns that name
// of z.buf[i:] and returns that word (optionally lower-cased), as // lower-cased, as well as the trimmed cursor location afterwards.
// well as the trimmed cursor location after that word. func (z *Tokenizer) tagName(i int) ([]byte, int) {
func (z *Tokenizer) word(i int, lower bool) ([]byte, int) {
i0 := i i0 := i
loop: loop:
for ; i < z.p1; i++ { for ; i < z.p1; i++ {
c := z.buf[i] c := z.buf[i]
switch { switch c {
case '0' <= c && c <= '9': case ' ', '\n', '\t', '\f', '/', '>':
// No-op.
case 'A' <= c && c <= 'Z':
if lower {
z.buf[i] = c + 'a' - 'A'
}
case 'a' <= c && c <= 'z':
// No-op.
default:
break loop break loop
} }
if 'A' <= c && c <= 'Z' {
z.buf[i] = c + 'a' - 'A'
}
}
return z.buf[i0:i], z.trim(i)
}
// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:]
// and returns that value, as well as the trimmed cursor location afterwards.
func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) {
i0 := i
loop:
for ; i < z.p1; i++ {
switch z.buf[i] {
case ' ', '\n', '\t', '\f', '>':
break loop
case '&':
// TODO: unescape the entity.
}
} }
return z.buf[i0:i], z.trim(i) return z.buf[i0:i], z.trim(i)
} }
// attrName finds the largest attribute name at the start // attrName finds the largest attribute name at the start
// of z.buf[i:] and returns it lower-cased, as well // of z.buf[i:] and returns it lower-cased, as well
// as the trimmed cursor location after that word. // as the trimmed cursor location after that name.
// //
// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name // http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
// TODO: unicode characters // TODO: unicode characters
func (z *Tokenizer) attrName(i int) ([]byte, int) { func (z *Tokenizer) attrName(i int) ([]byte, int) {
for z.buf[i] == '/' {
i++
if z.buf[i] == '>' {
return nil, z.trim(i)
}
}
i0 := i i0 := i
loop: loop:
for ; i < z.p1; i++ { for ; i < z.p1; i++ {
...@@ -469,7 +483,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { ...@@ -469,7 +483,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
if z.buf[i] == '/' { if z.buf[i] == '/' {
i++ i++
} }
name, z.p0 = z.word(i, true) name, z.p0 = z.tagName(i)
hasAttr = z.p0 != z.p1 hasAttr = z.p0 != z.p1
return return
} }
...@@ -496,7 +510,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { ...@@ -496,7 +510,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
} }
closeQuote := z.buf[i] closeQuote := z.buf[i]
if closeQuote != '\'' && closeQuote != '"' { if closeQuote != '\'' && closeQuote != '"' {
val, z.p0 = z.word(i, false) val, z.p0 = z.unquotedAttrVal(i)
moreAttr = z.p0 != z.p1 moreAttr = z.p0 != z.p1
return return
} }
......
...@@ -41,6 +41,22 @@ var tokenTests = []tokenTest{ ...@@ -41,6 +41,22 @@ var tokenTests = []tokenTest{
"<a>b<c/>d</e>", "<a>b<c/>d</e>",
"<a>$b$<c/>$d$</e>", "<a>$b$<c/>$d$</e>",
}, },
// Some malformed tags that are missing a '>'.
{
"malformed tag #0",
`<p</p>`,
`<p< p="">`,
},
{
"malformed tag #1",
`<p id=0</p>`,
`<p id="0&lt;/p">`,
},
{
"malformed tag #2",
`<p id="0</p>`,
`<p id="0&lt;/p&gt;">`,
},
// Comments. // Comments.
{ {
"comment0", "comment0",
...@@ -117,7 +133,6 @@ var tokenTests = []tokenTest{ ...@@ -117,7 +133,6 @@ var tokenTests = []tokenTest{
"&frac12;", "&frac12;",
"½", "½",
}, },
// Attribute tests: // Attribute tests:
// http://dev.w3.org/html5/spec/Overview.html#attributes-0 // http://dev.w3.org/html5/spec/Overview.html#attributes-0
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment