Commit 816c972f authored by Andrew Balholm, committed by Nigel Tao

html: handle character entities without semicolons

Fix the TODO: unescape("&notit;") should be "¬it;"

Also accept digits in entity names.

R=nigeltao
CC=golang-dev, rsc
https://golang.org/cl/4781042
parent 78c89d21
...@@ -4,6 +4,9 @@ ...@@ -4,6 +4,9 @@
package html package html
// All entities that do not end with ';' are 6 or fewer bytes long.
const longestEntityWithoutSemicolon = 6
// entity is a map from HTML entity names to their values. The semicolon matters: // entity is a map from HTML entity names to their values. The semicolon matters:
// http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html // http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html
// lists both "amp" and "amp;" as two separate entries. // lists both "amp" and "amp;" as two separate entries.
......
...@@ -17,6 +17,9 @@ func TestEntityLength(t *testing.T) { ...@@ -17,6 +17,9 @@ func TestEntityLength(t *testing.T) {
if 1+len(k) < utf8.RuneLen(v) { if 1+len(k) < utf8.RuneLen(v) {
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v)) t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
} }
if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
}
} }
for k, v := range entity2 { for k, v := range entity2 {
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) { if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
......
...@@ -53,7 +53,8 @@ var replacementTable = [...]int{ ...@@ -53,7 +53,8 @@ var replacementTable = [...]int{
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors. // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src. // Precondition: b[src] == '&' && dst <= src.
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { // attribute should be true if parsing an attribute value.
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
// i starts at 1 because we already know that s[0] == '&'. // i starts at 1 because we already know that s[0] == '&'.
...@@ -121,12 +122,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { ...@@ -121,12 +122,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
// Consume the maximum number of characters possible, with the // Consume the maximum number of characters possible, with the
// consumed characters matching one of the named references. // consumed characters matching one of the named references.
// TODO(nigeltao): unescape("&notit;") should be "¬it;"
for i < len(s) { for i < len(s) {
c := s[i] c := s[i]
i++ i++
// Lower-cased characters are more common in entities, so we check for them first. // Lower-cased characters are more common in entities, so we check for them first.
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
continue continue
} }
if c != ';' { if c != ';' {
...@@ -136,11 +136,25 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { ...@@ -136,11 +136,25 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
} }
entityName := string(s[1:i]) entityName := string(s[1:i])
if x := entity[entityName]; x != 0 { if entityName == "" {
// No-op.
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
// No-op.
} else if x := entity[entityName]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i return dst + utf8.EncodeRune(b[dst:], x), src + i
} else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity. } else if x := entity2[entityName]; x[0] != 0 {
dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
} else if !attribute {
maxLen := len(entityName) - 1
if maxLen > longestEntityWithoutSemicolon {
maxLen = longestEntityWithoutSemicolon
}
for j := maxLen; j > 1; j-- {
if x := entity[entityName[:j]]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
}
}
} }
dst1, src1 = dst+i, src+i dst1, src1 = dst+i, src+i
...@@ -152,11 +166,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { ...@@ -152,11 +166,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
func unescape(b []byte) []byte { func unescape(b []byte) []byte {
for i, c := range b { for i, c := range b {
if c == '&' { if c == '&' {
dst, src := unescapeEntity(b, i, i) dst, src := unescapeEntity(b, i, i, false)
for src < len(b) { for src < len(b) {
c := b[src] c := b[src]
if c == '&' { if c == '&' {
dst, src = unescapeEntity(b, dst, src) dst, src = unescapeEntity(b, dst, src, false)
} else { } else {
b[dst] = c b[dst] = c
dst, src = dst+1, src+1 dst, src = dst+1, src+1
......
...@@ -459,7 +459,7 @@ loop: ...@@ -459,7 +459,7 @@ loop:
src++ src++
break loop break loop
case '&': case '&':
dst, src = unescapeEntity(z.buf, dst, src) dst, src = unescapeEntity(z.buf, dst, src, true)
case '\\': case '\\':
if src == z.p1 { if src == z.p1 {
z.buf[dst] = '\\' z.buf[dst] = '\\'
......
...@@ -107,6 +107,16 @@ var tokenTests = []tokenTest{ ...@@ -107,6 +107,16 @@ var tokenTests = []tokenTest{
`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`, `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`, `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
}, },
{
"entity without semicolon",
`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
},
{
"entity with digits",
"&frac12;",
"½",
},
// Attribute tests: // Attribute tests:
// http://dev.w3.org/html5/spec/Overview.html#attributes-0 // http://dev.w3.org/html5/spec/Overview.html#attributes-0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment