Commit 2e67dd86 authored by Patrick Smith's avatar Patrick Smith Committed by Russ Cox

encoding/xml: expand allowed entity names

Previously, multi-byte characters were not allowed. Also certain single-byte
characters, such as '-', were disallowed.
Fixes #3813.

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/6641052
parent 5d05c780
...@@ -181,7 +181,6 @@ type Decoder struct { ...@@ -181,7 +181,6 @@ type Decoder struct {
ns map[string]string ns map[string]string
err error err error
line int line int
tmp [32]byte
} }
// NewDecoder creates a new XML parser reading from r. // NewDecoder creates a new XML parser reading from r.
...@@ -877,92 +876,92 @@ Input: ...@@ -877,92 +876,92 @@ Input:
// XML in all its glory allows a document to define and use // XML in all its glory allows a document to define and use
// its own character names with <!ENTITY ...> directives. // its own character names with <!ENTITY ...> directives.
// Parsers are required to recognize lt, gt, amp, apos, and quot // Parsers are required to recognize lt, gt, amp, apos, and quot
// even if they have not been declared. That's all we allow. // even if they have not been declared.
var i int before := d.buf.Len()
var semicolon bool d.buf.WriteByte('&')
var valid bool var ok bool
for i = 0; i < len(d.tmp); i++ { var text string
var ok bool var haveText bool
d.tmp[i], ok = d.getc() if b, ok = d.mustgetc(); !ok {
if !ok { return nil
if d.err == io.EOF { }
d.err = d.syntaxError("unexpected EOF") if b == '#' {
} d.buf.WriteByte(b)
if b, ok = d.mustgetc(); !ok {
return nil return nil
} }
c := d.tmp[i] base := 10
if c == ';' { if b == 'x' {
semicolon = true base = 16
valid = i > 0 d.buf.WriteByte(b)
break if b, ok = d.mustgetc(); !ok {
} return nil
if 'a' <= c && c <= 'z' ||
'A' <= c && c <= 'Z' ||
'0' <= c && c <= '9' ||
c == '_' || c == '#' {
continue
}
d.ungetc(c)
break
}
s := string(d.tmp[0:i])
if !valid {
if !d.Strict {
b0, b1 = 0, 0
d.buf.WriteByte('&')
d.buf.Write(d.tmp[0:i])
if semicolon {
d.buf.WriteByte(';')
} }
continue Input
} }
semi := ";" start := d.buf.Len()
if !semicolon { for '0' <= b && b <= '9' ||
semi = " (no semicolon)" base == 16 && 'a' <= b && b <= 'f' ||
base == 16 && 'A' <= b && b <= 'F' {
d.buf.WriteByte(b)
if b, ok = d.mustgetc(); !ok {
return nil
}
} }
if i < len(d.tmp) { if b != ';' {
d.err = d.syntaxError("invalid character entity &" + s + semi) d.ungetc(b)
} else { } else {
d.err = d.syntaxError("invalid character entity &" + s + "... too long") s := string(d.buf.Bytes()[start:])
} d.buf.WriteByte(';')
return nil n, err := strconv.ParseUint(s, base, 64)
} if err == nil && n <= unicode.MaxRune {
var haveText bool text = string(n)
var text string haveText = true
if i >= 2 && s[0] == '#' { }
var n uint64
var err error
if i >= 3 && s[1] == 'x' {
n, err = strconv.ParseUint(s[2:], 16, 64)
} else {
n, err = strconv.ParseUint(s[1:], 10, 64)
}
if err == nil && n <= unicode.MaxRune {
text = string(n)
haveText = true
} }
} else { } else {
if r, ok := entity[s]; ok { d.ungetc(b)
text = string(r) if !d.readName() {
haveText = true if d.err != nil {
} else if d.Entity != nil { return nil
text, haveText = d.Entity[s] }
ok = false
} }
} if b, ok = d.mustgetc(); !ok {
if !haveText { return nil
if !d.Strict { }
b0, b1 = 0, 0 if b != ';' {
d.buf.WriteByte('&') d.ungetc(b)
d.buf.Write(d.tmp[0:i]) } else {
name := d.buf.Bytes()[before+1:]
d.buf.WriteByte(';') d.buf.WriteByte(';')
continue Input if isName(name) {
s := string(name)
if r, ok := entity[s]; ok {
text = string(r)
haveText = true
} else if d.Entity != nil {
text, haveText = d.Entity[s]
}
}
} }
d.err = d.syntaxError("invalid character entity &" + s + ";")
return nil
} }
d.buf.Write([]byte(text))
b0, b1 = 0, 0 if haveText {
continue Input d.buf.Truncate(before)
d.buf.Write([]byte(text))
b0, b1 = 0, 0
continue Input
}
if !d.Strict {
b0, b1 = 0, 0
continue Input
}
ent := string(d.buf.Bytes()[before])
if ent[len(ent)-1] != ';' {
ent += " (no semicolon)"
}
d.err = d.syntaxError("invalid character entity " + ent)
return nil
} }
// We must rewrite unescaped \r and \r\n into \n. // We must rewrite unescaped \r and \r\n into \n.
...@@ -1030,18 +1029,34 @@ func (d *Decoder) nsname() (name Name, ok bool) { ...@@ -1030,18 +1029,34 @@ func (d *Decoder) nsname() (name Name, ok bool) {
// Do not set d.err if the name is missing (unless unexpected EOF is received): // Do not set d.err if the name is missing (unless unexpected EOF is received):
// let the caller provide better context. // let the caller provide better context.
func (d *Decoder) name() (s string, ok bool) { func (d *Decoder) name() (s string, ok bool) {
d.buf.Reset()
if !d.readName() {
return "", false
}
// Now we check the characters.
s = d.buf.String()
if !isName([]byte(s)) {
d.err = d.syntaxError("invalid XML name: " + s)
return "", false
}
return s, true
}
// Read a name and append its bytes to d.buf.
// The name is delimited by any single-byte character not valid in names.
// All multi-byte characters are accepted; the caller must check their validity.
func (d *Decoder) readName() (ok bool) {
var b byte var b byte
if b, ok = d.mustgetc(); !ok { if b, ok = d.mustgetc(); !ok {
return return
} }
// As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]*
if b < utf8.RuneSelf && !isNameByte(b) { if b < utf8.RuneSelf && !isNameByte(b) {
d.ungetc(b) d.ungetc(b)
return "", false return false
} }
d.buf.Reset()
d.buf.WriteByte(b) d.buf.WriteByte(b)
for { for {
if b, ok = d.mustgetc(); !ok { if b, ok = d.mustgetc(); !ok {
return return
...@@ -1052,16 +1067,7 @@ func (d *Decoder) name() (s string, ok bool) { ...@@ -1052,16 +1067,7 @@ func (d *Decoder) name() (s string, ok bool) {
} }
d.buf.WriteByte(b) d.buf.WriteByte(b)
} }
return true
// Then we check the characters.
s = d.buf.String()
for i, c := range s {
if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) {
d.err = d.syntaxError("invalid XML name: " + s)
return "", false
}
}
return s, true
} }
func isNameByte(c byte) bool { func isNameByte(c byte) bool {
...@@ -1071,6 +1077,30 @@ func isNameByte(c byte) bool { ...@@ -1071,6 +1077,30 @@ func isNameByte(c byte) bool {
c == '_' || c == ':' || c == '.' || c == '-' c == '_' || c == ':' || c == '.' || c == '-'
} }
func isName(s []byte) bool {
if len(s) == 0 {
return false
}
c, n := utf8.DecodeRune(s)
if c == utf8.RuneError && n == 1 {
return false
}
if !unicode.Is(first, c) {
return false
}
for n < len(s) {
s = s[n:]
c, n = utf8.DecodeRune(s)
if c == utf8.RuneError && n == 1 {
return false
}
if !unicode.Is(first, c) && !unicode.Is(second, c) {
return false
}
}
return true
}
// These tables were generated by cut and paste from Appendix B of // These tables were generated by cut and paste from Appendix B of
// the XML spec at http://www.xml.com/axml/testaxml.htm // the XML spec at http://www.xml.com/axml/testaxml.htm
// and then reformatting. First corresponds to (Letter | '_' | ':') // and then reformatting. First corresponds to (Letter | '_' | ':')
......
...@@ -19,6 +19,7 @@ const testInput = ` ...@@ -19,6 +19,7 @@ const testInput = `
<body xmlns:foo="ns1" xmlns="ns2" xmlns:tag="ns3" ` + <body xmlns:foo="ns1" xmlns="ns2" xmlns:tag="ns3" ` +
"\r\n\t" + ` > "\r\n\t" + ` >
<hello lang="en">World &lt;&gt;&apos;&quot; &#x767d;&#40300;翔</hello> <hello lang="en">World &lt;&gt;&apos;&quot; &#x767d;&#40300;翔</hello>
<query>&何; &is-it;</query>
<goodbye /> <goodbye />
<outer foo:attr="value" xmlns:tag="ns4"> <outer foo:attr="value" xmlns:tag="ns4">
<inner/> <inner/>
...@@ -28,6 +29,8 @@ const testInput = ` ...@@ -28,6 +29,8 @@ const testInput = `
</tag:name> </tag:name>
</body><!-- missing final newline -->` </body><!-- missing final newline -->`
var testEntity = map[string]string{"何": "What", "is-it": "is it?"}
var rawTokens = []Token{ var rawTokens = []Token{
CharData("\n"), CharData("\n"),
ProcInst{"xml", []byte(`version="1.0" encoding="UTF-8"`)}, ProcInst{"xml", []byte(`version="1.0" encoding="UTF-8"`)},
...@@ -41,6 +44,10 @@ var rawTokens = []Token{ ...@@ -41,6 +44,10 @@ var rawTokens = []Token{
CharData("World <>'\" 白鵬翔"), CharData("World <>'\" 白鵬翔"),
EndElement{Name{"", "hello"}}, EndElement{Name{"", "hello"}},
CharData("\n "), CharData("\n "),
StartElement{Name{"", "query"}, []Attr{}},
CharData("What is it?"),
EndElement{Name{"", "query"}},
CharData("\n "),
StartElement{Name{"", "goodbye"}, []Attr{}}, StartElement{Name{"", "goodbye"}, []Attr{}},
EndElement{Name{"", "goodbye"}}, EndElement{Name{"", "goodbye"}},
CharData("\n "), CharData("\n "),
...@@ -74,6 +81,10 @@ var cookedTokens = []Token{ ...@@ -74,6 +81,10 @@ var cookedTokens = []Token{
CharData("World <>'\" 白鵬翔"), CharData("World <>'\" 白鵬翔"),
EndElement{Name{"ns2", "hello"}}, EndElement{Name{"ns2", "hello"}},
CharData("\n "), CharData("\n "),
StartElement{Name{"ns2", "query"}, []Attr{}},
CharData("What is it?"),
EndElement{Name{"ns2", "query"}},
CharData("\n "),
StartElement{Name{"ns2", "goodbye"}, []Attr{}}, StartElement{Name{"ns2", "goodbye"}, []Attr{}},
EndElement{Name{"ns2", "goodbye"}}, EndElement{Name{"ns2", "goodbye"}},
CharData("\n "), CharData("\n "),
...@@ -156,6 +167,7 @@ var xmlInput = []string{ ...@@ -156,6 +167,7 @@ var xmlInput = []string{
func TestRawToken(t *testing.T) { func TestRawToken(t *testing.T) {
d := NewDecoder(strings.NewReader(testInput)) d := NewDecoder(strings.NewReader(testInput))
d.Entity = testEntity
testRawToken(t, d, rawTokens) testRawToken(t, d, rawTokens)
} }
...@@ -164,8 +176,14 @@ const nonStrictInput = ` ...@@ -164,8 +176,14 @@ const nonStrictInput = `
<tag>&unknown;entity</tag> <tag>&unknown;entity</tag>
<tag>&#123</tag> <tag>&#123</tag>
<tag>&#zzz;</tag> <tag>&#zzz;</tag>
<tag>&なまえ3;</tag>
<tag>&lt-gt;</tag>
<tag>&;</tag>
<tag>&0a;</tag>
` `
var nonStringEntity = map[string]string{"": "oops!", "0a": "oops!"}
var nonStrictTokens = []Token{ var nonStrictTokens = []Token{
CharData("\n"), CharData("\n"),
StartElement{Name{"", "tag"}, []Attr{}}, StartElement{Name{"", "tag"}, []Attr{}},
...@@ -184,6 +202,22 @@ var nonStrictTokens = []Token{ ...@@ -184,6 +202,22 @@ var nonStrictTokens = []Token{
CharData("&#zzz;"), CharData("&#zzz;"),
EndElement{Name{"", "tag"}}, EndElement{Name{"", "tag"}},
CharData("\n"), CharData("\n"),
StartElement{Name{"", "tag"}, []Attr{}},
CharData("&なまえ3;"),
EndElement{Name{"", "tag"}},
CharData("\n"),
StartElement{Name{"", "tag"}, []Attr{}},
CharData("&lt-gt;"),
EndElement{Name{"", "tag"}},
CharData("\n"),
StartElement{Name{"", "tag"}, []Attr{}},
CharData("&;"),
EndElement{Name{"", "tag"}},
CharData("\n"),
StartElement{Name{"", "tag"}, []Attr{}},
CharData("&0a;"),
EndElement{Name{"", "tag"}},
CharData("\n"),
} }
func TestNonStrictRawToken(t *testing.T) { func TestNonStrictRawToken(t *testing.T) {
...@@ -317,6 +351,7 @@ func TestNestedDirectives(t *testing.T) { ...@@ -317,6 +351,7 @@ func TestNestedDirectives(t *testing.T) {
func TestToken(t *testing.T) { func TestToken(t *testing.T) {
d := NewDecoder(strings.NewReader(testInput)) d := NewDecoder(strings.NewReader(testInput))
d.Entity = testEntity
for i, want := range cookedTokens { for i, want := range cookedTokens {
have, err := d.Token() have, err := d.Token()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment