mime: RFC 2231 continuation / non-ASCII support

Fixes #1119.

R=rsc, r
CC=golang-dev
https://golang.org/cl/4437052

mime: RFC 2231 continuation / non-ASCII support
Fixes #1119.

R=rsc, r
CC=golang-dev
https://golang.org/cl/4437052
98176b77 · Brad Fitzpatrick · 23fc9c84 · 98176b77 · 98176b77
Commit 98176b77 authored Apr 18, 2011 by Brad Fitzpatrick
Hide whitespace changes
Inline Side-by-side

Showing with 203 additions and 9 deletions

src/pkg/mime/mediatype.go src/pkg/mime/mediatype.go +146 -7

src/pkg/mime/mediatype_test.go src/pkg/mime/mediatype_test.go +57 -2

No files found.
--- a/src/pkg/mime/mediatype.go
+++ b/src/pkg/mime/mediatype.go
@@ -6,6 +6,8 @@ package mime

 import (
 	"bytes"
+	"fmt"
+	"os"
 	"strings"
 	"unicode"
 )
@@ -46,11 +48,16 @@ func ParseMediaType(v string) (mediatype string, params map[string]string) {

 	params = make(map[string]string)

+	// Map of base parameter name -> parameter name -> value
+	// for parameters containing a '*' character.
+	// Lazily initialized.
+	var continuation map[string]map[string]string
+
 	v = v[i:]
 	for len(v) > 0 {
 		v = strings.TrimLeftFunc(v, unicode.IsSpace)
 		if len(v) == 0 {
-			return
+			break
 		}
 		key, value, rest := consumeMediaParam(v)
 		if key == "" {
@@ -62,12 +69,83 @@ func ParseMediaType(v string) (mediatype string, params map[string]string) {
 			// Parse error.
 			return "", nil
 		}
-		params[key] = value
+
+		pmap := params
+		if idx := strings.Index(key, "*"); idx != -1 {
+			baseName := key[:idx]
+			if continuation == nil {
+				continuation = make(map[string]map[string]string)
+			}
+			var ok bool
+			if pmap, ok = continuation[baseName]; !ok {
+				continuation[baseName] = make(map[string]string)
+				pmap = continuation[baseName]
+			}
+		}
+		if _, exists := pmap[key]; exists {
+			// Duplicate parameter name is bogus.
+			return "", nil
+		}
+		pmap[key] = value
 		v = rest
 	}
+
+	// Stitch together any continuations or things with stars
+	// (i.e. RFC 2231 things with stars: "foo*0" or "foo*")
+	var buf bytes.Buffer
+	for key, pieceMap := range continuation {
+		singlePartKey := key + "*"
+		if v, ok := pieceMap[singlePartKey]; ok {
+			decv := decode2231Enc(v)
+			params[key] = decv
+			continue
+		}
+
+		buf.Reset()
+		valid := false
+		for n := 0; ; n++ {
+			simplePart := fmt.Sprintf("%s*%d", key, n)
+			if v, ok := pieceMap[simplePart]; ok {
+				valid = true
+				buf.WriteString(v)
+				continue
+			}
+			encodedPart := simplePart + "*"
+			if v, ok := pieceMap[encodedPart]; ok {
+				valid = true
+				if n == 0 {
+					buf.WriteString(decode2231Enc(v))
+				} else {
+					decv, _ := percentHexUnescape(v)
+					buf.WriteString(decv)
+				}
+			} else {
+				break
+			}
+		}
+		if valid {
+			params[key] = buf.String()
+		}
+	}
+
 	return
 }

+// decode2231Enc decodes an RFC 2231 extended value charset'language'text.
+// It returns "" if v is malformed or names a charset other than us-ascii/utf-8.
+func decode2231Enc(v string) string {
+	parts := strings.Split(v, "'", 3)
+	if len(parts) != 3 {
+		return ""
+	}
+	switch strings.ToLower(parts[0]) { // parts[1] (the language) is ignored for now.
+	case "us-ascii", "utf-8":
+		decoded, _ := percentHexUnescape(parts[2]) // error ignored; bad escapes decode to whatever it returns
+		return decoded
+	}
+	return "" // TODO: unsupported encoding
+}
+
 func isNotTokenChar(rune int) bool {
 	return !IsTokenChar(rune)
 }
@@ -107,17 +185,14 @@ func consumeValue(v string) (value, rest string) {
 	for idx, rune = range rest {
 		switch {
 		case nextIsLiteral:
-			if rune >= 0x80 {
-				return "", v
-			}
 			buffer.WriteRune(rune)
 			nextIsLiteral = false
 		case rune == leadQuote:
 			return buffer.String(), rest[idx+1:]
-		case IsQText(rune):
-			buffer.WriteRune(rune)
 		case rune == '\\':
 			nextIsLiteral = true
+		case rune != '\r' && rune != '\n':
+			buffer.WriteRune(rune)
 		default:
 			return "", v
 		}
@@ -137,6 +212,7 @@ func consumeMediaParam(v string) (param, value, rest string) {
 	if param == "" {
 		return "", "", v
 	}
+
 	rest = strings.TrimLeftFunc(rest, unicode.IsSpace)
 	if !strings.HasPrefix(rest, "=") {
 		return "", "", v
@@ -149,3 +225,66 @@ func consumeMediaParam(v string) (param, value, rest string) {
 	}
 	return param, value, rest
 }
+
+// percentHexUnescape replaces each %XX escape in s with the byte it encodes.
+// It returns an error if any '%' is not followed by two hexadecimal digits.
+func percentHexUnescape(s string) (string, os.Error) {
+	// Count %, check that they're well-formed.
+	percents := 0
+	for i := 0; i < len(s); {
+		if s[i] != '%' {
+			i++
+			continue
+		}
+		percents++
+		if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
+			s = s[i:]
+			if len(s) > 3 {
+				s = s[0:3]
+			}
+			// "%%" prints a literal '%'; a bare "%:" is an invalid verb that eats s.
+			return "", fmt.Errorf("Bogus characters after %%: %q", s)
+		}
+		i += 3
+	}
+	if percents == 0 {
+		return s, nil
+	}
+
+	// Each escape shrinks from three bytes to one.
+	t := make([]byte, len(s)-2*percents)
+	for i, j := 0, 0; i < len(s); j++ {
+		if s[i] != '%' {
+			t[j] = s[i]
+			i++
+			continue
+		}
+		t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
+		i += 3
+	}
+	return string(t), nil
+}
+
+// ishex reports whether c is an ASCII hexadecimal digit:
+// '0'-'9', 'a'-'f' or 'A'-'F'.
+func ishex(c byte) bool {
+	if '0' <= c && c <= '9' {
+		return true
+	}
+	if 'a' <= c && c <= 'f' {
+		return true
+	}
+	return 'A' <= c && c <= 'F'
+}
+
+// unhex returns the numeric value (0-15) of the ASCII hex digit c,
+// or 0 when c is not a hex digit.
+func unhex(c byte) byte {
+	if '0' <= c && c <= '9' {
+		return c - '0'
+	}
+	if lower := c | 0x20; 'a' <= lower && lower <= 'f' { // 0x20 folds A-F onto a-f
+		return lower - 'a' + 10
+	}
+	return 0
+}
--- a/src/pkg/mime/mediatype_test.go
+++ b/src/pkg/mime/mediatype_test.go
@@ -114,6 +114,28 @@ func TestParseMediaType(t *testing.T) {
 			"form-data",
 			m("key", "value", "blah", "value", "name", "foo")},

+		{`foo; key=val1; key=the-key-appears-again-which-is-bogus`,
+			"", m()},
+
+		// From RFC 2231:
+		{`application/x-stuff; title*=us-ascii'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A`,
+			"application/x-stuff",
+			m("title", "This is ***fun***")},
+
+		{`message/external-body; access-type=URL; ` +
+			`URL*0="ftp://";` +
+			`URL*1="cs.utk.edu/pub/moore/bulk-mailer/bulk-mailer.tar"`,
+			"message/external-body",
+			m("access-type", "URL",
+				"URL", "ftp://cs.utk.edu/pub/moore/bulk-mailer/bulk-mailer.tar")},
+
+		{`application/x-stuff; ` +
+			`title*0*=us-ascii'en'This%20is%20even%20more%20; ` +
+			`title*1*=%2A%2A%2Afun%2A%2A%2A%20; ` +
+			`title*2="isn't it!"`,
+			"application/x-stuff",
+			m("title", "This is even more ***fun*** isn't it!")},
+
 		// Tests from http://greenbytes.de/tech/tc2231/
 		// TODO(bradfitz): add the rest of the tests from that site.
 		{`attachment; filename="f\oo.html"`,
@@ -159,8 +181,41 @@ func TestParseMediaType(t *testing.T) {
 			"attachment",
 			m("creation-date", "Wed, 12 Feb 1997 16:29:51 -0500")},
 		{`foobar`, "foobar", m()},
-		// TODO(bradfitz): rest of them, including RFC2231 encoded UTF-8 and
-		// other charsets.
+		{`attachment; filename* =UTF-8''foo-%c3%a4.html`,
+			"attachment",
+			m("filename", "foo-ä.html")},
+		{`attachment; filename*=UTF-8''A-%2541.html`,
+			"attachment",
+			m("filename", "A-%41.html")},
+		{`attachment; filename*0="foo."; filename*1="html"`,
+			"attachment",
+			m("filename", "foo.html")},
+		{`attachment; filename*0*=UTF-8''foo-%c3%a4; filename*1=".html"`,
+			"attachment",
+			m("filename", "foo-ä.html")},
+		{`attachment; filename*0="foo"; filename*01="bar"`,
+			"attachment",
+			m("filename", "foo")},
+		{`attachment; filename*0="foo"; filename*2="bar"`,
+			"attachment",
+			m("filename", "foo")},
+		{`attachment; filename*1="foo"; filename*2="bar"`,
+			"attachment", m()},
+		{`attachment; filename*1="bar"; filename*0="foo"`,
+			"attachment",
+			m("filename", "foobar")},
+		{`attachment; filename="foo-ae.html"; filename*=UTF-8''foo-%c3%a4.html`,
+			"attachment",
+			m("filename", "foo-ä.html")},
+		{`attachment; filename*=UTF-8''foo-%c3%a4.html; filename="foo-ae.html"`,
+			"attachment",
+			m("filename", "foo-ä.html")},
+
+		// Browsers also just send UTF-8 directly without RFC 2231,
+		// at least when the source page is served with UTF-8.
+		{`form-data; firstname="Брэд"; lastname="Фицпатрик"`,
+			"form-data",
+			m("firstname", "Брэд", "lastname", "Фицпатрик")},
 	}
 	for _, test := range tests {
 		mt, params := ParseMediaType(test.in)