Commit b91aea55 authored by Russ Cox

encoding/xml: add InputOffset method to Decoder

Among other things, this allows users to match the decoded
pieces with the original XML, which can be necessary for
implementing standards like XML signatures.

Fixes #8484.

LGTM=bradfitz
R=bradfitz
CC=golang-codereviews
https://golang.org/cl/122960043
parent fef54b22
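As context for the change (not part of the commit), here is a minimal usage sketch of the new InputOffset method; the sample document and variable names are illustrative only. After each call to Token, InputOffset reports the byte offset just past that token, which is also where the next token begins:

package main

import (
    "encoding/xml"
    "fmt"
    "strings"
)

func main() {
    // Illustrative input; any XML stream works the same way.
    const doc = `<people><person name="Ann"/><person name="Bo"/></people>`
    d := xml.NewDecoder(strings.NewReader(doc))
    for {
        tok, err := d.Token()
        if err != nil {
            break // io.EOF at end of input
        }
        // InputOffset gives the end of the token just returned,
        // which is also the start of the next token.
        fmt.Printf("%T ends at byte %d\n", tok, d.InputOffset())
    }
}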
@@ -29,6 +29,7 @@ import (
type SyntaxError struct {
    Msg  string
    Line int
+   Byte int64 // byte offset from start of stream
}

func (e *SyntaxError) Error() string {
@@ -196,6 +197,7 @@ type Decoder struct {
    ns             map[string]string
    err            error
    line           int
+   offset         int64
    unmarshalDepth int
}
@@ -859,9 +861,17 @@ func (d *Decoder) getc() (b byte, ok bool) {
    if b == '\n' {
        d.line++
    }
+   d.offset++
    return b, true
}

+// InputOffset returns the input stream byte offset of the current decoder position.
+// The offset gives the location of the end of the most recently returned token
+// and the beginning of the next token.
+func (d *Decoder) InputOffset() int64 {
+   return d.offset
+}
+
// Return saved offset.
// If we did ungetc (nextByte >= 0), have to back up one.
func (d *Decoder) savedOffset() int {
@@ -891,6 +901,7 @@ func (d *Decoder) ungetc(b byte) {
        d.line--
    }
    d.nextByte = int(b)
+   d.offset--
}

var entity = map[string]int{
@@ -170,7 +170,7 @@ var xmlInput = []string{
func TestRawToken(t *testing.T) {
    d := NewDecoder(strings.NewReader(testInput))
    d.Entity = testEntity
-   testRawToken(t, d, rawTokens)
+   testRawToken(t, d, testInput, rawTokens)
}

const nonStrictInput = `
@@ -225,7 +225,7 @@ var nonStrictTokens = []Token{
func TestNonStrictRawToken(t *testing.T) {
    d := NewDecoder(strings.NewReader(nonStrictInput))
    d.Strict = false
-   testRawToken(t, d, nonStrictTokens)
+   testRawToken(t, d, nonStrictInput, nonStrictTokens)
}

type downCaser struct {
@@ -254,7 +254,7 @@ func TestRawTokenAltEncoding(t *testing.T) {
        }
        return &downCaser{t, input.(io.ByteReader)}, nil
    }
-   testRawToken(t, d, rawTokensAltEncoding)
+   testRawToken(t, d, testInputAltEncoding, rawTokensAltEncoding)
}

func TestRawTokenAltEncodingNoConverter(t *testing.T) {
@@ -280,9 +280,12 @@ func TestRawTokenAltEncodingNoConverter(t *testing.T) {
    }
}

-func testRawToken(t *testing.T, d *Decoder, rawTokens []Token) {
+func testRawToken(t *testing.T, d *Decoder, raw string, rawTokens []Token) {
+   lastEnd := int64(0)
    for i, want := range rawTokens {
+       start := d.InputOffset()
        have, err := d.RawToken()
+       end := d.InputOffset()
        if err != nil {
            t.Fatalf("token %d: unexpected error: %s", i, err)
        }
@@ -300,6 +303,26 @@ func testRawToken(t *testing.T, d *Decoder, rawTokens []Token) {
            }
            t.Errorf("token %d = %s, want %s", i, shave, swant)
        }
+
+       // Check that InputOffset returned actual token.
+       switch {
+       case start < lastEnd:
+           t.Errorf("token %d: position [%d,%d) for %T is before previous token", i, start, end, have)
+       case start >= end:
+           // Special case: EndElement can be synthesized.
+           if start == end && end == lastEnd {
+               break
+           }
+           t.Errorf("token %d: position [%d,%d) for %T is empty", i, start, end, have)
+       case end > int64(len(raw)):
+           t.Errorf("token %d: position [%d,%d) for %T extends beyond input", i, start, end, have)
+       default:
+           text := raw[start:end]
+           if strings.ContainsAny(text, "<>") && (!strings.HasPrefix(text, "<") || !strings.HasSuffix(text, ">")) {
+               t.Errorf("token %d: misaligned raw token %#q for %T", i, text, have)
+           }
+       }
+       lastEnd = end
    }
}
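The test above brackets each RawToken call with InputOffset and checks that the reported [start,end) span lines up with the raw input. The same bracketing pattern is how callers can match decoded pieces with the original XML; a minimal sketch of the idea, with an illustrative input and names that are not from the commit:

package main

import (
    "encoding/xml"
    "fmt"
    "strings"
)

func main() {
    // Illustrative input.
    const input = `<a><b>hi</b></a>`
    d := xml.NewDecoder(strings.NewReader(input))
    for {
        start := d.InputOffset() // end of the previous token
        tok, err := d.RawToken()
        if err != nil {
            break // io.EOF at end of input
        }
        end := d.InputOffset() // end of the token just read
        // Slicing the original data by [start,end) recovers the exact
        // bytes of the token that RawToken just returned.
        fmt.Printf("%T: %q\n", tok, input[start:end])
    }
}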