Commit b770c9e9 authored by Andrew Balholm's avatar Andrew Balholm Committed by Nigel Tao

html: improve parsing of comments and "bogus comments"

R=nigeltao
CC=golang-dev
https://golang.org/cl/5279044
parent 5079129d
...@@ -100,9 +100,9 @@ func (t Token) String() string { ...@@ -100,9 +100,9 @@ func (t Token) String() string {
case SelfClosingTagToken: case SelfClosingTagToken:
return "<" + t.tagString() + "/>" return "<" + t.tagString() + "/>"
case CommentToken: case CommentToken:
return "<!--" + EscapeString(t.Data) + "-->" return "<!--" + t.Data + "-->"
case DoctypeToken: case DoctypeToken:
return "<!DOCTYPE " + EscapeString(t.Data) + ">" return "<!DOCTYPE " + t.Data + ">"
} }
return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
} }
...@@ -227,30 +227,62 @@ func (z *Tokenizer) skipWhiteSpace() { ...@@ -227,30 +227,62 @@ func (z *Tokenizer) skipWhiteSpace() {
// nextComment reads the next token starting with "<!--". // nextComment reads the next token starting with "<!--".
// The opening "<!--" has already been consumed. // The opening "<!--" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && // Pre-condition: z.tt == CommentToken && z.err == nil &&
// z.raw.start + 4 <= z.raw.end. // z.raw.start + 4 <= z.raw.end.
func (z *Tokenizer) nextComment() { func (z *Tokenizer) nextComment() {
// <!--> is a valid comment. z.data.start = z.raw.end
defer func() {
if z.data.end < z.data.start {
// It's a comment with no data, like <!-->.
z.data.end = z.data.start
}
}()
for dashCount := 2; ; { for dashCount := 2; ; {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.data = z.raw z.data.end = z.raw.end
return return
} }
switch c { switch c {
case '-': case '-':
dashCount++ dashCount++
continue
case '>': case '>':
if dashCount >= 2 { if dashCount >= 2 {
z.tt = CommentToken z.data.end = z.raw.end - len("-->")
// TODO: adjust z.data to be only the "x" in "<!--x-->".
// Note that "<!>" is also a valid HTML5 comment.
z.data = z.raw
return return
} }
dashCount = 0 case '!':
default: if dashCount >= 2 {
dashCount = 0 c = z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
}
if c == '>' {
z.data.end = z.raw.end - len("--!>")
return
}
}
}
dashCount = 0
}
}
// nextBogusComment reads text until the next ">" and treats it as a comment.
// Pre-condition: z.err == nil && z.raw.end is before the first comment byte.
func (z *Tokenizer) nextBogusComment() {
z.tt = CommentToken
z.data.start = z.raw.end
for {
c := z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
}
if c == '>' {
z.data.end = z.raw.end - len(">")
return
} }
} }
} }
...@@ -258,13 +290,15 @@ func (z *Tokenizer) nextComment() { ...@@ -258,13 +290,15 @@ func (z *Tokenizer) nextComment() {
// nextMarkupDeclaration reads the next token starting with "<!". // nextMarkupDeclaration reads the next token starting with "<!".
// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text". // It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
// The opening "<!" has already been consumed. // The opening "<!" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil && // Pre-condition: z.err == nil && z.raw.start + 2 <= z.raw.end.
// z.raw.start + 2 <= z.raw.end.
func (z *Tokenizer) nextMarkupDeclaration() { func (z *Tokenizer) nextMarkupDeclaration() {
z.tt = CommentToken
z.data.start = z.raw.end
var c [2]byte var c [2]byte
for i := 0; i < 2; i++ { for i := 0; i < 2; i++ {
c[i] = z.readByte() c[i] = z.readByte()
if z.err != nil { if z.err != nil {
z.data.end = z.raw.end
return return
} }
} }
...@@ -273,27 +307,35 @@ func (z *Tokenizer) nextMarkupDeclaration() { ...@@ -273,27 +307,35 @@ func (z *Tokenizer) nextMarkupDeclaration() {
return return
} }
z.raw.end -= 2 z.raw.end -= 2
const s = "DOCTYPE " const s = "DOCTYPE"
for i := 0; ; i++ { for i := 0; i < len(s); i++ {
c := z.readByte() c := z.readByte()
if z.err != nil { if z.err != nil {
z.data = z.raw z.data.end = z.raw.end
return return
} }
// Capitalize c. if c != s[i] && c != s[i]+('a'-'A') {
if 'a' <= c && c <= 'z' { // Back up to read the fragment of "DOCTYPE" again.
c = 'A' + (c - 'a') z.raw.end = z.data.start
z.nextBogusComment()
return
} }
if i < len(s) && c != s[i] { }
z.nextText() z.tt = DoctypeToken
if z.skipWhiteSpace(); z.err != nil {
z.data.start = z.raw.end
z.data.end = z.raw.end
return
}
z.data.start = z.raw.end
for {
c := z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return return
} }
if c == '>' { if c == '>' {
if i >= len(s) { z.data.end = z.raw.end - len(">")
z.tt = DoctypeToken
z.data.start = z.raw.start + len("<!DOCTYPE ")
z.data.end = z.raw.end - len(">")
}
return return
} }
} }
...@@ -311,8 +353,18 @@ func (z *Tokenizer) nextTag() { ...@@ -311,8 +353,18 @@ func (z *Tokenizer) nextTag() {
return return
} }
switch { switch {
// TODO: check that the "</" is followed by something in A-Za-z.
case c == '/': case c == '/':
// Check that the "</" is followed by something in A-Za-z.
c = z.readByte()
if z.err != nil {
z.data = z.raw
return
}
z.raw.end--
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
z.nextBogusComment()
return
}
z.tt = EndTagToken z.tt = EndTagToken
z.data.start += len("</") z.data.start += len("</")
// Lower-cased characters are more common in tag names, so we check for them first. // Lower-cased characters are more common in tag names, so we check for them first.
...@@ -323,7 +375,8 @@ func (z *Tokenizer) nextTag() { ...@@ -323,7 +375,8 @@ func (z *Tokenizer) nextTag() {
z.nextMarkupDeclaration() z.nextMarkupDeclaration()
return return
case c == '?': case c == '?':
z.tt, z.err = ErrorToken, os.NewError("html: TODO: implement XML processing instructions") z.raw.end--
z.nextBogusComment()
return return
default: default:
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags") z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
......
...@@ -87,51 +87,88 @@ var tokenTests = []tokenTest{ ...@@ -87,51 +87,88 @@ var tokenTests = []tokenTest{
`<p id="0"</p>`, `<p id="0"</p>`,
`<p id="0" <="" p="">`, `<p id="0" <="" p="">`,
}, },
// DOCTYPE tests.
{
"Proper DOCTYPE",
"<!DOCTYPE html>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with no space",
"<!doctypehtml>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with two spaces",
"<!doctype html>",
"<!DOCTYPE html>",
},
{
"looks like DOCTYPE but isn't",
"<!DOCUMENT html>",
"<!--DOCUMENT html-->",
},
{
"DOCTYPE at EOF",
"<!DOCtype",
"<!DOCTYPE >",
},
// XML processing instructions.
{
"XML processing instruction",
"<?xml?>",
"<!--?xml?-->",
},
// Comments. // Comments.
{ {
"comment0", "comment0",
"abc<b><!-- skipme --></b>def", "abc<b><!-- skipme --></b>def",
"abc$<b>$</b>$def", "abc$<b>$<!-- skipme -->$</b>$def",
}, },
{ {
"comment1", "comment1",
"a<!-->z", "a<!-->z",
"a$z", "a$<!---->$z",
}, },
{ {
"comment2", "comment2",
"a<!--->z", "a<!--->z",
"a$z", "a$<!---->$z",
}, },
{ {
"comment3", "comment3",
"a<!--x>-->z", "a<!--x>-->z",
"a$z", "a$<!--x>-->$z",
}, },
{ {
"comment4", "comment4",
"a<!--x->-->z", "a<!--x->-->z",
"a$z", "a$<!--x->-->$z",
}, },
{ {
"comment5", "comment5",
"a<!>z", "a<!>z",
"a$&lt;!&gt;z", "a$<!---->$z",
}, },
{ {
"comment6", "comment6",
"a<!->z", "a<!->z",
"a$&lt;!-&gt;z", "a$<!----->$z",
}, },
{ {
"comment7", "comment7",
"a<!---<>z", "a<!---<>z",
"a$&lt;!---&lt;&gt;z", "a$<!---<>z-->",
}, },
{ {
"comment8", "comment8",
"a<!--z", "a<!--z",
"a$&lt;!--z", "a$<!--z-->",
},
{
"comment9",
"a<!--x--!>z",
"a$<!--x-->$z",
}, },
// An attribute with a backslash. // An attribute with a backslash.
{ {
...@@ -229,6 +266,7 @@ func TestTokenizer(t *testing.T) { ...@@ -229,6 +266,7 @@ func TestTokenizer(t *testing.T) {
loop: loop:
for _, tt := range tokenTests { for _, tt := range tokenTests {
z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
z.ReturnComments = true
for i, s := range strings.Split(tt.golden, "$") { for i, s := range strings.Split(tt.golden, "$") {
if z.Next() == ErrorToken { if z.Next() == ErrorToken {
t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment