html: tokenize "a < b" as one whole text token.

R=andybalholm CC=golang-dev https://golang.org/cl/5284042

html: tokenize "a < b" as one whole text token.
R=andybalholm CC=golang-dev https://golang.org/cl/5284042
1887907f · Nigel Tao · 1135fc39 · 1887907f · 1887907f
Commit 1887907f authored Oct 16, 2011 by Nigel Tao
Hide whitespace changes
Inline Side-by-side

Showing with 128 additions and 23 deletions

src/pkg/html/token.go src/pkg/html/token.go +35 -12

src/pkg/html/token_test.go src/pkg/html/token_test.go +93 -11

No files found.
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -379,15 +379,16 @@ func (z *Tokenizer) nextTag() {
 		z.nextBogusComment()
 		return
 	default:
-		z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
+		z.nextText()
 		return
 	}
 	// Read the tag name and attribute key/value pairs.
 	z.readTagName()
+	if z.skipWhiteSpace(); z.err != nil {
+		z.tt = ErrorToken
+		return
+	}
 	for {
-		if z.skipWhiteSpace(); z.err != nil {
-			break
-		}
 		c := z.readByte()
 		if z.err != nil || c == '>' {
 			break
@@ -399,6 +400,9 @@ func (z *Tokenizer) nextTag() {
 		if z.pendingAttr[0].start != z.pendingAttr[0].end {
 			z.attr = append(z.attr, z.pendingAttr)
 		}
+		if z.skipWhiteSpace(); z.err != nil {
+			break
+		}
 	}
 	// Check for a self-closing token.
 	if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
@@ -510,21 +514,40 @@ func (z *Tokenizer) readTagAttrVal() {
 	}
 }

-// nextText reads all text up until an '<'.
-// Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end.
+// nextText reads all text up until a start tag "<a", end tag "</a", comment
+// "<!" or XML processing instruction "<?".
+// Pre-condition: z.tt == TextToken && z.err == nil &&
+//   z.raw.start + 1 <= z.raw.end.
 func (z *Tokenizer) nextText() {
 	for {
 		c := z.readByte()
 		if z.err != nil {
-			z.data = z.raw
-			return
+			break
 		}
-		if c == '<' {
-			z.raw.end--
-			z.data = z.raw
-			return
+		if c != '<' {
+			continue
+		}
+		c = z.readByte()
+		if z.err != nil {
+			break
+		}
+		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
+			z.raw.end -= 2
+			break
+		}
+		if c != '/' {
+			continue
+		}
+		c = z.readByte()
+		if z.err != nil {
+			break
+		}
+		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+			z.raw.end -= 3
+			break
 		}
 	}
+	z.data = z.raw
 }

 // Next scans the next token and returns its type.

--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -21,6 +21,11 @@ type tokenTest struct {
 }

 var tokenTests = []tokenTest{
+	{
+		"empty",
+		"",
+		"",
+	},
 	// A single text node. The tokenizer should not break text nodes on whitespace,
 	// nor should it normalize whitespace within a text node.
 	{
@@ -41,6 +46,81 @@ var tokenTests = []tokenTest{
 		"<a>b<c/>d</e>",
 		"<a>$b$<c/>$d$</e>",
 	},
+	// Angle brackets that aren't a tag.
+	{
+		"not a tag #0",
+		"<",
+		"&lt;",
+	},
+	{
+		"not a tag #1",
+		"</",
+		"&lt;/",
+	},
+	/*
+		// TODO: re-enable these tests when we tokenize them correctly.
+		{
+			"not a tag #2",
+			"</>",
+			"",
+		},
+		{
+			"not a tag #3",
+			"a</>b",
+			"a$b",
+		},
+	*/
+	{
+		"not a tag #4",
+		"</ >",
+		"<!-- -->",
+	},
+	{
+		"not a tag #5",
+		"a < b",
+		"a &lt; b",
+	},
+	{
+		"not a tag #6",
+		"<.>",
+		"&lt;.&gt;",
+	},
+	{
+		"not a tag #7",
+		"a<<<b>>>c",
+		"a&lt;&lt;$<b>$&gt;&gt;c",
+	},
+	{
+		"not a tag #8",
+		"if x<0 and y < 0 then x*y>0",
+		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
+	},
+	// EOF in a tag name.
+	{
+		"tag name eof #0",
+		"<a",
+		"",
+	},
+	{
+		"tag name eof #1",
+		"<a ",
+		"",
+	},
+	{
+		"tag name eof #2",
+		"a<b",
+		"a",
+	},
+	{
+		"tag name eof #3",
+		"<a><b",
+		"<a>",
+	},
+	{
+		"tag name eof #4",
+		`<a x`,
+		`<a x="">`,
+	},
 	// Some malformed tags that are missing a '>'.
 	{
 		"malformed tag #0",
@@ -257,8 +337,8 @@ var tokenTests = []tokenTest{
 	},
 	{
 		"Attributes with a solitary single quote",
-		"<p id=can't><p id=won't>",
-		"<p id=\"can&apos;t\">$<p id=\"won&apos;t\">",
+		`<p id=can't><p id=won't>`,
+		`<p id="can&apos;t">$<p id="won&apos;t">`,
 	},
 }

@@ -267,15 +347,17 @@ loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
 		z.ReturnComments = true
-		for i, s := range strings.Split(tt.golden, "$") {
-			if z.Next() == ErrorToken {
-				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
-				continue loop
-			}
-			actual := z.Token().String()
-			if s != actual {
-				t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
-				continue loop
+		if tt.golden != "" {
+			for i, s := range strings.Split(tt.golden, "$") {
+				if z.Next() == ErrorToken {
+					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
+					continue loop
+				}
+				actual := z.Token().String()
+				if s != actual {
+					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+					continue loop
+				}
 			}
 		}
 		z.Next()