parse_test.go 9.89 KB
Newer Older
Nigel Tao's avatar
Nigel Tao committed
1 2 3 4 5 6 7 8 9
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bufio"
	"bytes"
10
	"errors"
11
	"exp/html/atom"
Nigel Tao's avatar
Nigel Tao committed
12 13
	"fmt"
	"io"
14
	"io/ioutil"
Nigel Tao's avatar
Nigel Tao committed
15
	"os"
16
	"path/filepath"
17
	"runtime"
18
	"sort"
Nigel Tao's avatar
Nigel Tao committed
19
	"strings"
Nigel Tao's avatar
Nigel Tao committed
20 21 22
	"testing"
)

23
// readParseTest reads a single test case from r.
24
func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
25
	line, err := r.ReadSlice('\n')
Nigel Tao's avatar
Nigel Tao committed
26
	if err != nil {
27
		return "", "", "", err
Nigel Tao's avatar
Nigel Tao committed
28
	}
29
	var b []byte
Nigel Tao's avatar
Nigel Tao committed
30

31 32
	// Read the HTML.
	if string(line) != "#data\n" {
33
		return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
34
	}
Nigel Tao's avatar
Nigel Tao committed
35
	for {
36
		line, err = r.ReadSlice('\n')
Nigel Tao's avatar
Nigel Tao committed
37
		if err != nil {
38
			return "", "", "", err
Nigel Tao's avatar
Nigel Tao committed
39
		}
40 41 42 43 44
		if line[0] == '#' {
			break
		}
		b = append(b, line...)
	}
45
	text = strings.TrimSuffix(string(b), "\n")
46 47 48 49
	b = b[:0]

	// Skip the error list.
	if string(line) != "#errors\n" {
50
		return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
51 52 53 54
	}
	for {
		line, err = r.ReadSlice('\n')
		if err != nil {
55
			return "", "", "", err
Nigel Tao's avatar
Nigel Tao committed
56 57
		}
		if line[0] == '#' {
58
			break
Nigel Tao's avatar
Nigel Tao committed
59
		}
60 61
	}

62 63 64 65 66 67 68 69 70 71 72 73
	if string(line) == "#document-fragment\n" {
		line, err = r.ReadSlice('\n')
		if err != nil {
			return "", "", "", err
		}
		context = strings.TrimSpace(string(line))
		line, err = r.ReadSlice('\n')
		if err != nil {
			return "", "", "", err
		}
	}

74 75
	// Read the dump of what the parse tree should be.
	if string(line) != "#document\n" {
76
		return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
77
	}
78
	inQuote := false
79 80 81
	for {
		line, err = r.ReadSlice('\n')
		if err != nil && err != io.EOF {
82
			return "", "", "", err
Nigel Tao's avatar
Nigel Tao committed
83
		}
84 85 86 87 88 89 90 91 92 93
		trimmed := bytes.Trim(line, "| \n")
		if len(trimmed) > 0 {
			if line[0] == '|' && trimmed[0] == '"' {
				inQuote = true
			}
			if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
				inQuote = false
			}
		}
		if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
94
			break
Nigel Tao's avatar
Nigel Tao committed
95
		}
96
		b = append(b, line...)
Nigel Tao's avatar
Nigel Tao committed
97
	}
98
	return text, string(b), context, nil
Nigel Tao's avatar
Nigel Tao committed
99 100
}

101
func dumpIndent(w io.Writer, level int) {
Nigel Tao's avatar
Nigel Tao committed
102 103 104 105
	io.WriteString(w, "| ")
	for i := 0; i < level; i++ {
		io.WriteString(w, "  ")
	}
106 107
}

108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
type sortedAttributes []Attribute

func (a sortedAttributes) Len() int {
	return len(a)
}

func (a sortedAttributes) Less(i, j int) bool {
	if a[i].Namespace != a[j].Namespace {
		return a[i].Namespace < a[j].Namespace
	}
	return a[i].Key < a[j].Key
}

func (a sortedAttributes) Swap(i, j int) {
	a[i], a[j] = a[j], a[i]
}

125
func dumpLevel(w io.Writer, n *Node, level int) error {
126
	dumpIndent(w, level)
Nigel Tao's avatar
Nigel Tao committed
127 128
	switch n.Type {
	case ErrorNode:
129
		return errors.New("unexpected ErrorNode")
Nigel Tao's avatar
Nigel Tao committed
130
	case DocumentNode:
131
		return errors.New("unexpected DocumentNode")
Nigel Tao's avatar
Nigel Tao committed
132
	case ElementNode:
133 134 135 136 137
		if n.Namespace != "" {
			fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
		} else {
			fmt.Fprintf(w, "<%s>", n.Data)
		}
138 139
		attr := sortedAttributes(n.Attr)
		sort.Sort(attr)
Nigel Tao's avatar
Nigel Tao committed
140
		for _, a := range attr {
141 142
			io.WriteString(w, "\n")
			dumpIndent(w, level+1)
Nigel Tao's avatar
Nigel Tao committed
143 144 145 146 147
			if a.Namespace != "" {
				fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
			} else {
				fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
			}
148
		}
Nigel Tao's avatar
Nigel Tao committed
149
	case TextNode:
150
		fmt.Fprintf(w, `"%s"`, n.Data)
Nigel Tao's avatar
Nigel Tao committed
151
	case CommentNode:
152
		fmt.Fprintf(w, "<!-- %s -->", n.Data)
153
	case DoctypeNode:
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
		fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
		if n.Attr != nil {
			var p, s string
			for _, a := range n.Attr {
				switch a.Key {
				case "public":
					p = a.Val
				case "system":
					s = a.Val
				}
			}
			if p != "" || s != "" {
				fmt.Fprintf(w, ` "%s"`, p)
				fmt.Fprintf(w, ` "%s"`, s)
			}
		}
		io.WriteString(w, ">")
171
	case scopeMarkerNode:
172
		return errors.New("unexpected scopeMarkerNode")
Nigel Tao's avatar
Nigel Tao committed
173
	default:
174
		return errors.New("unknown node type")
Nigel Tao's avatar
Nigel Tao committed
175 176
	}
	io.WriteString(w, "\n")
177
	for c := n.FirstChild; c != nil; c = c.NextSibling {
Nigel Tao's avatar
Nigel Tao committed
178 179 180 181 182 183 184
		if err := dumpLevel(w, c, level+1); err != nil {
			return err
		}
	}
	return nil
}

185
func dump(n *Node) (string, error) {
186
	if n == nil || n.FirstChild == nil {
Nigel Tao's avatar
Nigel Tao committed
187 188
		return "", nil
	}
Rob Pike's avatar
Rob Pike committed
189
	var b bytes.Buffer
190 191
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if err := dumpLevel(&b, c, 0); err != nil {
192 193
			return "", err
		}
Nigel Tao's avatar
Nigel Tao committed
194 195 196 197
	}
	return b.String(), nil
}

198 199
const testDataDir = "testdata/webkit/"

Nigel Tao's avatar
Nigel Tao committed
200
func TestParser(t *testing.T) {
201 202 203
	testFiles, err := filepath.Glob(testDataDir + "*.dat")
	if err != nil {
		t.Fatal(err)
Nigel Tao's avatar
Nigel Tao committed
204
	}
205
	for _, tf := range testFiles {
206
		f, err := os.Open(tf)
207 208 209 210 211
		if err != nil {
			t.Fatal(err)
		}
		defer f.Close()
		r := bufio.NewReader(f)
212 213

		for i := 0; ; i++ {
214
			text, want, context, err := readParseTest(r)
215
			if err == io.EOF {
216 217
				break
			}
Nigel Tao's avatar
Nigel Tao committed
218 219 220
			if err != nil {
				t.Fatal(err)
			}
221

222
			err = testParseCase(text, want, context)
223

224
			if err != nil {
225
				t.Errorf("%s test #%d %q, %s", tf, i, text, err)
Nigel Tao's avatar
Nigel Tao committed
226
			}
227 228 229 230
		}
	}
}

231 232
// testParseCase tests one test case from the test files. If the test does not
// pass, it returns an error that explains the failure.
233 234
// text is the HTML to be parsed, want is a dump of the correct parse tree,
// and context is the name of the context node, if any.
235
func testParseCase(text, want, context string) (err error) {
236 237 238 239 240 241 242
	defer func() {
		if x := recover(); x != nil {
			switch e := x.(type) {
			case error:
				err = e
			default:
				err = fmt.Errorf("%v", e)
Nigel Tao's avatar
Nigel Tao committed
243 244
			}
		}
245 246 247 248 249 250
	}()

	var doc *Node
	if context == "" {
		doc, err = Parse(strings.NewReader(text))
		if err != nil {
251
			return err
252 253 254
		}
	} else {
		contextNode := &Node{
255 256 257
			Type:     ElementNode,
			DataAtom: atom.Lookup([]byte(context)),
			Data:     context,
258 259 260
		}
		nodes, err := ParseFragment(strings.NewReader(text), contextNode)
		if err != nil {
261
			return err
262 263 264 265 266
		}
		doc = &Node{
			Type: DocumentNode,
		}
		for _, n := range nodes {
267
			doc.AppendChild(n)
268 269 270
		}
	}

271 272 273 274
	if err := checkTreeConsistency(doc); err != nil {
		return err
	}

275 276
	got, err := dump(doc)
	if err != nil {
277
		return err
Nigel Tao's avatar
Nigel Tao committed
278
	}
279 280
	// Compare the parsed tree to the #document section.
	if got != want {
281
		return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
282 283 284
	}

	if renderTestBlacklist[text] || context != "" {
285
		return nil
286 287 288 289 290 291 292 293 294
	}

	// Check that rendering and re-parsing results in an identical tree.
	pr, pw := io.Pipe()
	go func() {
		pw.CloseWithError(Render(pw, doc))
	}()
	doc1, err := Parse(pr)
	if err != nil {
295
		return err
296 297 298
	}
	got1, err := dump(doc1)
	if err != nil {
299
		return err
300 301
	}
	if got != got1 {
302
		return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
303 304
	}

305
	return nil
Nigel Tao's avatar
Nigel Tao committed
306
}
307 308 309 310 311 312 313 314 315

// Some test input result in parse trees are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree. We blacklist such
// inputs from the render test.
var renderTestBlacklist = map[string]bool{
	// The second <a> will be reparented to the first <table>'s parent. This
	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
316 317
	// The same thing with a <p>:
	`<p><table></p>`: true,
318
	// More cases of <a> being reparented:
319
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
320
	`<a><table><a></table><p><a><div><a>`:                                     true,
321
	`<a><table><td><a><table></table><a></tr><a></table><a>`:                  true,
322 323
	// A similar reparenting situation involving <nobr>:
	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
324 325
	// A <plaintext> element is reparented, putting it before a table.
	// A <plaintext> element can't have anything after it in HTML.
326 327 328 329 330 331
	`<table><plaintext><td>`:                                   true,
	`<!doctype html><table><plaintext></plaintext>`:            true,
	`<!doctype html><table><tbody><plaintext></plaintext>`:     true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
	// A form inside a table inside a form doesn't work either.
	`<!doctype html><form><table></form><form></table></form>`: true,
332 333
	// A script that ends at EOF may escape its own closing tag when rendered.
	`<!doctype html><script><!--<script `:          true,
334
	`<!doctype html><script><!--<script <`:         true,
335
	`<!doctype html><script><!--<script <a`:        true,
336 337
	`<!doctype html><script><!--<script </`:        true,
	`<!doctype html><script><!--<script </s`:       true,
338 339 340 341
	`<!doctype html><script><!--<script </script`:  true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`:         true,
	`<!doctype html><script><!--<script -a`:        true,
342
	`<!doctype html><script><!--<script -<`:        true,
343 344
	`<!doctype html><script><!--<script --`:        true,
	`<!doctype html><script><!--<script --a`:       true,
345
	`<!doctype html><script><!--<script --<`:       true,
346 347 348 349 350 351 352 353
	`<script><!--<script `:                         true,
	`<script><!--<script <a`:                       true,
	`<script><!--<script </script`:                 true,
	`<script><!--<script </scripta`:                true,
	`<script><!--<script -`:                        true,
	`<script><!--<script -a`:                       true,
	`<script><!--<script --`:                       true,
	`<script><!--<script --a`:                      true,
354 355 356 357 358 359
	`<script><!--<script <`:                        true,
	`<script><!--<script </`:                       true,
	`<script><!--<script </s`:                      true,
	// Reconstructing the active formatting elements results in a <plaintext>
	// element that contains an <a> element.
	`<!doctype html><p><a><plaintext>b`: true,
360
}
361

362 363 364 365 366 367 368 369 370 371 372 373 374
func TestNodeConsistency(t *testing.T) {
	// inconsistentNode is a Node whose DataAtom and Data do not agree.
	inconsistentNode := &Node{
		Type:     ElementNode,
		DataAtom: atom.Frameset,
		Data:     "table",
	}
	_, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode)
	if err == nil {
		t.Errorf("got nil error, want non-nil")
	}
}

375 376 377 378 379 380 381
func BenchmarkParser(b *testing.B) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
382
	b.ReportAllocs()
383 384 385 386 387
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		Parse(bytes.NewBuffer(buf))
	}
}