Commit 37d92d25 authored by Nigel Tao's avatar Nigel Tao

exp/html, exp/html/atom: delete, as they're moving to the go.net

sub-repo.

The matching change is at https://golang.org/cl/7310063

The rationale was discussed at
https://groups.google.com/d/topic/golang-nuts/Qq5hTQyPuLg/discussion

R=adg, dave
CC=golang-dev
https://golang.org/cl/7317043
parent de69401b
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package atom provides integer codes (also known as atoms) for a fixed set of
// frequently occurring HTML strings: tag names and attribute keys such as "p"
// and "id".
//
// Sharing an atom's name between all elements with the same tag can result in
// fewer string allocations when tokenizing and parsing HTML. Integer
// comparisons are also generally faster than string comparisons.
//
// The value of an atom's particular code is not guaranteed to stay the same
// between versions of this package. Neither is any ordering guaranteed:
// whether atom.H1 < atom.H2 may also change. The codes are not guaranteed to
// be dense. The only guarantees are that e.g. looking up "div" will yield
// atom.Div, calling atom.Div.String will return "div", and atom.Div != 0.
//
// TODO(rsc): When this package moves out of exp we need to freeze atom values
// across releases.
package atom
// Atom is an integer code for a string. The zero value maps to "".
type Atom uint32
// String returns the atom's name.
func (a Atom) String() string {
start := uint32(a >> 8)
n := uint32(a & 0xff)
if start+n > uint32(len(atomText)) {
return ""
}
return atomText[start : start+n]
}
func (a Atom) string() string {
return atomText[a>>8 : a>>8+a&0xff]
}
// fnv computes the FNV hash with an arbitrary starting value h.
func fnv(h uint32, s []byte) uint32 {
for i := range s {
h ^= uint32(s[i])
h *= 16777619
}
return h
}
func match(s string, t []byte) bool {
for i, c := range t {
if s[i] != c {
return false
}
}
return true
}
// Lookup returns the atom whose name is s. It returns zero if there is no
// such atom. The lookup is case sensitive.
func Lookup(s []byte) Atom {
if len(s) == 0 || len(s) > maxAtomLen {
return 0
}
h := fnv(hash0, s)
if a := table[h&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) {
return a
}
if a := table[(h>>16)&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) {
return a
}
return 0
}
// String returns a string whose contents are equal to s. In that sense, it is
// equivalent to string(s) but may be more efficient.
func String(s []byte) string {
if a := Lookup(s); a != 0 {
return a.String()
}
return string(s)
}
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package atom
import (
"sort"
"testing"
)
func TestKnown(t *testing.T) {
for _, s := range testAtomList {
if atom := Lookup([]byte(s)); atom.String() != s {
t.Errorf("Lookup(%q) = %#x (%q)", s, uint32(atom), atom.String())
}
}
}
func TestHits(t *testing.T) {
for _, a := range table {
if a == 0 {
continue
}
got := Lookup([]byte(a.String()))
if got != a {
t.Errorf("Lookup(%q) = %#x, want %#x", a.String(), uint32(got), uint32(a))
}
}
}
func TestMisses(t *testing.T) {
testCases := []string{
"",
"\x00",
"\xff",
"A",
"DIV",
"Div",
"dIV",
"aa",
"a\x00",
"ab",
"abb",
"abbr0",
"abbr ",
" abbr",
" a",
"acceptcharset",
"acceptCharset",
"accept_charset",
"h0",
"h1h2",
"h7",
"onClick",
"λ",
// The following string has the same hash (0xa1d7fab7) as "onmouseover".
"\x00\x00\x00\x00\x00\x50\x18\xae\x38\xd0\xb7",
}
for _, tc := range testCases {
got := Lookup([]byte(tc))
if got != 0 {
t.Errorf("Lookup(%q): got %d, want 0", tc, got)
}
}
}
func TestForeignObject(t *testing.T) {
const (
afo = Foreignobject
afO = ForeignObject
sfo = "foreignobject"
sfO = "foreignObject"
)
if got := Lookup([]byte(sfo)); got != afo {
t.Errorf("Lookup(%q): got %#v, want %#v", sfo, got, afo)
}
if got := Lookup([]byte(sfO)); got != afO {
t.Errorf("Lookup(%q): got %#v, want %#v", sfO, got, afO)
}
if got := afo.String(); got != sfo {
t.Errorf("Atom(%#v).String(): got %q, want %q", afo, got, sfo)
}
if got := afO.String(); got != sfO {
t.Errorf("Atom(%#v).String(): got %q, want %q", afO, got, sfO)
}
}
func BenchmarkLookup(b *testing.B) {
sortedTable := make([]string, 0, len(table))
for _, a := range table {
if a != 0 {
sortedTable = append(sortedTable, a.String())
}
}
sort.Strings(sortedTable)
x := make([][]byte, 1000)
for i := range x {
x[i] = []byte(sortedTable[i%len(sortedTable)])
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, s := range x {
Lookup(s)
}
}
}
This diff is collapsed.
This diff is collapsed.
// generated by go run gen.go -test; DO NOT EDIT
package atom
var testAtomList = []string{
"a",
"abbr",
"accept",
"accept-charset",
"accesskey",
"action",
"address",
"align",
"alt",
"annotation",
"annotation-xml",
"applet",
"area",
"article",
"aside",
"async",
"audio",
"autocomplete",
"autofocus",
"autoplay",
"b",
"base",
"basefont",
"bdi",
"bdo",
"bgsound",
"big",
"blink",
"blockquote",
"body",
"border",
"br",
"button",
"canvas",
"caption",
"center",
"challenge",
"charset",
"checked",
"cite",
"cite",
"class",
"code",
"col",
"colgroup",
"color",
"cols",
"colspan",
"command",
"command",
"content",
"contenteditable",
"contextmenu",
"controls",
"coords",
"crossorigin",
"data",
"data",
"datalist",
"datetime",
"dd",
"default",
"defer",
"del",
"desc",
"details",
"dfn",
"dialog",
"dir",
"dirname",
"disabled",
"div",
"dl",
"download",
"draggable",
"dropzone",
"dt",
"em",
"embed",
"enctype",
"face",
"fieldset",
"figcaption",
"figure",
"font",
"footer",
"for",
"foreignObject",
"foreignobject",
"form",
"form",
"formaction",
"formenctype",
"formmethod",
"formnovalidate",
"formtarget",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"headers",
"height",
"hgroup",
"hidden",
"high",
"hr",
"href",
"hreflang",
"html",
"http-equiv",
"i",
"icon",
"id",
"iframe",
"image",
"img",
"inert",
"input",
"ins",
"isindex",
"ismap",
"itemid",
"itemprop",
"itemref",
"itemscope",
"itemtype",
"kbd",
"keygen",
"keytype",
"kind",
"label",
"label",
"lang",
"legend",
"li",
"link",
"list",
"listing",
"loop",
"low",
"malignmark",
"manifest",
"map",
"mark",
"marquee",
"math",
"max",
"maxlength",
"media",
"mediagroup",
"menu",
"meta",
"meter",
"method",
"mglyph",
"mi",
"min",
"mn",
"mo",
"ms",
"mtext",
"multiple",
"muted",
"name",
"nav",
"nobr",
"noembed",
"noframes",
"noscript",
"novalidate",
"object",
"ol",
"onabort",
"onafterprint",
"onbeforeprint",
"onbeforeunload",
"onblur",
"oncancel",
"oncanplay",
"oncanplaythrough",
"onchange",
"onclick",
"onclose",
"oncontextmenu",
"oncuechange",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onemptied",
"onended",
"onerror",
"onfocus",
"onhashchange",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
"onload",
"onloadeddata",
"onloadedmetadata",
"onloadstart",
"onmessage",
"onmousedown",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onmousewheel",
"onoffline",
"ononline",
"onpagehide",
"onpageshow",
"onpause",
"onplay",
"onplaying",
"onpopstate",
"onprogress",
"onratechange",
"onreset",
"onresize",
"onscroll",
"onseeked",
"onseeking",
"onselect",
"onshow",
"onstalled",
"onstorage",
"onsubmit",
"onsuspend",
"ontimeupdate",
"onunload",
"onvolumechange",
"onwaiting",
"open",
"optgroup",
"optimum",
"option",
"output",
"p",
"param",
"pattern",
"ping",
"placeholder",
"plaintext",
"poster",
"pre",
"preload",
"progress",
"prompt",
"public",
"q",
"radiogroup",
"readonly",
"rel",
"required",
"reversed",
"rows",
"rowspan",
"rp",
"rt",
"ruby",
"s",
"samp",
"sandbox",
"scope",
"scoped",
"script",
"seamless",
"section",
"select",
"selected",
"shape",
"size",
"sizes",
"small",
"source",
"spacer",
"span",
"span",
"spellcheck",
"src",
"srcdoc",
"srclang",
"start",
"step",
"strike",
"strong",
"style",
"style",
"sub",
"summary",
"sup",
"svg",
"system",
"tabindex",
"table",
"target",
"tbody",
"td",
"textarea",
"tfoot",
"th",
"thead",
"time",
"title",
"title",
"tr",
"track",
"translate",
"tt",
"type",
"typemustmatch",
"u",
"ul",
"usemap",
"value",
"var",
"video",
"wbr",
"width",
"wrap",
"xmp",
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
// Section 12.2.3.2 of the HTML5 specification says "The following elements
// have varying levels of special parsing rules".
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-stack-of-open-elements
var isSpecialElementMap = map[string]bool{
"address": true,
"applet": true,
"area": true,
"article": true,
"aside": true,
"base": true,
"basefont": true,
"bgsound": true,
"blockquote": true,
"body": true,
"br": true,
"button": true,
"caption": true,
"center": true,
"col": true,
"colgroup": true,
"command": true,
"dd": true,
"details": true,
"dir": true,
"div": true,
"dl": true,
"dt": true,
"embed": true,
"fieldset": true,
"figcaption": true,
"figure": true,
"footer": true,
"form": true,
"frame": true,
"frameset": true,
"h1": true,
"h2": true,
"h3": true,
"h4": true,
"h5": true,
"h6": true,
"head": true,
"header": true,
"hgroup": true,
"hr": true,
"html": true,
"iframe": true,
"img": true,
"input": true,
"isindex": true,
"li": true,
"link": true,
"listing": true,
"marquee": true,
"menu": true,
"meta": true,
"nav": true,
"noembed": true,
"noframes": true,
"noscript": true,
"object": true,
"ol": true,
"p": true,
"param": true,
"plaintext": true,
"pre": true,
"script": true,
"section": true,
"select": true,
"style": true,
"summary": true,
"table": true,
"tbody": true,
"td": true,
"textarea": true,
"tfoot": true,
"th": true,
"thead": true,
"title": true,
"tr": true,
"ul": true,
"wbr": true,
"xmp": true,
}
func isSpecialElement(element *Node) bool {
switch element.Namespace {
case "", "html":
return isSpecialElementMap[element.Data]
case "svg":
return element.Data == "foreignObject"
}
return false
}
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package html implements an HTML5-compliant tokenizer and parser.
Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
caller's responsibility to ensure that r provides UTF-8 encoded HTML.
z := html.NewTokenizer(r)
Given a Tokenizer z, the HTML is tokenized by repeatedly calling z.Next(),
which parses the next token and returns its type, or an error:
for {
tt := z.Next()
if tt == html.ErrorToken {
// ...
return ...
}
// Process the current token.
}
There are two APIs for retrieving the current token. The high-level API is to
call Token; the low-level API is to call Text or TagName / TagAttr. Both APIs
allow optionally calling Raw after Next but before Token, Text, TagName, or
TagAttr. In EBNF notation, the valid call sequence per token is:
Next {Raw} [ Token | Text | TagName {TagAttr} ]
Token returns an independent data structure that completely describes a token.
Entities (such as "&lt;") are unescaped, tag names and attribute keys are
lower-cased, and attributes are collected into a []Attribute. For example:
for {
if z.Next() == html.ErrorToken {
// Returning io.EOF indicates success.
return z.Err()
}
emitToken(z.Token())
}
The low-level API performs fewer allocations and copies, but the contents of
the []byte values returned by Text, TagName and TagAttr may change on the next
call to Next. For example, to extract an HTML page's anchor text:
depth := 0
for {
tt := z.Next()
switch tt {
case ErrorToken:
return z.Err()
case TextToken:
if depth > 0 {
// emitBytes should copy the []byte it receives,
// if it doesn't process it immediately.
emitBytes(z.Text())
}
case StartTagToken, EndTagToken:
tn, _ := z.TagName()
if len(tn) == 1 && tn[0] == 'a' {
if tt == StartTagToken {
depth++
} else {
depth--
}
}
}
}
Parsing is done by calling Parse with an io.Reader, which returns the root of
the parse tree (the document element) as a *Node. It is the caller's
responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
example, to process each anchor node in depth-first order:
doc, err := html.Parse(r)
if err != nil {
// ...
}
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "a" {
// Do something with n...
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
The relevant specifications include:
http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html and
http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
*/
package html
// The tokenization algorithm implemented by this package is not a line-by-line
// transliteration of the relatively verbose state-machine in the WHATWG
// specification. A more direct approach is used instead, where the program
// counter implies the state, such as whether it is tokenizing a tag or a text
// node. Specification compliance is verified by checking expected and actual
// outputs over a test suite rather than aiming for algorithmic fidelity.
// TODO(nigeltao): Does a DOM API belong in this package or a separate one?
// TODO(nigeltao): How does parsing interact with a JavaScript engine?
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"strings"
)
// parseDoctype parses the data from a DoctypeToken into a name,
// public identifier, and system identifier. It returns a Node whose Type
// is DoctypeNode, whose Data is the name, and which has attributes
// named "system" and "public" for the two identifiers if they were present.
// quirks is whether the document should be parsed in "quirks mode".
func parseDoctype(s string) (n *Node, quirks bool) {
n = &Node{Type: DoctypeNode}
// Find the name.
space := strings.IndexAny(s, whitespace)
if space == -1 {
space = len(s)
}
n.Data = s[:space]
// The comparison to "html" is case-sensitive.
if n.Data != "html" {
quirks = true
}
n.Data = strings.ToLower(n.Data)
s = strings.TrimLeft(s[space:], whitespace)
if len(s) < 6 {
// It can't start with "PUBLIC" or "SYSTEM".
// Ignore the rest of the string.
return n, quirks || s != ""
}
key := strings.ToLower(s[:6])
s = s[6:]
for key == "public" || key == "system" {
s = strings.TrimLeft(s, whitespace)
if s == "" {
break
}
quote := s[0]
if quote != '"' && quote != '\'' {
break
}
s = s[1:]
q := strings.IndexRune(s, rune(quote))
var id string
if q == -1 {
id = s
s = ""
} else {
id = s[:q]
s = s[q+1:]
}
n.Attr = append(n.Attr, Attribute{Key: key, Val: id})
if key == "public" {
key = "system"
} else {
key = ""
}
}
if key != "" || s != "" {
quirks = true
} else if len(n.Attr) > 0 {
if n.Attr[0].Key == "public" {
public := strings.ToLower(n.Attr[0].Val)
switch public {
case "-//w3o//dtd w3 html strict 3.0//en//", "-/w3d/dtd html 4.0 transitional/en", "html":
quirks = true
default:
for _, q := range quirkyIDs {
if strings.HasPrefix(public, q) {
quirks = true
break
}
}
}
// The following two public IDs only cause quirks mode if there is no system ID.
if len(n.Attr) == 1 && (strings.HasPrefix(public, "-//w3c//dtd html 4.01 frameset//") ||
strings.HasPrefix(public, "-//w3c//dtd html 4.01 transitional//")) {
quirks = true
}
}
if lastAttr := n.Attr[len(n.Attr)-1]; lastAttr.Key == "system" &&
strings.ToLower(lastAttr.Val) == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" {
quirks = true
}
}
return n, quirks
}
// quirkyIDs is a list of public doctype identifiers that cause a document
// to be interpreted in quirks mode. The identifiers should be in lower case.
var quirkyIDs = []string{
"+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//",
"-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//",
}
This diff is collapsed.
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"testing"
"unicode/utf8"
)
func TestEntityLength(t *testing.T) {
// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
// The +1 comes from the leading "&". This property implies that the length of
// unescaped text is <= the length of escaped text.
for k, v := range entity {
if 1+len(k) < utf8.RuneLen(v) {
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
}
if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
}
}
for k, v := range entity2 {
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v[0]) + string(v[1]))
}
}
}
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bytes"
"strings"
"unicode/utf8"
)
// These replacements permit compatibility with old numeric entities that
// assumed Windows-1252 encoding.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
var replacementTable = [...]rune{
'\u20AC', // First entry is what 0x80 should be replaced with.
'\u0081',
'\u201A',
'\u0192',
'\u201E',
'\u2026',
'\u2020',
'\u2021',
'\u02C6',
'\u2030',
'\u0160',
'\u2039',
'\u0152',
'\u008D',
'\u017D',
'\u008F',
'\u0090',
'\u2018',
'\u2019',
'\u201C',
'\u201D',
'\u2022',
'\u2013',
'\u2014',
'\u02DC',
'\u2122',
'\u0161',
'\u203A',
'\u0153',
'\u009D',
'\u017E',
'\u0178', // Last entry is 0x9F.
// 0x00->'\uFFFD' is handled programmatically.
// 0x0D->'\u000D' is a no-op.
}
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
// attribute should be true if parsing an attribute value.
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
// i starts at 1 because we already know that s[0] == '&'.
i, s := 1, b[src:]
if len(s) <= 1 {
b[dst] = b[src]
return dst + 1, src + 1
}
if s[i] == '#' {
if len(s) <= 3 { // We need to have at least "&#.".
b[dst] = b[src]
return dst + 1, src + 1
}
i++
c := s[i]
hex := false
if c == 'x' || c == 'X' {
hex = true
i++
}
x := '\x00'
for i < len(s) {
c = s[i]
i++
if hex {
if '0' <= c && c <= '9' {
x = 16*x + rune(c) - '0'
continue
} else if 'a' <= c && c <= 'f' {
x = 16*x + rune(c) - 'a' + 10
continue
} else if 'A' <= c && c <= 'F' {
x = 16*x + rune(c) - 'A' + 10
continue
}
} else if '0' <= c && c <= '9' {
x = 10*x + rune(c) - '0'
continue
}
if c != ';' {
i--
}
break
}
if i <= 3 { // No characters matched.
b[dst] = b[src]
return dst + 1, src + 1
}
if 0x80 <= x && x <= 0x9F {
// Replace characters from Windows-1252 with UTF-8 equivalents.
x = replacementTable[x-0x80]
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
// Replace invalid characters with the replacement character.
x = '\uFFFD'
}
return dst + utf8.EncodeRune(b[dst:], x), src + i
}
// Consume the maximum number of characters possible, with the
// consumed characters matching one of the named references.
for i < len(s) {
c := s[i]
i++
// Lower-cased characters are more common in entities, so we check for them first.
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
continue
}
if c != ';' {
i--
}
break
}
entityName := string(s[1:i])
if entityName == "" {
// No-op.
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
// No-op.
} else if x := entity[entityName]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i
} else if x := entity2[entityName]; x[0] != 0 {
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
} else if !attribute {
maxLen := len(entityName) - 1
if maxLen > longestEntityWithoutSemicolon {
maxLen = longestEntityWithoutSemicolon
}
for j := maxLen; j > 1; j-- {
if x := entity[entityName[:j]]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
}
}
}
dst1, src1 = dst+i, src+i
copy(b[dst:dst1], b[src:src1])
return dst1, src1
}
// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
// attribute should be true if parsing an attribute value.
func unescape(b []byte, attribute bool) []byte {
for i, c := range b {
if c == '&' {
dst, src := unescapeEntity(b, i, i, attribute)
for src < len(b) {
c := b[src]
if c == '&' {
dst, src = unescapeEntity(b, dst, src, attribute)
} else {
b[dst] = c
dst, src = dst+1, src+1
}
}
return b[0:dst]
}
}
return b
}
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
func lower(b []byte) []byte {
for i, c := range b {
if 'A' <= c && c <= 'Z' {
b[i] = c + 'a' - 'A'
}
}
return b
}
const escapedChars = "&'<>\"\r"
func escape(w writer, s string) error {
i := strings.IndexAny(s, escapedChars)
for i != -1 {
if _, err := w.WriteString(s[:i]); err != nil {
return err
}
var esc string
switch s[i] {
case '&':
esc = "&amp;"
case '\'':
// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
esc = "&#39;"
case '<':
esc = "&lt;"
case '>':
esc = "&gt;"
case '"':
// "&#34;" is shorter than "&quot;".
esc = "&#34;"
case '\r':
esc = "&#13;"
default:
panic("unrecognized escape character")
}
s = s[i+1:]
if _, err := w.WriteString(esc); err != nil {
return err
}
i = strings.IndexAny(s, escapedChars)
}
_, err := w.WriteString(s)
return err
}
// EscapeString escapes special characters like "<" to become "&lt;". It
// escapes only five such characters: <, >, &, ' and ".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func EscapeString(s string) string {
if strings.IndexAny(s, escapedChars) == -1 {
return s
}
var buf bytes.Buffer
escape(&buf, s)
return buf.String()
}
// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
// larger range of entities than EscapeString escapes. For example, "&aacute;"
// unescapes to "á", as does "&#225;" and "&xE1;".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func UnescapeString(s string) string {
for _, c := range s {
if c == '&' {
return string(unescape([]byte(s), false))
}
}
return s
}
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This example demonstrates parsing HTML data and walking the resulting tree.
package html_test
import (
"exp/html"
"fmt"
"log"
"strings"
)
func ExampleParse() {
s := `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`
doc, err := html.Parse(strings.NewReader(s))
if err != nil {
log.Fatal(err)
}
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "a" {
for _, a := range n.Attr {
if a.Key == "href" {
fmt.Println(a.Val)
break
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
// Output:
// foo
// /bar/baz
}
This diff is collapsed.
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"exp/html/atom"
)
// A NodeType is the type of a Node.
type NodeType uint32
const (
ErrorNode NodeType = iota
TextNode
DocumentNode
ElementNode
CommentNode
DoctypeNode
scopeMarkerNode
)
// Section 12.2.3.3 says "scope markers are inserted when entering applet
// elements, buttons, object elements, marquees, table cells, and table
// captions, and are used to prevent formatting from 'leaking'".
var scopeMarker = Node{Type: scopeMarkerNode}
// A Node consists of a NodeType and some Data (tag name for element nodes,
// content for text) and are part of a tree of Nodes. Element nodes may also
// have a Namespace and contain a slice of Attributes. Data is unescaped, so
// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
// is the atom for Data, or zero if Data is not a known tag name.
//
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
// "svg" is short for "http://www.w3.org/2000/svg".
type Node struct {
Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node
Type NodeType
DataAtom atom.Atom
Data string
Namespace string
Attr []Attribute
}
// InsertBefore inserts newChild as a child of n, immediately before oldChild
// in the sequence of n's children. oldChild may be nil, in which case newChild
// is appended to the end of n's children.
//
// It will panic if newChild already has a parent or siblings.
func (n *Node) InsertBefore(newChild, oldChild *Node) {
if newChild.Parent != nil || newChild.PrevSibling != nil || newChild.NextSibling != nil {
panic("html: InsertBefore called for an attached child Node")
}
var prev, next *Node
if oldChild != nil {
prev, next = oldChild.PrevSibling, oldChild
} else {
prev = n.LastChild
}
if prev != nil {
prev.NextSibling = newChild
} else {
n.FirstChild = newChild
}
if next != nil {
next.PrevSibling = newChild
} else {
n.LastChild = newChild
}
newChild.Parent = n
newChild.PrevSibling = prev
newChild.NextSibling = next
}
// AppendChild adds a node c as a child of n.
//
// It will panic if c already has a parent or siblings.
func (n *Node) AppendChild(c *Node) {
if c.Parent != nil || c.PrevSibling != nil || c.NextSibling != nil {
panic("html: AppendChild called for an attached child Node")
}
last := n.LastChild
if last != nil {
last.NextSibling = c
} else {
n.FirstChild = c
}
n.LastChild = c
c.Parent = n
c.PrevSibling = last
}
// RemoveChild removes a node c that is a child of n. Afterwards, c will have
// no parent and no siblings.
//
// It will panic if c's parent is not n.
func (n *Node) RemoveChild(c *Node) {
if c.Parent != n {
panic("html: RemoveChild called for a non-child Node")
}
if n.FirstChild == c {
n.FirstChild = c.NextSibling
}
if c.NextSibling != nil {
c.NextSibling.PrevSibling = c.PrevSibling
}
if n.LastChild == c {
n.LastChild = c.PrevSibling
}
if c.PrevSibling != nil {
c.PrevSibling.NextSibling = c.NextSibling
}
c.Parent = nil
c.PrevSibling = nil
c.NextSibling = nil
}
// reparentChildren reparents all of src's child nodes to dst.
func reparentChildren(dst, src *Node) {
for {
child := src.FirstChild
if child == nil {
break
}
src.RemoveChild(child)
dst.AppendChild(child)
}
}
// clone returns a new node with the same type, data and attributes.
// The clone has no parent, no siblings and no children.
func (n *Node) clone() *Node {
m := &Node{
Type: n.Type,
DataAtom: n.DataAtom,
Data: n.Data,
Attr: make([]Attribute, len(n.Attr)),
}
copy(m.Attr, n.Attr)
return m
}
// nodeStack is a stack of nodes.
type nodeStack []*Node
// pop pops the stack. It will panic if s is empty.
func (s *nodeStack) pop() *Node {
i := len(*s)
n := (*s)[i-1]
*s = (*s)[:i-1]
return n
}
// top returns the most recently pushed node, or nil if s is empty.
func (s *nodeStack) top() *Node {
if i := len(*s); i > 0 {
return (*s)[i-1]
}
return nil
}
// index returns the index of the top-most occurrence of n in the stack, or -1
// if n is not present.
func (s *nodeStack) index(n *Node) int {
for i := len(*s) - 1; i >= 0; i-- {
if (*s)[i] == n {
return i
}
}
return -1
}
// insert inserts a node at the given index.
func (s *nodeStack) insert(i int, n *Node) {
(*s) = append(*s, nil)
copy((*s)[i+1:], (*s)[i:])
(*s)[i] = n
}
// remove removes a node from the stack. It is a no-op if n is not present.
func (s *nodeStack) remove(n *Node) {
i := s.index(n)
if i == -1 {
return
}
copy((*s)[i:], (*s)[i+1:])
j := len(*s) - 1
(*s)[j] = nil
*s = (*s)[:j]
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment