exp/html/atom: faster, hash-based lookup.

exp/html/atom benchmark:
benchmark          old ns/op    new ns/op    delta
BenchmarkLookup       199226        80770  -59.46%

exp/html benchmark:
benchmark                      old ns/op    new ns/op    delta
BenchmarkParser                  4864890      4510834   -7.28%
BenchmarkHighLevelTokenizer      2209192      1969684  -10.84%

benchmark                       old MB/s     new MB/s  speedup
BenchmarkParser                    16.07        17.33    1.08x
BenchmarkHighLevelTokenizer        35.38        39.68    1.12x

R=r
CC=golang-dev
https://golang.org/cl/6261054

exp/html/atom: faster, hash-based lookup.
exp/html/atom benchmark:
benchmark          old ns/op    new ns/op    delta
BenchmarkLookup       199226        80770  -59.46%

exp/html benchmark:
benchmark                      old ns/op    new ns/op    delta
BenchmarkParser                  4864890      4510834   -7.28%
BenchmarkHighLevelTokenizer      2209192      1969684  -10.84%

benchmark                       old MB/s     new MB/s  speedup
BenchmarkParser                    16.07        17.33    1.08x
BenchmarkHighLevelTokenizer        35.38        39.68    1.12x

R=r
CC=golang-dev
https://golang.org/cl/6261054
d2a6098e · Nigel Tao · baf91c31 · d2a6098e · d2a6098e · d2a6098e
Commit d2a6098e authored Jun 01, 2012 by Nigel Tao
4 changed files
--- a/src/pkg/exp/html/atom/atom.go
+++ b/src/pkg/exp/html/atom/atom.go
@@ -6,33 +6,40 @@
 // frequently occurring HTML strings: lower-case tag names and attribute keys
 // such as "p" and "id".
 //
-// Sharing an atom's string representation between all elements with the same
-// tag can result in fewer string allocations when tokenizing and parsing HTML.
-// Integer comparisons are also generally faster than string comparisons.
+// Sharing an atom's name between all elements with the same tag can result in
+// fewer string allocations when tokenizing and parsing HTML. Integer
+// comparisons are also generally faster than string comparisons.
 //
-// An atom's particular code (such as atom.Div == 63) is not guaranteed to
-// stay the same between versions of this package. Neither is any ordering
-// guaranteed: whether atom.H1 < atom.H2 may also change. The codes are not
-// guaranteed to be dense. The only guarantees are that e.g. looking up "div"
-// will yield atom.Div, calling atom.Div.String will return "div", and
-// atom.Div != 0.
+// The value of an atom's particular code is not guaranteed to stay the same
+// between versions of this package. Neither is any ordering guaranteed:
+// whether atom.H1 < atom.H2 may also change. The codes are not guaranteed to
+// be dense. The only guarantees are that e.g. looking up "div" will yield
+// atom.Div, calling atom.Div.String will return "div", and atom.Div != 0.
 package atom

+// The hash function must be the same as the one used in gen.go
+func hash(s []byte) (h uint32) {
+	for i := 0; i < len(s); i++ {
+		h = h<<5 ^ h>>27 ^ uint32(s[i])
+	}
+	return h
+}
+
 // Atom is an integer code for a string. The zero value maps to "".
 type Atom int

-// String returns the atom's string representation.
+// String returns the atom's name.
 func (a Atom) String() string {
-	if a <= 0 || a > max {
-		return ""
-	}
+	if 0 <= a && a < Atom(len(table)) {
 		return table[a]
+	}
+	return ""
 }

 // Lookup returns the atom whose name is s. It returns zero if there is no
 // such atom.
 func Lookup(s []byte) Atom {
-	if len(s) == 0 {
+	if len(s) == 0 || len(s) > maxLen {
 		return 0
 	}
 	if len(s) == 1 {
@@ -42,15 +49,25 @@ func Lookup(s []byte) Atom {
 		}
 		return oneByteAtoms[x-'a']
 	}
-	// Binary search for the atom. Unlike sort.Search, this returns early on an exact match.
-	// TODO: this could be optimized further. For example, lo and hi could be initialized
-	// from s[0]. Separately, all the "onxxx" atoms could be moved into their own table.
-	lo, hi := Atom(1), 1+max
+	hs := hash(s)
+	// Binary search for hs. Unlike sort.Search, this returns early on an exact match.
+	// A loop invariant is that len(table[i]) == len(s) for all i in [lo, hi).
+	lo := Atom(loHi[len(s)])
+	hi := Atom(loHi[len(s)+1])
 	for lo < hi {
 		mid := (lo + hi) / 2
-		if cmp := compare(s, table[mid]); cmp == 0 {
+		if ht := hashes[mid]; hs == ht {
+			// The gen.go program ensures that each atom's name has a distinct hash.
+			// However, arbitrary strings may collide with the atom's name. We have
+			// to check that string(s) == table[mid].
+			t := table[mid]
+			for i, si := range s {
+				if si != t[i] {
+					return 0
+				}
+			}
 			return mid
-		} else if cmp > 0 {
+		} else if hs > ht {
 			lo = mid + 1
 		} else {
 			hi = mid
@@ -67,22 +84,3 @@ func String(s []byte) string {
 	}
 	return string(s)
 }
-
-// compare is like bytes.Compare, except that it takes one []byte argument and
-// one string argument, and returns negative/0/positive instead of -1/0/+1.
-func compare(s []byte, t string) int {
-	n := len(s)
-	if n > len(t) {
-		n = len(t)
-	}
-	for i, si := range s[:n] {
-		ti := t[i]
-		switch {
-		case si > ti:
-			return +1
-		case si < ti:
-			return -1
-		}
-	}
-	return len(s) - len(t)
-}
--- a/src/pkg/exp/html/atom/atom_test.go
+++ b/src/pkg/exp/html/atom/atom_test.go
@@ -5,6 +5,7 @@
 package atom

 import (
+	"sort"
 	"testing"
 )

@@ -42,6 +43,8 @@ func TestMisses(t *testing.T) {
 		"h7",
 		"onClick",
 		"λ",
+		// The following string has the same hash (0xa1d7fab7) as "onmouseover".
+		"\x00\x00\x00\x00\x00\x50\x18\xae\x38\xd0\xb7",
 	}
 	for _, tc := range testCases {
 		got := Lookup([]byte(tc))
@@ -50,3 +53,21 @@ func TestMisses(t *testing.T) {
 		}
 	}
 }
+
+func BenchmarkLookup(b *testing.B) {
+	sortedTable := make([]string, len(table))
+	copy(sortedTable, table[:])
+	sort.Strings(sortedTable)
+
+	x := make([][]byte, 1000)
+	for i := range x {
+		x[i] = []byte(sortedTable[i%len(sortedTable)])
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		for _, s := range x {
+			Lookup(s)
+		}
+	}
+}
--- a/src/pkg/exp/html/atom/gen.go
+++ b/src/pkg/exp/html/atom/gen.go
@@ -13,9 +13,30 @@ package main

 import (
 	"fmt"
+	"os"
 	"sort"
 )

+// The hash function must be the same as the one used in atom.go
+func hash(s string) (h uint32) {
+	for i := 0; i < len(s); i++ {
+		h = h<<5 ^ h>>27 ^ uint32(s[i])
+	}
+	return h
+}
+
+// lhash returns a uint64 whose high 32 bits are len(s) and whose low 32 bits
+// are hash(s).
+func lhash(s string) uint64 {
+	return uint64(len(s))<<32 | uint64(hash(s))
+}
+
+type byLhash []string
+
+func (b byLhash) Len() int           { return len(b) }
+func (b byLhash) Less(i, j int) bool { return lhash(b[i]) < lhash(b[j]) }
+func (b byLhash) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
+
 // identifier converts s to a Go exported identifier.
 // It converts "div" to "Div" and "accept-charset" to "AcceptCharset".
 func identifier(s string) string {
@@ -36,43 +57,84 @@ func identifier(s string) string {
 }

 func main() {
-	m := map[string]bool{
+	// Construct a list of atoms, sorted by their lhash.
+	m0 := map[string]bool{
 		"": true,
 	}
 	for _, list := range [][]string{elements, attributes, eventHandlers, extra} {
 		for _, s := range list {
-			m[s] = true
+			m0[s] = true
 		}
 	}
-	atoms := make([]string, 0, len(m))
-	for s := range m {
+	atoms := make([]string, 0, len(m0))
+	for s := range m0 {
 		atoms = append(atoms, s)
 	}
-	sort.Strings(atoms)
+	sort.Sort(byLhash(atoms))

+	// Calculate the magic constants to output as table.go.
 	byInt := []string{}
 	byStr := map[string]int{}
 	ident := []string{}
+	lhashes := []uint64{}
+	maxLen := 0
 	for i, s := range atoms {
 		byInt = append(byInt, s)
 		byStr[s] = i
 		ident = append(ident, identifier(s))
+		lhashes = append(lhashes, lhash(s))
+		if maxLen < len(s) {
+			maxLen = len(s)
+		}
+	}
+
+	// Check for hash collisions.
+	m1 := map[uint64]int{}
+	for i, h := range lhashes {
+		h &= 1<<32 - 1
+		if j, ok := m1[h]; ok {
+			fmt.Fprintf(os.Stderr, "hash collision at 0x%08x: %q, %q\n", h, byInt[i], byInt[j])
+			os.Exit(1)
+		}
+		m1[h] = i
 	}

+	// Generate the Go code.
 	fmt.Printf("package atom\n\nconst (\n")
+	{
+		// Print the Atoms in alphabetical order.
+		lines := []string{}
 		for i, _ := range byInt {
 			if i == 0 {
 				continue
 			}
-		fmt.Printf("\t%s Atom = %d\n", ident[i], i)
+			lines = append(lines, fmt.Sprintf("\t%s Atom = %d", ident[i], i))
+		}
+		sort.Strings(lines)
+		for _, line := range lines {
+			fmt.Println(line)
 		}
 		fmt.Printf(")\n\n")
-	fmt.Printf("const max Atom = %d\n\n", len(byInt)-1)
-	fmt.Printf("var table = []string{\n")
+	}
+	fmt.Printf("const maxLen = %d\n\n", maxLen)
+	fmt.Printf("var table = [...]string{\n")
 	for _, s := range byInt {
 		fmt.Printf("\t%q,\n", s)
 	}
 	fmt.Printf("}\n\n")
+	fmt.Printf("var hashes = [...]uint32{\n")
+	for _, s := range byInt {
+		fmt.Printf("\t0x%08x,\n", hash(s))
+	}
+	fmt.Printf("}\n\n")
+	fmt.Printf("var loHi = [maxLen + 2]uint16{\n")
+	for n := 0; n <= maxLen; n++ {
+		fmt.Printf("\t%d,\n", sort.Search(len(byInt), func(i int) bool {
+			return int(lhashes[i]>>32) >= n
+		}))
+	}
+	fmt.Printf("\t%d,\n", len(byInt))
+	fmt.Printf("}\n\n")
 	fmt.Printf("var oneByteAtoms = [26]Atom{\n")
 	for i := 'a'; i <= 'z'; i++ {
 		val := "0"

--- a/src/pkg/exp/html/atom/table.go
+++ b/src/pkg/exp/html/atom/table.go