From 27df5a3b4cd0b800afff86d62db06153233525cf Mon Sep 17 00:00:00 2001 From: Kirill Smelkov <kirr@nexedi.com> Date: Tue, 26 Oct 2021 16:28:01 +0300 Subject: [PATCH] =?UTF-8?q?wcfs:=20xbtree:=20blib=20+=3D=20PPTreeSubSet,?= =?UTF-8?q?=20=CE=94PPTreeSubSet?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These data structures will be used in ΔBtail to maintain set of tracked BTree nodes, and to represent δ to such set. Some preliminary history: https://lab.nexedi.com/kirr/wendelin.core/commit/78f2f88b X wcfs/xbtree: Fix treediff(a, ø) https://lab.nexedi.com/kirr/wendelin.core/commit/5324547c X wcfs/xbtree: root(a) must stay in trackSet even after treediff(a,ø) https://lab.nexedi.com/kirr/wendelin.core/commit/f65f775b X wcfs/xbtree: treediff(ø, b) https://lab.nexedi.com/kirr/wendelin.core/commit/66bc41ce X Fix bug in PPTreeSubSet.Difference - it was always leaving root node alive https://lab.nexedi.com/kirr/wendelin.core/commit/ddb28043 X rebuild: Don't return nil for empty ΔPPTreeSubSet - that leads to SIGSEGV https://lab.nexedi.com/kirr/wendelin.core/commit/a87cc6de X rebuild: tests: Don't recompute trackSet(keys1R2) several times Quoting PPTreeSubSet and ΔPPTreeSubSet documentation: ---- 8< ---- PPTreeSubSet represents PP-connected subset of tree node objects. It is PP(xleafs) where PP(node) maps node to {node, node.parent, node.parent.parent, ...} up to top root from where the node is reached. The nodes in the set are represented by their Oid. Usually PPTreeSubSet is built as PP(some-leafs), but in general the starting nodes are arbitrary. PPTreeSubSet can also have many root nodes, thus not necessarily representing a subset of a single tree. Usual set operations are provided: Union, Difference and Intersection. Nodes can be added into the set via AddPath. Path is reverse operation - it returns path to tree node given its oid. Every node in the set comes with .parent pointer. 
~~~~ ΔPPTreeSubSet represents a change to PPTreeSubSet. It can be applied via PPTreeSubSet.ApplyΔ . The result B of applying δ to A is: B = A.xDifference(δ.Del).xUnion(δ.Add) (*) (*) NOTE δ.Del and δ.Add might have their leafs starting from non-leaf nodes in A/B. This situation arises when δ represents a change in path to particular node, but that node itself does not change, for example: c* c / \ / 41* 42 41 | | | \ 22 43 46 43 | | | 44 22 44 Here nodes {c, 41} are changed, node 42 is unlinked, and node 46 is added. Nodes 43 and 44 stay unchanged. δ.Del = c-42-43 | c-41-22 δ.Add = c-41-43 | c-41-46-22 The second component with "-22" builds from leaf, but the first component with "-43" builds from non-leaf node. ΔnchildNonLeafs = {43: +1} Only complete result of applying all - xfixup(-1, ΔnchildNonLeafs) - δ.Del, - δ.Add, and - xfixup(+1, ΔnchildNonLeafs) produces correctly PP-connected set. --- wcfs/internal/xbtree/blib/blib.go | 9 + wcfs/internal/xbtree/blib/pptreesubset.go | 539 ++++++++++++++++++ .../internal/xbtree/blib/pptreesubset_test.go | 123 ++++ 3 files changed, 671 insertions(+) create mode 100644 wcfs/internal/xbtree/blib/pptreesubset.go create mode 100644 wcfs/internal/xbtree/blib/pptreesubset_test.go diff --git a/wcfs/internal/xbtree/blib/blib.go b/wcfs/internal/xbtree/blib/blib.go index 49fbf5d0..931659df 100644 --- a/wcfs/internal/xbtree/blib/blib.go +++ b/wcfs/internal/xbtree/blib/blib.go @@ -25,6 +25,8 @@ import ( "math" "lab.nexedi.com/kirr/neo/go/zodb/btree" + + "lab.nexedi.com/nexedi/wendelin.core/wcfs/internal/set" ) // XXX instead of generics @@ -39,6 +41,8 @@ type KeyRange = btree.LKeyRange const KeyMax Key = math.MaxInt64 const KeyMin Key = math.MinInt64 +type setOid = set.Oid + // KStr formats key as string. 
func KStr(k Key) string { @@ -50,3 +54,8 @@ func KStr(k Key) string { } return fmt.Sprintf("%d", k) } + + +func panicf(format string, argv ...interface{}) { + panic(fmt.Sprintf(format, argv...)) +} diff --git a/wcfs/internal/xbtree/blib/pptreesubset.go b/wcfs/internal/xbtree/blib/pptreesubset.go new file mode 100644 index 00000000..5a5c93ca --- /dev/null +++ b/wcfs/internal/xbtree/blib/pptreesubset.go @@ -0,0 +1,539 @@ +// Copyright (C) 2018-2021 Nexedi SA and Contributors. +// Kirill Smelkov <kirr@nexedi.com> +// +// This program is free software: you can Use, Study, Modify and Redistribute +// it under the terms of the GNU General Public License version 3, or (at your +// option) any later version, as published by the Free Software Foundation. +// +// You can also Link and Combine this program with other software covered by +// the terms of any of the Free Software licenses or any of the Open Source +// Initiative approved licenses and Convey the resulting work. Corresponding +// source of such a combination shall include the source code for all other +// software used. +// +// This program is distributed WITHOUT ANY WARRANTY; without even the implied +// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// See COPYING file for full licensing terms. +// See https://www.nexedi.com/licensing for rationale and options. + +package blib +// PP-connected subset of tree nodes. + +import ( + "fmt" + + "lab.nexedi.com/kirr/neo/go/zodb" +) + +const tracePPSet = false +const debugPPSet = false + +// PPTreeSubSet represents PP-connected subset of tree node objects. +// +// It is +// +// PP(xleafs) +// +// where PP(node) maps node to {node, node.parent, node.parent,parent, ...} up +// to top root from where the node is reached. +// +// The nodes in the set are represented by their Oid. +// +// Usually PPTreeSubSet is built as PP(some-leafs), but in general the starting +// nodes are arbitrary. 
PPTreeSubSet can also have many root nodes, thus not +// necessarily representing a subset of a single tree. +// +// Usual set operations are provided: Union, Difference and Intersection. +// +// Nodes can be added into the set via AddPath. Path is reverse operation - it +// returns path to tree node given its oid. +// +// Every node in the set comes with .parent pointer. +// +// XXX we only allow single parent/root case and report "tree corrupt" otherwise. +type PPTreeSubSet map[zodb.Oid]*nodeInTree + +// nodeInTree represents tracking information about a node. +type nodeInTree struct { + parent zodb.Oid // parent node | InvalidOid for root + nchild int // number of direct children in PPTreeSubSet referring to this node +} + +// Parent returns parent of this node. +func (n *nodeInTree) Parent() zodb.Oid { + return n.parent +} + +// NChild returns number of children of this node in the tree subset. +func (n *nodeInTree) NChild() int { + return n.nchild +} + +// Has returns whether node is in the set. +func (S PPTreeSubSet) Has(oid zodb.Oid) bool { + _, ok := S[oid] + return ok +} + +// Path returns path leading to the node specified by oid. +// +// The node must be in the set. +func (S PPTreeSubSet) Path(oid zodb.Oid) (path []zodb.Oid) { + for { + t, ok := S[oid] + if !ok { + panicf("node %s is not in the set <- %v", oid, path) + } + + path = append([]zodb.Oid{oid}, path...) + oid = t.parent + + if oid == zodb.InvalidOid { + break + } + } + + return path +} + +// AddPath adds path to a node to the set. +// +// Note: embedded buckets (leaf node with InvalidOid) are removed from the path. +func (S PPTreeSubSet) AddPath(path []zodb.Oid) { + S.verify() + defer S.verify() + + l := len(path) + if l == 0 { + panic("empty path") + } + + // normalize path: remove embedded bucket and check whether it was an + // artificial empty tree. 
+ path = NormPath(path) + + // go through path and add nodes to the set + parent := zodb.InvalidOid + var pt *nodeInTree = nil + for _, oid := range path { + if oid == zodb.InvalidOid { + panicf("path has node with invalid oid: %v", path) + } + + t, already := S[oid] + if !already { + t = &nodeInTree{parent: parent, nchild: 0} + S[oid] = t + } + if t.parent != parent { + // XXX -> error (e.g. due to corrupt data in ZODB) ? + panicf("node %s is reachable from multiple parents: %s %s", + oid, t.parent, parent) + } + + if pt != nil && !already { + pt.nchild++ + } + + parent = oid + pt = t + } +} + +// NormPath normalizes path. +// +// It removes embedded buckets and artificial empty trees. +// Returned slice is subslice of path and aliases its memory. +func NormPath(path []zodb.Oid) []zodb.Oid { + l := len(path) + + // don't keep track of artificial empty tree + if l == 1 && path[0] == zodb.InvalidOid { + return nil + } + + // don't explicitly keep track of embedded buckets - they all have + // InvalidOid, and thus, if kept in S, e.g. T/B1:a and another + // T/B2:b would lead to InvalidOid having multiple parents. + if l == 2 && path[1] == zodb.InvalidOid { + return path[:1] + } + + return path +} + +// ---- Union/Difference/Intersection ---- + +// Union returns U = PP(A.leafs | B.leafs) +// +// In other words it adds A and B nodes. +func (A PPTreeSubSet) Union(B PPTreeSubSet) PPTreeSubSet { + U := A.Clone() + U.UnionInplace(B) + return U +} + +// UnionInplace sets A = PP(A.leafs | B.leafs) +// +// In other words it adds B nodes to A. +func (A PPTreeSubSet) UnionInplace(B PPTreeSubSet) { + if tracePPSet { + fmt.Printf("\n\nUnion:\n") + fmt.Printf(" A: %s\n", A) + fmt.Printf(" B: %s\n", B) + defer fmt.Printf("->U: %s\n", A) + } + + A.verify() + B.verify() + defer A.verify() + + A.xUnionInplace(B) +} + +// Difference returns D = PP(A.leafs \ B.leafs) +// +// In other words it removes B nodes from A while still maintaining A as PP-connected. 
+func (A PPTreeSubSet) Difference(B PPTreeSubSet) PPTreeSubSet { + D := A.Clone() + D.DifferenceInplace(B) + return D +} + + +// DifferenceInplace sets A = PP(A.leafs \ B.leafs) +// +// In other words it removes B nodes from A while still maintaining A as PP-connected. +func (A PPTreeSubSet) DifferenceInplace(B PPTreeSubSet) { + if tracePPSet { + fmt.Printf("\n\nDifference:\n") + fmt.Printf(" A: %s\n", A) + fmt.Printf(" B: %s\n", B) + defer fmt.Printf("->D: %s\n", A) + } + + A.verify() + B.verify() + defer A.verify() + + A.xDifferenceInplace(B) +} + +// TODO Intersection + +func (A PPTreeSubSet) xUnionInplace(B PPTreeSubSet) { + if tracePPSet { + fmt.Printf("\n\n xUnion:\n") + fmt.Printf(" a: %s\n", A) + fmt.Printf(" b: %s\n", B) + defer fmt.Printf(" ->u: %s\n", A) + } + + 未nchild := map[zodb.Oid]int{} + + for oid, t2 := range B { + t, already := A[oid] + if !already { + t = &nodeInTree{parent: t2.parent, nchild: 0} + A[oid] = t + // remember to nchild++ in parent + if t.parent != zodb.InvalidOid { + 未nchild[t.parent] += 1 + } + } else { + if t2.parent != t.parent { + // XXX or verify this at Track time and require + // that update is passed only entries with the + // same .parent? (then it would be ok to panic here) + // XXX -> error (e.g. 
due to corrupt data in ZODB) + panicf("node %s is reachable from multiple parents: %s %s", + oid, t.parent, t2.parent) + } + } + } + + A.fixup(未nchild) +} + +func (A PPTreeSubSet) xDifferenceInplace(B PPTreeSubSet) { + if tracePPSet { + fmt.Printf("\n\n xDifference:\n") + fmt.Printf(" a: %s\n", A) + fmt.Printf(" b: %s\n", B) + defer fmt.Printf(" ->d: %s\n", A) + } + + 未nchild := map[zodb.Oid]int{} + + // remove B.leafs and their parents + for oid, t2 := range B { + if t2.nchild != 0 { + continue // not a leaf + } + + t, present := A[oid] + if !present { + continue // already not there + } + + if t2.parent != t.parent { + // XXX or verify this at Track time and require + // that update is passed only entries with the + // same .parent? (then it would be ok to panic here) + // XXX -> error (e.g. due to corrupt data in ZODB) + panicf("node %s is reachable from multiple parents: %s %s", + oid, t.parent, t2.parent) + } + + delete(A, oid) + if t.parent != zodb.InvalidOid { + 未nchild[t.parent] -= 1 + } + } + + A.fixup(未nchild) +} + +// fixup performs scheduled 未nchild adjustment. +func (A PPTreeSubSet) fixup(未nchild map[zodb.Oid]int) { + A.xfixup(+1, 未nchild) +} +func (A PPTreeSubSet) xfixup(sign int, 未nchild map[zodb.Oid]int) { + if debugPPSet { + ssign := "+" + if sign < 0 { + ssign = "-" + } + fmt.Printf("\n fixup:\n") + fmt.Printf(" 路: %s\n", A) + fmt.Printf(" %s未: %v\n", ssign, 未nchild) + defer fmt.Printf(" ->路: %s\n\n", A) + } + + gcq := []zodb.Oid{} + for oid, 未nc := range 未nchild { + t := A[oid] // t != nil as A is PP-connected + t.nchild += sign*未nc + if t.nchild == 0 { + gcq = append(gcq, oid) + } + } + + // GC parents that became to have .nchild == 0 + for _, oid := range gcq { + A.gc1(oid) + } +} + +// gc1 garbage-collects oid and cleans up its parent down-up. 
+func (S PPTreeSubSet) gc1(oid zodb.Oid) { + t, present := S[oid] + if !present { + return // already not there + } + if t.nchild != 0 { + panicf("gc %s %v (nchild != 0)", oid, t) + } + + delete(S, oid) + oid = t.parent + for oid != zodb.InvalidOid { + t := S[oid] + t.nchild-- + if t.nchild > 0 { + break + } + delete(S, oid) + oid = t.parent + } +} + + +// ---- verify ---- + +// verify checks internal consistency of S. +func (S PPTreeSubSet) verify() { + // TODO !debug -> return + + var badv []string + badf := func(format string, argv ...interface{}) { + badv = append(badv, fmt.Sprintf(format, argv...)) + } + defer func() { + if badv != nil { + emsg := "S.verify: fail:\n\n" + for _, bad := range badv { + emsg += fmt.Sprintf("- %s\n", bad) + } + emsg += fmt.Sprintf("\nS: %s\n", S) + panic(emsg) + } + }() + + // recompute {} oid -> children and verify .nchild against it + children := make(map[zodb.Oid]setOid, len(S)) + for oid, t := range S { + if t.parent != zodb.InvalidOid { + cc, ok := children[t.parent] + if !ok { + cc = make(setOid, 1) + children[t.parent] = cc + } + cc.Add(oid) + } + } + + for oid, t := range S { + cc := children[oid] + if t.nchild != len(cc) { + badf("[%s].nchild=%d children: %s", oid, t.nchild, cc) + } + } + + // verify that all pointed-to parents are present in the set (= PP-connected) + for oid := range children { + _, ok := S[oid] + if !ok { + badf("oid %s is pointed to via some .parent, but is not present in the set", oid) + } + } +} + + +// ---- misc ---- + +// Clone returns copy of the set. +func (orig PPTreeSubSet) Clone() PPTreeSubSet { + klon := make(PPTreeSubSet, len(orig)) + for oid, t := range orig { + klon[oid] = &nodeInTree{parent: t.parent, nchild: t.nchild} + } + return klon +} + +// Equal returns whether A == B. 
+func (A PPTreeSubSet) Equal(B PPTreeSubSet) bool { + if len(A) != len(B) { + return false + } + + for oid, ta := range A { + tb, ok := B[oid] + if !ok { + return false + } + + if !(ta.parent == tb.parent && ta.nchild == tb.nchild) { + return false + } + } + + return true +} + +// Empty returns whether set is empty. +func (S PPTreeSubSet) Empty() bool { + return len(S) == 0 +} + +func (t nodeInTree) String() string { + return fmt.Sprintf("{p%s c%d}", t.parent, t.nchild) +} + + +// ---- diff/patch ---- + +// 螖PPTreeSubSet represents a change to PPTreeSubSet. +// +// It can be applied via PPTreeSubSet.Apply螖 . +// +// The result B of applying 未 to A is: +// +// B = A.xDifference(未.Del).xUnion(未.Add) (*) +// +// (*) NOTE 未.Del and 未.Add might have their leafs starting from non-leaf nodes in A/B. +// This situation arises when 未 represents a change in path to particular +// node, but that node itself does not change, for example: +// +// c* c +// / \ / +// 41* 42 41 +// | | | \ +// 22 43 46 43 +// | | | +// 44 22 44 +// +// Here nodes {c, 41} are changed, node 42 is unlinked, and node 46 is added. +// Nodes 43 and 44 stay unchanged. +// +// 未.Del = c-42-43 | c-41-22 +// 未.Add = c-41-43 | c-41-46-22 +// +// The second component with "-22" builds from leaf, but the first +// component with "-43" builds from non-leaf node. +// +// 螖nchildNonLeafs = {43: +1} +// +// Only complete result of applying all +// +// - xfixup(-1, 螖nchildNonLeafs) +// - 未.Del, +// - 未.Add, and +// - xfixup(+1, 螖nchildNonLeafs) +// +// produces correctly PP-connected set. +type 螖PPTreeSubSet struct { + Del PPTreeSubSet + Add PPTreeSubSet + 螖nchildNonLeafs map[zodb.Oid]int +} + +// New螖PPTreeSubSet creates new empty 螖PPTreeSubSet. +func New螖PPTreeSubSet() *螖PPTreeSubSet { + return &螖PPTreeSubSet{ + Del: PPTreeSubSet{}, + Add: PPTreeSubSet{}, + 螖nchildNonLeafs: map[zodb.Oid]int{}, + } +} + +// Update updates 未 to be combination of 未+未2. 
+func (未 *螖PPTreeSubSet) Update(未2 *螖PPTreeSubSet) { + 未.Del.UnionInplace(未2.Del) + 未.Add.UnionInplace(未2.Add) + for oid, 未nc := range 未2.螖nchildNonLeafs { + 未.螖nchildNonLeafs[oid] += 未nc + } +} + +// Reverse changes 未=diff(A->B) to 未'=diff(A<-B). +func (未 *螖PPTreeSubSet) Reverse() { + 未.Del, 未.Add = 未.Add, 未.Del + // 螖nchildNonLeafs stays the same +} + + +// Apply螖 applies 未 to S. +// +// See 螖PPTreeSubSet documentation for details. +func (S PPTreeSubSet) Apply螖(未 *螖PPTreeSubSet) { + if tracePPSet { + fmt.Printf("\n\nApply螖\n") + fmt.Printf(" A: %s\n", S) + fmt.Printf(" -: %s\n", 未.Del) + fmt.Printf(" +: %s\n", 未.Add) + fmt.Printf(" x: %v\n", 未.螖nchildNonLeafs) + defer fmt.Printf("\n->B: %s\n", S) + } + + S.verify() + 未.Del.verify() + 未.Add.verify() + defer S.verify() + + S.xfixup(-1, 未.螖nchildNonLeafs) + S.xDifferenceInplace(未.Del) + S.xUnionInplace(未.Add) + S.xfixup(+1, 未.螖nchildNonLeafs) +} diff --git a/wcfs/internal/xbtree/blib/pptreesubset_test.go b/wcfs/internal/xbtree/blib/pptreesubset_test.go new file mode 100644 index 00000000..c90c72ca --- /dev/null +++ b/wcfs/internal/xbtree/blib/pptreesubset_test.go @@ -0,0 +1,123 @@ +// Copyright (C) 2021 Nexedi SA and Contributors. +// Kirill Smelkov <kirr@nexedi.com> +// +// This program is free software: you can Use, Study, Modify and Redistribute +// it under the terms of the GNU General Public License version 3, or (at your +// option) any later version, as published by the Free Software Foundation. +// +// You can also Link and Combine this program with other software covered by +// the terms of any of the Free Software licenses or any of the Open Source +// Initiative approved licenses and Convey the resulting work. Corresponding +// source of such a combination shall include the source code for all other +// software used. +// +// This program is distributed WITHOUT ANY WARRANTY; without even the implied +// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+// +// See COPYING file for full licensing terms. +// See https://www.nexedi.com/licensing for rationale and options. + +package blib + +import ( + "strings" + "testing" + + "lab.nexedi.com/kirr/neo/go/zodb" +) + +func TestPPTreeSubSetOps(t *testing.T) { + const ( + a zodb.Oid = 0xa + iota + b + c + d + 酶 = zodb.InvalidOid + ) + type S = PPTreeSubSet + type testEntry struct { + A, B S + Union S + Difference S + } + E := func(A, B, U, D S) testEntry { + return testEntry{A, B, U, D} + } + + testv := []testEntry{ + E( + S{}, // A + S{}, // B + S{}, // U + S{}), // D + + E( + S{a:{酶,0}}, // A + S{a:{酶,0}}, // B + S{a:{酶,0}}, // U + S{}), // D + + E( + S{a:{酶,0}}, // A + S{b:{酶,0}}, // B + S{a:{酶,0}, b:{酶,0}}, // U + S{a:{酶,0}}), // D + + E( + S{a:{酶,1}, b:{a,0}}, // A + S{a:{酶,1}, c:{a,0}}, // B + S{a:{酶,2}, b:{a,0}, c:{a,0}}, // U + S{a:{酶,1}, b:{a,0}}), // D + + E( + S{a:{酶,1}, b:{a,1}, c:{b,0}}, // A + S{a:{酶,1}, b:{a,1}, d:{b,0}}, // B + S{a:{酶,1}, b:{a,2}, c:{b,0}, d:{b,0}}, // U + S{a:{酶,1}, b:{a,1}, c:{b,0}}), // D + + E( + S{a:{酶,1}, b:{a,0}}, // A + S{a:{酶,1}, b:{a,0}}, // B + S{a:{酶,1}, b:{a,0}}, // U + S{}), // D + + E( + S{a:{酶,1}, b:{a,1}, c:{b,0}}, // A + S{a:{酶,1}, b:{a,1}, c:{b,0}}, // B (=A) + S{a:{酶,1}, b:{a,1}, c:{b,0}}, // U (=A) + S{}), // D + } + + // assert1 asserts that result of op(A,B) == resOK. 
+ assert1 := func(op string, A, B, res, resOK S) { + t.Helper() + if res.Equal(resOK) { + return + } + op1 := op[0:1] + t.Errorf("%s:\n A: %s\n B: %s\n ->%s: %s\n ok%s: %s\n", + strings.Title(op), A, B, op1, res, strings.ToUpper(op1), resOK) + } + + for _, tt := range testv { + Uab := tt.A.Union(tt.B) + Uba := tt.B.Union(tt.A) + Dab := tt.A.Difference(tt.B) + + assert1("union", tt.A, tt.B, Uab, tt.Union) + assert1("union", tt.B, tt.A, Uba, tt.Union) + assert1("difference", tt.A, tt.B, Dab, tt.Difference) + + Uaa := tt.A.Union(tt.A) + Ubb := tt.B.Union(tt.B) + Daa := tt.A.Difference(tt.A) + Dbb := tt.B.Difference(tt.B) + + assert1("union", tt.A, tt.A, Uaa, tt.A) + assert1("union", tt.B, tt.B, Ubb, tt.B) + assert1("difference", tt.A, tt.A, Daa, S{}) + assert1("difference", tt.B, tt.B, Dbb, S{}) + + // TODO also verify U/D properties like (A+B)\B + (A+B)\A + (A^B) == (A+B) ? + } +} -- 2.30.9