X review feedback on newapi

- add nodefs package overview; in particular describe what inode is so that there is no confusion about its meaning. - Draftly implement Lookup / Forget. They work not under global rawBridge.mu and still there should be no race of Lookup / Forget due to careful locking of inode -> (inode, ichild) + retrying. Add description of Forget semantic and what happens when we receive forget for a directory for which children are not forgotten yet. ( it is too late here now and I did not check the implementation with a fresh head. I thought that it is better to release current state for discussion as I likely won't be able to work on newapi for at least another week ) - use atomics in DefaultNode setInode/inode; see Lookup and corresponding description nearby DefaultNode.setInode for why it is needed. - inode.{lookupCount,nodeID} are now protected not by global rawBridge.mu, but instead by inode.mu . - change Node operations to return Nodes, not Inodes. In particular Node.Lookup should now return Node. Inodes are internal index of nodefs VFS (see package description) and we should not load filesystem implementations to think about them where we can. Also it makes a more closed interface when filesystem works in terms of nodes completely. Also this way we offload filesystems from caring about tricky details of how to create inode for a hardlinked entry (see Lookup for details on how it is handled) - Remove Node.Inode -> nodefs.InodeOf(Node). this way there will be no possibility to override Node.Inode and we can specify InodeOf semantic exactly in API docs. - unlockNodes: sort is not needed - lock/unlock Nodes: avoid duplicates (e.g. there can be duplicates if dir/a and dir/b are hardlinks to the same file. If we don't avoid duplicates lockNodes will deadlock) - made some other edits, part of them not complete...

X review feedback on newapi
- add nodefs package overview; in particular describe what inode is so that there is no confusion about its meaning. - Draftly implement Lookup / Forget. They work not under global rawBridge.mu and still there should be no race of Lookup / Forget due to careful locking of inode -> (inode, ichild) + retrying. Add description of Forget semantic and what happens when we receive forget for a directory for which children are not forgotten yet. ( it is too late here now and I did not check the implementation with a fresh head. I thought that it is better to release current state for discussion as I likely won't be able to work on newapi for at least another week ) - use atomics in DefaultNode setInode/inode; see Lookup and corresponding description nearby DefaultNode.setInode for why it is needed. - inode.{lookupCount,nodeID} are now protected not by global rawBridge.mu, but instead by inode.mu . - change Node operations to return Nodes, not Inodes. In particular Node.Lookup should now return Node. Inodes are internal index of nodefs VFS (see package description) and we should not load filesystem implementations to think about them where we can. Also it makes a more closed interface when filesystem works in terms of nodes completely. Also this way we offload filesystems from caring about tricky details of how to create inode for a hardlinked entry (see Lookup for details on how it is handled) - Remove Node.Inode -> nodefs.InodeOf(Node). this way there will be no possibility to override Node.Inode and we can specify InodeOf semantic exactly in API docs. - unlockNodes: sort is not needed - lock/unlock Nodes: avoid duplicates (e.g. there can be duplicates if dir/a and dir/b are hardlinks to the same file. If we don't avoid duplicates lockNodes will deadlock) - made some other edits, part of them not complete...
7e1f11bc · Kirill Smelkov · 9d7cb89b · 7e1f11bc · 7e1f11bc · 7e1f11bc
Commit 7e1f11bc authored Feb 15, 2019 by Kirill Smelkov
6 changed files
--- a/nodefs/api.go
+++ b/nodefs/api.go
@@ -2,6 +2,48 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
+// Package nodefs provides infrastructure to build tree-organized filesystems.
+//
+// A tree-organized filesystem is similar to UNIX or Plan 9 filesystem: it
+// consists of nodes with each node being either a file or a directory. Files
+// are located at tree leaves. A directory node can have other nodes as its
+// children and refers to each child by a name unique within the directory.
+// There can be several paths leading from tree root to a particular node,
+// known as hard-linking, for example
+//
+//	    root
+//	    /  \
+//	  dir1 dir2
+//	    \  /
+//	    file
+//
+// A /-separated string path describes location of a node in the tree. For example
+//
+//	/dir1/file
+//
+// describes path root → dir1 → file.
+//
+// Each node is associated with integer ID uniquely identifying the node
+// throughout filesystem. The tree-level structure of any filesystem is
+// expressed through index-nodes (also known as "inode", see Inode) which
+// describe parent/child relations between nodes and node-ID association.
+//
+// A particular filesystem should provide nodes with filesystem operations
+// implemented as defined by Node interface. When filesystem is mounted, its
+// root Node is associated with root of the tree, and the tree is further built
+// lazily when nodefs infrastructure needs to look up children of nodes to
+// process client requests. For every new node, the filesystem infrastructure
+// automatically builds new index node and links it in the filesystem tree.
+// InodeOf can be used to get particular Inode associated with a Node.
+//
+// XXX ^^^ inodes cleaned on cache clean (FORGET).
+//
+// XXX describe how to mount.
+//
+// XXX node example with Lookup.
+//
+// XXX describe how to pre-add nodes to tree.
+//
 package nodefs
 import (
@@ -11,6 +53,15 @@ import (
 	"github.com/hanwen/go-fuse/fuse"
 )
+// InodeOf returns index-node associated with filesystem node.
+//
+// The identity of the Inode does not change over the lifetime of
+// the node object.
+func InodeOf(node Node) *Inode {
+	return node.inode()
+}
 /*
 NOSUBMIT: how to structure?
@@ -19,23 +70,27 @@ NOSUBMIT: how to structure?
 - one giant interface?
 - use raw types as args rather than mimicking Golang signatures?
+Every Node implementation must directly or indirectly embed DefaultNode.
 */
 type Node interface {
-	// setInode links the Inode to a Node.
+	// setInode and inode are used by nodefs internally to link Inode to a Node.
-	setInode(*Inode)
+	//
+	// When a new Node instance is created, e.g. on Lookup, it has nil Inode.
-	// Inode must return a non-nil associated inode structure. The
+	// Nodefs infrastructure will notice this and associate created Node with new Inode.
-	// identity of the Inode may not change over the lifetime of
+	//
-	// the object.
+	// See InodeOf for public API to retrieve an inode from Node.
-	Inode() *Inode
+	inode() *Inode
+	setInode(*Inode) (set bool)
-	// Lookup finds a child Inode. If a new Inode must be created,
-	// the inode does not have to be added to the tree.
+	// Lookup should find a direct child of the node by child name.
-	Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*Inode, fuse.Status)
+	//
+	// VFS makes sure to call Lookup only once for particular (node, name)
+	// pair.
+	Lookup(ctx context.Context, name string, out *fuse.EntryOut) (Node, fuse.Status)
 	Open(ctx context.Context, flags uint32) (fh File, fuseFlags uint32, code fuse.Status)
-	Create(ctx context.Context, name string, flags uint32, mode uint32) (inode *Inode, fh File, fuseFlags uint32, code fuse.Status)
+	Create(ctx context.Context, name string, flags uint32, mode uint32) (node Node, fh File, fuseFlags uint32, code fuse.Status)
 	Read(ctx context.Context, f File, dest []byte, off int64) (fuse.ReadResult, fuse.Status)

--- a/nodefs/bridge.go
+++ b/nodefs/bridge.go
--- a/nodefs/default.go
+++ b/nodefs/default.go
@@ -6,22 +6,47 @@ package nodefs
 import (
 	"context"
+	"sync/atomic"
 	"time"
+	"unsafe"
 	"github.com/hanwen/go-fuse/fuse"
 )
-// DefaultNode must be embedded in a Node implementation.
+// DefaultNode provides common base Node functionality.
+//
+// It must be embedded in any Node implementation.
 type DefaultNode struct {
-	inode *Inode
+	inode_ *Inode
 }
-func (dn *DefaultNode) setInode(n *Inode) {
+// set/retrieve inode.
-	dn.inode = n
+//
-}
+// The node -> inode association can be attempted to be set simultaneously, e.g. if for
+//
-func (dn *DefaultNode) Inode() *Inode {
+//	    root
-	return dn.inode
+//	    /  \
+//	  dir1 dir2
+//	    \  /
+//	    file
+//
+// dir1.Lookup("file") and dir2.Lookup("file") are executed simultaneously.
+//
+// We use atomics so that only one set can win, and rawBridge.Lookup takes care
+// to cancel the inode that lost.
+//
+// To read node.inode atomic.LoadPointer is used, however it is not expensive
+// since it translates to regular MOVQ on amd64.
+func (dn *DefaultNode) setInode(inode *Inode) bool {
+	return atomic.CompareAndSwapPointer(
+		(*unsafe.Pointer)(unsafe.Pointer(&dn.inode_)),
+		nil, unsafe.Pointer(inode))
+}
+func (dn *DefaultNode) inode() *Inode {
+	return (*Inode)(atomic.LoadPointer(
+		(*unsafe.Pointer)(unsafe.Pointer(&dn.inode_))))
 }
 func (n *DefaultNode) Read(ctx context.Context, f File, dest []byte, off int64) (fuse.ReadResult, fuse.Status) {

--- a/nodefs/inode.go
+++ b/nodefs/inode.go
@@ -34,38 +34,93 @@ type Inode struct {
 	// Following data is mutable.
-	// Protected by bridge.mu
-	lookupCount uint64
-	nodeID      uint64
 	// mu protects the following mutable fields. When locking
 	// multiple Inodes, locks must be acquired using
 	// lockNodes/unlockNodes
 	mu sync.Mutex
-	// incremented every time the 'children' or 'parents' field is changed.
+	// changeCounter increments every time the below mutable state
+	// (lookupCount, nodeID, children, parents) is modified.
+	//
+	// This is used in places where we have to relock an inode as part of an
+	// inode group lock: after locking the group we have to check whether the
+	// inode has changed in the meantime, and if it has - retry the operation.
 	changeCounter uint32
+	lookupCount uint64
+	// ID of the inode; 0 if inode was forgotten.
+	// forgotten inodes are unlinked from parent and children, but could be
+	// still not yet removed from bridge.nodes .
+	nodeID      uint64
 	children      map[string]*Inode
 	parents       map[parentData]struct{}
 }
+// newInode creates a new inode pointing to node.
+//
+// The node -> inode association is NOT set.
+// The inode does _not_ yet have nodeID or bridge set and is not linked into the tree.
+func newInode(node Node, mode uint32) *Inode {
+	inode := &Inode{
+		mode:    mode ^ 07777,
+		node:    node,
+		//bridge:  n.bridge,
+		parents: make(map[parentData]struct{}),
+	}
+	if mode&fuse.S_IFDIR != 0 {
+		inode.children = make(map[string]*Inode)
+	}
+	//node.setInode(ch)
+	return inode
+}
+// sortNodes rearranges inode group in consistent order.
+//
+// The nodes are ordered by their in-RAM address, which gives consistency
+// property: for any A and B inodes, sortNodes will either always order A < B,
+// or always order A > B.
+//
+// See lockNodes where this property is used to avoid deadlock when taking
+// locks on inode group.
 func sortNodes(ns []*Inode) {
 	sort.Slice(ns, func(i, j int) bool {
 		return uintptr(unsafe.Pointer(ns[i])) < uintptr(unsafe.Pointer(ns[j]))
 	})
 }
+// lockNodes locks group of inodes.
+//
+// It always locks the inodes in the same order - to avoid deadlocks.
+// It also avoids locking an inode more than once, if it was specified multiple times.
+// An example when an inode might be given multiple times is if dir/a and dir/b
+// are hardlinked to the same inode and the caller needs to take locks on dir children.
+//
+// It is valid to give nil nodes - those are simply ignored.
 func lockNodes(ns ...*Inode) {
 	sortNodes(ns)
+	var nprev *Inode
 	for _, n := range ns {
-		n.mu.Lock()
+		if n != nprev {
+			n.mu.Lock()
+			nprev = n
+		}
 	}
 }
+// unlockNodes releases locks taken by lockNodes.
 func unlockNodes(ns ...*Inode) {
+	// we don't need to unlock in the same order that was used in lockNodes.
+	// however it still helps to have nodes sorted to avoid duplicates.
 	sortNodes(ns)
+	var nprev *Inode
 	for _, n := range ns {
-		n.mu.Unlock()
+		if n != nprev {
+			n.mu.Unlock()
+			nprev = n
+		}
 	}
 }
@@ -74,9 +129,14 @@ func unlockNodes(ns ...*Inode) {
 // kernel has no way of reviving forgotten nodes by its own
 // initiative.
 func (n *Inode) Forgotten() bool {
+	/*
 	n.bridge.mu.Lock()
 	defer n.bridge.mu.Unlock()
 	return n.lookupCount == 0
+	*/
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	return n.nodeID == 0
 }
 // Node returns the Node object implementing the file system operations.
@@ -155,14 +215,24 @@ func (n *Inode) FindChildByOpaqueID(name string, opaqueID uint64) *Inode {
 	return nil
 }
-func (n *Inode) addLookup(name string, child *Inode) {
+// setEntry does `iparent[name] = ichild` linking.
-	child.lookupCount++
+//
-	child.parents[parentData{name, n}] = struct{}{}
+// setEntry must not be called simultaneously for any of iparent or ichild.
-	n.children[name] = child
+// This, for example could be satisfied if both iparent and ichild are locked,
-	child.changeCounter++
+// but it could be also valid if only iparent is locked and ichild was just
-	n.changeCounter++
+// created and only one goroutine keeps referencing it.
+//
+// XXX also ichild.lookupCount++ ?
+func (iparent *Inode) setEntry(name string, ichild *Inode) {
+//	ichild.lookupCount++
+	ichild.parents[parentData{name, iparent}] = struct{}{}
+	iparent.children[name] = ichild
+	ichild.changeCounter++
+	iparent.changeCounter++
 }
+// XXX kill
+/*
 func (n *Inode) clearParents() {
 	for {
 		lockme := []*Inode{n}
@@ -191,7 +261,10 @@ func (n *Inode) clearParents() {
 		}
 	}
 }
+*/
+// XXX kill
+/*
 func (n *Inode) clearChildren() {
 	if n.mode != fuse.S_IFDIR {
 		return
@@ -226,12 +299,18 @@ func (n *Inode) clearChildren() {
 		}
 	}
+	// XXX not right - we cannot fully clear our children, because they can
+	// be also children of another directory.
+	//
+	// XXX also not right - the kernel can send FORGET(idir) but keep
+	// references to children inodes.
 	for _, ch := range lockme {
 		if ch != n {
 			ch.clearChildren()
 		}
 	}
 }
+*/
 // NewPersistentInode returns an Inode with a LookupCount == 1, ie. the
 // node will only get garbage collected if the kernel issues a forget

--- a/nodefs/loopback.go
+++ b/nodefs/loopback.go
@@ -37,11 +37,11 @@ type loopbackNode struct {
 }
 func (n *loopbackNode) path() string {
-	path := n.Inode().Path(nil)
+	path := InodeOf(n).Path(nil)
 	return filepath.Join(n.rootNode.root, path)
 }
-func (n *loopbackNode) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*Inode, fuse.Status) {
+func (n *loopbackNode) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (Node, fuse.Status) {
 	p := filepath.Join(n.path(), name)
 	st := syscall.Stat_t{}
@@ -52,17 +52,18 @@ func (n *loopbackNode) Lookup(ctx context.Context, name string, out *fuse.EntryO
 	out.Attr.FromStat(&st)
-	ch := n.Inode().FindChildByOpaqueID(name, out.Attr.Ino)
+	ch := InodeOf(n).FindChildByOpaqueID(name, out.Attr.Ino)
 	if ch != nil {
-		return ch, fuse.OK
+		return ch.Node(), fuse.OK
 	}
 	node := &loopbackNode{rootNode: n.rootNode}
-	ch = n.Inode().NewInode(node, out.Attr.Mode, out.Attr.Ino)
+	return node, fuse.OK
-	return ch, fuse.OK
+//	ch = n.Inode().NewInode(node, out.Attr.Mode, out.Attr.Ino)
+//	return ch, fuse.OK
 }
-func (n *loopbackNode) Create(ctx context.Context, name string, flags uint32, mode uint32) (inode *Inode, fh File, fuseFlags uint32, code fuse.Status) {
+func (n *loopbackNode) Create(ctx context.Context, name string, flags uint32, mode uint32) (node Node, fh File, fuseFlags uint32, code fuse.Status) {
 	p := filepath.Join(n.path(), name)
 	f, err := os.OpenFile(p, int(flags)|os.O_CREATE, os.FileMode(mode))
@@ -76,9 +77,9 @@ func (n *loopbackNode) Create(ctx context.Context, name string, flags uint32, mo
 		return nil, nil, 0, fuse.ToStatus(err)
 	}
-	node := &loopbackNode{rootNode: n.rootNode}
+	node = &loopbackNode{rootNode: n.rootNode}
-	ch := n.Inode().NewInode(node, st.Mode, st.Ino)
+//	ch := n.Inode().NewInode(node, st.Mode, st.Ino)
-	return ch, NewLoopbackFile(f), 0, fuse.OK
+	return node, NewLoopbackFile(f), 0, fuse.OK
 }
 func (n *loopbackNode) Open(ctx context.Context, flags uint32) (fh File, fuseFlags uint32, code fuse.Status) {

--- a/nodefs/simple_test.go
+++ b/nodefs/simple_test.go
@@ -6,10 +6,10 @@ package nodefs
 import (
 	"bytes"
-	"io"
 	"io/ioutil"
 	"os"
 	"path/filepath"
+	"runtime"
 	"testing"
 	"time"