Commit 91b79ba7 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] separate locking for vfsmounts

From: Maneesh Soni <maneesh@in.ibm.com>

While path walking we do follow_mount or follow_down, which use
dcache_lock for serialisation.  vfsmount-related operations also use
dcache_lock for all updates. I think we can use a separate lock for
vfsmount work and so improve path walking.

The following two patches do this. The first one replaces
dcache_lock with a new vfsmount_lock in namespace.c. The lock is
local to namespace.c and is not required outside it. The second patch
uses RCU to get a lock-free lookup_mnt(). The patches are quite simple
and straightforward.
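
In outline, the first patch moves reference acquisition into lookup_mnt()
itself, so the path-walk helpers never need dcache_lock. A condensed sketch
of the new lookup (not the literal patch; the real hunk in the diff below
open-codes the hash-chain walk instead of using list_for_each_entry):

    struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
    {
            struct list_head *head = mount_hashtable + hash(mnt, dentry);
            struct vfsmount *p, *found = NULL;

            spin_lock(&vfsmount_lock);
            list_for_each_entry(p, head, mnt_hash) {
                    if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
                            /* take the ref while the lock pins *p */
                            found = mntget(p);
                            break;
                    }
            }
            spin_unlock(&vfsmount_lock);
            return found;   /* caller owns a reference, no lock held */
    }

Since the reference is taken under vfsmount_lock, callers can consume the
result without holding any lock at all.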

The lockmeter results show reduced contention and fewer lock acquisitions
for dcache_lock while running dcachebench* on a 4-way SMP box:

    SPINLOCKS         HOLD            WAIT
    UTIL  CON    MEAN(  MAX )   MEAN(  MAX )(% CPU)     TOTAL NOWAIT SPIN RJECT  NAME

  baselkm-2569:
    20.7% 20.9%  0.5us( 146us)  2.9us( 144us)(0.81%)  31590840 79.1% 20.9%    0%  dcache_lock
  mntlkm-2569:
    14.3% 13.6%  0.4us( 170us)  2.9us( 187us)(0.42%)  23071746 86.4% 13.6%    0%  dcache_lock

We get more than 8% improvement on the 4-way SMP box and 44% improvement on a
16-way NUMAQ while running dcachebench*.

		Average (usecs/iteration)	Std. Deviation
		(lower is better)
4-way SMP
  2.5.69	15739.3				470.90
  2.5.69-mnt	14459.6				298.51

16-way NUMAQ
  2.5.69	120426.5			363.78
  2.5.69-mnt	 63225.8			427.60

*dcachebench is a microbenchmark written by Bill Hartner and is available at
http://www-124.ibm.com/developerworks/opensource/linuxperf/dcachebench/dcachebench.html

 vfsmount_lock.patch
 -------------------
 - Patch replacing dcache_lock with the new vfsmount_lock for all
   mount-related operations. This removes the need to take dcache_lock
   while doing follow_mount or follow_down operations in path walking.
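
The caller side then collapses as below (condensed from the fs/namei.c hunk
further down; the res handling follows the patch):

    static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
    {
            int res = 0;

            while (d_mountpoint(*dentry)) {
                    /* returns an already-mntget()ed vfsmount, or NULL */
                    struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
                    if (!mounted)
                            break;
                    *mnt = mounted;                 /* ref came from lookup_mnt() */
                    dput(*dentry);
                    mntput(mounted->mnt_parent);    /* drop ref on the crossed mount */
                    *dentry = dget(mounted->mnt_root);
                    res = 1;
            }
            return res;
    }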

I re-ran dcachebench with 2.5.70 as the base on a 16-way NUMAQ box.

			Average (usecs/iteration)	Std. Deviation
			(lower is better)
16-way NUMAQ
2.5.70				120710.9		230.67
 + vfsmount_lock.patch		 65209.6		242.97
    + lookup_mnt-rcu.patch	 64042.3		416.61

So just the lock splitting (vfsmount_lock.patch) gives almost all of the benefit.
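
(The lookup_mnt-rcu.patch itself is not shown in this commit. Roughly, it
turns the reader side of lookup_mnt() into an RCU read-side critical section,
along the lines of the sketch below; vfsmount_lock would still serialise
updates, and vfsmounts would have to be freed via an RCU grace period for the
mntget() to be safe. The generic RCU list helper is used here purely for
illustration and is not necessarily what the patch used.)

    struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
    {
            struct list_head *head = mount_hashtable + hash(mnt, dentry);
            struct vfsmount *p, *found = NULL;

            rcu_read_lock();                /* no spinlock on the read side */
            list_for_each_entry_rcu(p, head, mnt_hash) {
                    if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
                            found = mntget(p);  /* needs RCU-deferred frees */
                            break;
                    }
            }
            rcu_read_unlock();
            return found;
    }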
parent 679c40a8
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1451,19 +1451,24 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
 int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry)
 {
         int result;
+        unsigned long seq;
 
         result = 0;
-        for (;;) {
-                if (new_dentry != old_dentry) {
-                        struct dentry * parent = new_dentry->d_parent;
-                        if (parent == new_dentry)
-                                break;
-                        new_dentry = parent;
-                        continue;
+        do {
+                seq = read_seqbegin(&rename_lock);
+                for (;;) {
+                        if (new_dentry != old_dentry) {
+                                struct dentry * parent = new_dentry->d_parent;
+                                if (parent == new_dentry)
+                                        break;
+                                new_dentry = parent;
+                                continue;
+                        }
+                        result = 1;
+                        break;
                 }
-                result = 1;
-                break;
-        }
+        } while (read_seqretry(&rename_lock, seq));
         return result;
 }
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -434,19 +434,17 @@ int follow_up(struct vfsmount **mnt, struct dentry **dentry)
         return 1;
 }
 
+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
 static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 {
         int res = 0;
         while (d_mountpoint(*dentry)) {
-                struct vfsmount *mounted;
-                spin_lock(&dcache_lock);
-                mounted = lookup_mnt(*mnt, *dentry);
-                if (!mounted) {
-                        spin_unlock(&dcache_lock);
+                struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+                if (!mounted)
                         break;
-                }
-                *mnt = mntget(mounted);
-                spin_unlock(&dcache_lock);
+                *mnt = mounted;
                 dput(*dentry);
                 mntput(mounted->mnt_parent);
                 *dentry = dget(mounted->mnt_root);
@@ -455,21 +453,21 @@ static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
         return res;
 }
 
+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
 static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
 {
         struct vfsmount *mounted;
 
-        spin_lock(&dcache_lock);
         mounted = lookup_mnt(*mnt, *dentry);
         if (mounted) {
-                *mnt = mntget(mounted);
-                spin_unlock(&dcache_lock);
+                *mnt = mounted;
                 dput(*dentry);
                 mntput(mounted->mnt_parent);
                 *dentry = dget(mounted->mnt_root);
                 return 1;
         }
-        spin_unlock(&dcache_lock);
         return 0;
 }
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -26,6 +26,8 @@
 extern int __init init_rootfs(void);
 extern int __init sysfs_init(void);
 
+/* spinlock for vfsmount related operations, inplace of dcache_lock */
+spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+
 static struct list_head *mount_hashtable;
 static int hash_mask, hash_bits;
 static kmem_cache_t *mnt_cache;
@@ -66,30 +68,38 @@ void free_vfsmnt(struct vfsmount *mnt)
         kmem_cache_free(mnt_cache, mnt);
 }
 
+/*
+ * Now, lookup_mnt increments the ref count before returning
+ * the vfsmount struct.
+ */
 struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
         struct list_head * head = mount_hashtable + hash(mnt, dentry);
         struct list_head * tmp = head;
-        struct vfsmount *p;
+        struct vfsmount *p, *found = NULL;
 
+        spin_lock(&vfsmount_lock);
         for (;;) {
                 tmp = tmp->next;
                 p = NULL;
                 if (tmp == head)
                         break;
                 p = list_entry(tmp, struct vfsmount, mnt_hash);
-                if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry)
+                if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+                        found = mntget(p);
                         break;
+                }
         }
-        return p;
+        spin_unlock(&vfsmount_lock);
+        return found;
 }
 
 static int check_mnt(struct vfsmount *mnt)
 {
-        spin_lock(&dcache_lock);
+        spin_lock(&vfsmount_lock);
         while (mnt->mnt_parent != mnt)
                 mnt = mnt->mnt_parent;
-        spin_unlock(&dcache_lock);
+        spin_unlock(&vfsmount_lock);
         return mnt == current->namespace->root;
 }
@@ -263,15 +273,15 @@ void umount_tree(struct vfsmount *mnt)
                 mnt = list_entry(kill.next, struct vfsmount, mnt_list);
                 list_del_init(&mnt->mnt_list);
                 if (mnt->mnt_parent == mnt) {
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&vfsmount_lock);
                 } else {
                         struct nameidata old_nd;
                         detach_mnt(mnt, &old_nd);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&vfsmount_lock);
                         path_release(&old_nd);
                 }
                 mntput(mnt);
-                spin_lock(&dcache_lock);
+                spin_lock(&vfsmount_lock);
         }
 }
@@ -324,17 +334,17 @@ static int do_umount(struct vfsmount *mnt, int flags)
         }
 
         down_write(&current->namespace->sem);
-        spin_lock(&dcache_lock);
+        spin_lock(&vfsmount_lock);
 
         if (atomic_read(&sb->s_active) == 1) {
                 /* last instance - try to be smart */
-                spin_unlock(&dcache_lock);
+                spin_unlock(&vfsmount_lock);
                 lock_kernel();
                 DQUOT_OFF(sb);
                 acct_auto_close(sb);
                 unlock_kernel();
                 security_sb_umount_close(mnt);
-                spin_lock(&dcache_lock);
+                spin_lock(&vfsmount_lock);
         }
         retval = -EBUSY;
         if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
@@ -342,7 +352,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
                 umount_tree(mnt);
                 retval = 0;
         }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&vfsmount_lock);
         if (retval)
                 security_sb_umount_busy(mnt);
         up_write(&current->namespace->sem);
@@ -449,18 +459,18 @@ static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry)
                         q = clone_mnt(p, p->mnt_root);
                         if (!q)
                                 goto Enomem;
-                        spin_lock(&dcache_lock);
+                        spin_lock(&vfsmount_lock);
                         list_add_tail(&q->mnt_list, &res->mnt_list);
                         attach_mnt(q, &nd);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&vfsmount_lock);
                 }
         }
         return res;
 Enomem:
         if (res) {
-                spin_lock(&dcache_lock);
+                spin_lock(&vfsmount_lock);
                 umount_tree(res);
-                spin_unlock(&dcache_lock);
+                spin_unlock(&vfsmount_lock);
         }
         return NULL;
 }
@@ -485,7 +495,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
                 goto out_unlock;
 
         err = -ENOENT;
-        spin_lock(&dcache_lock);
+        spin_lock(&vfsmount_lock);
         if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
                 struct list_head head;
@@ -495,7 +505,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
                 mntget(mnt);
                 err = 0;
         }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&vfsmount_lock);
 out_unlock:
         up(&nd->dentry->d_inode->i_sem);
         if (!err)
@@ -532,9 +542,9 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
         if (mnt) {
                 err = graft_tree(mnt, nd);
                 if (err) {
-                        spin_lock(&dcache_lock);
+                        spin_lock(&vfsmount_lock);
                         umount_tree(mnt);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&vfsmount_lock);
                 } else
                         mntput(mnt);
         }
@@ -599,7 +609,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
         if (IS_DEADDIR(nd->dentry->d_inode))
                 goto out1;
 
-        spin_lock(&dcache_lock);
+        spin_lock(&vfsmount_lock);
         if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
                 goto out2;
@@ -623,7 +633,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
         detach_mnt(old_nd.mnt, &parent_nd);
         attach_mnt(old_nd.mnt, nd);
 out2:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&vfsmount_lock);
 out1:
         up(&nd->dentry->d_inode->i_sem);
 out:
@@ -804,9 +814,9 @@ int copy_namespace(int flags, struct task_struct *tsk)
         down_write(&tsk->namespace->sem);
         /* First pass: copy the tree topology */
         new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
-        spin_lock(&dcache_lock);
+        spin_lock(&vfsmount_lock);
         list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&vfsmount_lock);
 
         /* Second pass: switch the tsk->fs->* elements */
         if (fs) {
@@ -1027,7 +1037,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
         if (new_nd.mnt->mnt_root != new_nd.dentry)
                 goto out2; /* not a mountpoint */
         tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
-        spin_lock(&dcache_lock);
+        spin_lock(&vfsmount_lock);
         if (tmp != new_nd.mnt) {
                 for (;;) {
                         if (tmp->mnt_parent == tmp)
@@ -1044,7 +1054,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
         detach_mnt(user_nd.mnt, &root_parent);
         attach_mnt(user_nd.mnt, &old_nd);
         attach_mnt(new_nd.mnt, &root_parent);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&vfsmount_lock);
         chroot_fs_refs(&user_nd, &new_nd);
         security_sb_post_pivotroot(&user_nd, &new_nd);
         error = 0;
@@ -1061,7 +1071,7 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
         unlock_kernel();
         return error;
 out3:
-        spin_unlock(&dcache_lock);
+        spin_unlock(&vfsmount_lock);
         goto out2;
 }
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -307,20 +307,22 @@ static int proc_check_root(struct inode *inode)
         base = dget(current->fs->root);
         read_unlock(&current->fs->lock);
 
-        spin_lock(&dcache_lock);
+        spin_lock(&vfsmount_lock);
         de = root;
         mnt = vfsmnt;
 
         while (vfsmnt != our_vfsmnt) {
-                if (vfsmnt == vfsmnt->mnt_parent)
+                if (vfsmnt == vfsmnt->mnt_parent) {
+                        spin_unlock(&vfsmount_lock);
                         goto out;
+                }
                 de = vfsmnt->mnt_mountpoint;
                 vfsmnt = vfsmnt->mnt_parent;
         }
+        spin_unlock(&vfsmount_lock);
 
         if (!is_subdir(de, base))
                 goto out;
-        spin_unlock(&dcache_lock);
 
 exit:
         dput(base);
@@ -329,7 +331,6 @@ static int proc_check_root(struct inode *inode)
         mntput(mnt);
         return res;
 out:
-        spin_unlock(&dcache_lock);
         res = -EACCES;
         goto exit;
 }
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -54,6 +54,7 @@ extern void free_vfsmnt(struct vfsmount *mnt);
 extern struct vfsmount *alloc_vfsmnt(const char *name);
 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
                                       const char *name, void *data);
+extern spinlock_t vfsmount_lock;
 
 #endif
 #endif /* _LINUX_MOUNT_H */