Commit 8129ed29 authored by Oleg Nesterov's avatar Oleg Nesterov

change sb_writers to use percpu_rw_semaphore

We can remove everything from struct sb_writers except frozen
and add the array of percpu_rw_semaphore's instead.

This patch doesn't remove sb_writers->wait_unfrozen yet, we keep
it for get_super_thawed(). We will probably remove it later.

This change tries to address the following problems:

	- Firstly, __sb_start_write() looks simply buggy. It does
	  __sb_end_write() if it sees ->frozen, but if it migrates
	  to another CPU before percpu_counter_dec(), sb_wait_write()
	  can wrongly succeed if there is another task which holds
	  the same "semaphore": sb_wait_write() can miss the result
	  of the previous percpu_counter_inc() but see the result
	  of this percpu_counter_dec().

	- As Dave Hansen reports, it is suboptimal. The trivial
	  microbenchmark that writes to a tmpfs file in a loop runs
	  12% faster if we change this code to rely on RCU and kill
	  the memory barriers.

	- This code doesn't look simple. It would be better to rely
	  on the generic locking code.

	  According to Dave, this change adds the same performance
	  improvement.

Note: with this change both freeze_super() and thaw_super() will do
synchronize_sched_expedited() 3 times. This is just ugly. But:

	- This will be "fixed" by the rcu_sync changes we are going
	  to merge. After that freeze_super()->percpu_down_write()
	  will use synchronize_sched(), and thaw_super() won't use
	  synchronize() at all.

	  This doesn't need any changes in fs/super.c.

	- Once we merge rcu_sync changes, we can also change super.c
	  so that all wb_write->rw_sem's will share the single ->rss
	  in struct sb_writes, then freeze_super() will need only one
	  synchronize_sched().
Signed-off-by: default avatarOleg Nesterov <oleg@redhat.com>
Reviewed-by: default avatarJan Kara <jack@suse.com>
parent 853b39a7
...@@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work) ...@@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work)
int i; int i;
for (i = 0; i < SB_FREEZE_LEVELS; i++) for (i = 0; i < SB_FREEZE_LEVELS; i++)
percpu_counter_destroy(&s->s_writers.counter[i]); percpu_free_rwsem(&s->s_writers.rw_sem[i]);
kfree(s); kfree(s);
} }
...@@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) ...@@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
goto fail; goto fail;
for (i = 0; i < SB_FREEZE_LEVELS; i++) { for (i = 0; i < SB_FREEZE_LEVELS; i++) {
if (percpu_counter_init(&s->s_writers.counter[i], 0, if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
GFP_KERNEL) < 0) sb_writers_name[i],
&type->s_writers_key[i]))
goto fail; goto fail;
lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
&type->s_writers_key[i], 0);
} }
init_waitqueue_head(&s->s_writers.wait);
init_waitqueue_head(&s->s_writers.wait_unfrozen); init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info; s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags; s->s_flags = flags;
...@@ -1161,47 +1159,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) ...@@ -1161,47 +1159,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
*/ */
void __sb_end_write(struct super_block *sb, int level) void __sb_end_write(struct super_block *sb, int level)
{ {
percpu_counter_dec(&sb->s_writers.counter[level-1]); percpu_up_read(sb->s_writers.rw_sem + level-1);
/*
* Make sure s_writers are updated before we wake up waiters in
* freeze_super().
*/
smp_mb();
if (waitqueue_active(&sb->s_writers.wait))
wake_up(&sb->s_writers.wait);
rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
} }
EXPORT_SYMBOL(__sb_end_write); EXPORT_SYMBOL(__sb_end_write);
static int do_sb_start_write(struct super_block *sb, int level, bool wait,
unsigned long ip)
{
if (wait)
rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip);
retry:
if (unlikely(sb->s_writers.frozen >= level)) {
if (!wait)
return 0;
wait_event(sb->s_writers.wait_unfrozen,
sb->s_writers.frozen < level);
}
percpu_counter_inc(&sb->s_writers.counter[level-1]);
/*
* Make sure counter is updated before we check for frozen.
* freeze_super() first sets frozen and then checks the counter.
*/
smp_mb();
if (unlikely(sb->s_writers.frozen >= level)) {
__sb_end_write(sb, level);
goto retry;
}
if (!wait)
rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip);
return 1;
}
/* /*
* This is an internal function, please use sb_start_{write,pagefault,intwrite} * This is an internal function, please use sb_start_{write,pagefault,intwrite}
* instead. * instead.
...@@ -1209,7 +1170,7 @@ static int do_sb_start_write(struct super_block *sb, int level, bool wait, ...@@ -1209,7 +1170,7 @@ static int do_sb_start_write(struct super_block *sb, int level, bool wait,
int __sb_start_write(struct super_block *sb, int level, bool wait) int __sb_start_write(struct super_block *sb, int level, bool wait)
{ {
bool force_trylock = false; bool force_trylock = false;
int ret; int ret = 1;
#ifdef CONFIG_LOCKDEP #ifdef CONFIG_LOCKDEP
/* /*
...@@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) ...@@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
int i; int i;
for (i = 0; i < level - 1; i++) for (i = 0; i < level - 1; i++)
if (lock_is_held(&sb->s_writers.lock_map[i])) { if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
force_trylock = true; force_trylock = true;
break; break;
} }
} }
#endif #endif
ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_); if (wait && !force_trylock)
percpu_down_read(sb->s_writers.rw_sem + level-1);
else
ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
WARN_ON(force_trylock & !ret); WARN_ON(force_trylock & !ret);
return ret; return ret;
} }
...@@ -1243,15 +1208,11 @@ EXPORT_SYMBOL(__sb_start_write); ...@@ -1243,15 +1208,11 @@ EXPORT_SYMBOL(__sb_start_write);
* @level: type of writers we wait for (normal vs page fault) * @level: type of writers we wait for (normal vs page fault)
* *
* This function waits until there are no writers of given type to given file * This function waits until there are no writers of given type to given file
* system. Caller of this function should make sure there can be no new writers * system.
* of type @level before calling this function. Otherwise this function can
* livelock.
*/ */
static void sb_wait_write(struct super_block *sb, int level) static void sb_wait_write(struct super_block *sb, int level)
{ {
s64 writers; percpu_down_write(sb->s_writers.rw_sem + level-1);
rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
/* /*
* We are going to return to userspace and forget about this lock, the * We are going to return to userspace and forget about this lock, the
* ownership goes to the caller of thaw_super() which does unlock. * ownership goes to the caller of thaw_super() which does unlock.
...@@ -1262,24 +1223,18 @@ static void sb_wait_write(struct super_block *sb, int level) ...@@ -1262,24 +1223,18 @@ static void sb_wait_write(struct super_block *sb, int level)
* this leads to lockdep false-positives, so currently we do the early * this leads to lockdep false-positives, so currently we do the early
* release right after acquire. * release right after acquire.
*/ */
rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
}
do {
DEFINE_WAIT(wait);
/* static void sb_freeze_unlock(struct super_block *sb)
* We use a barrier in prepare_to_wait() to separate setting {
* of frozen and checking of the counter int level;
*/
prepare_to_wait(&sb->s_writers.wait, &wait,
TASK_UNINTERRUPTIBLE);
writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); for (level = 0; level < SB_FREEZE_LEVELS; ++level)
if (writers) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
schedule();
finish_wait(&sb->s_writers.wait, &wait); for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
} while (writers); percpu_up_write(sb->s_writers.rw_sem + level);
} }
/** /**
...@@ -1338,20 +1293,14 @@ int freeze_super(struct super_block *sb) ...@@ -1338,20 +1293,14 @@ int freeze_super(struct super_block *sb)
return 0; return 0;
} }
/* From now on, no new normal writers can start */
sb->s_writers.frozen = SB_FREEZE_WRITE; sb->s_writers.frozen = SB_FREEZE_WRITE;
smp_wmb();
/* Release s_umount to preserve sb_start_write -> s_umount ordering */ /* Release s_umount to preserve sb_start_write -> s_umount ordering */
up_write(&sb->s_umount); up_write(&sb->s_umount);
sb_wait_write(sb, SB_FREEZE_WRITE); sb_wait_write(sb, SB_FREEZE_WRITE);
down_write(&sb->s_umount);
/* Now we go and block page faults... */ /* Now we go and block page faults... */
down_write(&sb->s_umount);
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
smp_wmb();
sb_wait_write(sb, SB_FREEZE_PAGEFAULT); sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
/* All writers are done so after syncing there won't be dirty data */ /* All writers are done so after syncing there won't be dirty data */
...@@ -1359,7 +1308,6 @@ int freeze_super(struct super_block *sb) ...@@ -1359,7 +1308,6 @@ int freeze_super(struct super_block *sb)
/* Now wait for internal filesystem counter */ /* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS; sb->s_writers.frozen = SB_FREEZE_FS;
smp_wmb();
sb_wait_write(sb, SB_FREEZE_FS); sb_wait_write(sb, SB_FREEZE_FS);
if (sb->s_op->freeze_fs) { if (sb->s_op->freeze_fs) {
...@@ -1368,7 +1316,7 @@ int freeze_super(struct super_block *sb) ...@@ -1368,7 +1316,7 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR printk(KERN_ERR
"VFS:Filesystem freeze failed\n"); "VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN; sb->s_writers.frozen = SB_UNFROZEN;
smp_wmb(); sb_freeze_unlock(sb);
wake_up(&sb->s_writers.wait_unfrozen); wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb); deactivate_locked_super(sb);
return ret; return ret;
...@@ -1400,8 +1348,10 @@ int thaw_super(struct super_block *sb) ...@@ -1400,8 +1348,10 @@ int thaw_super(struct super_block *sb)
return -EINVAL; return -EINVAL;
} }
if (sb->s_flags & MS_RDONLY) if (sb->s_flags & MS_RDONLY) {
sb->s_writers.frozen = SB_UNFROZEN;
goto out; goto out;
}
if (sb->s_op->unfreeze_fs) { if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb); error = sb->s_op->unfreeze_fs(sb);
...@@ -1413,12 +1363,11 @@ int thaw_super(struct super_block *sb) ...@@ -1413,12 +1363,11 @@ int thaw_super(struct super_block *sb)
} }
} }
out:
sb->s_writers.frozen = SB_UNFROZEN; sb->s_writers.frozen = SB_UNFROZEN;
smp_wmb(); sb_freeze_unlock(sb);
out:
wake_up(&sb->s_writers.wait_unfrozen); wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb); deactivate_locked_super(sb);
return 0; return 0;
} }
EXPORT_SYMBOL(thaw_super); EXPORT_SYMBOL(thaw_super);
#ifndef _LINUX_FS_H #ifndef _LINUX_FS_H
#define _LINUX_FS_H #define _LINUX_FS_H
#include <linux/linkage.h> #include <linux/linkage.h>
#include <linux/wait.h> #include <linux/wait.h>
#include <linux/kdev_t.h> #include <linux/kdev_t.h>
...@@ -31,6 +30,7 @@ ...@@ -31,6 +30,7 @@
#include <linux/percpu-rwsem.h> #include <linux/percpu-rwsem.h>
#include <linux/blk_types.h> #include <linux/blk_types.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/percpu-rwsem.h>
#include <asm/byteorder.h> #include <asm/byteorder.h>
#include <uapi/linux/fs.h> #include <uapi/linux/fs.h>
...@@ -1275,16 +1275,9 @@ enum { ...@@ -1275,16 +1275,9 @@ enum {
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
struct sb_writers { struct sb_writers {
/* Counters for counting writers at each level */ int frozen; /* Is sb frozen? */
struct percpu_counter counter[SB_FREEZE_LEVELS]; wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */
wait_queue_head_t wait; /* queue for waiting for struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
writers / faults to finish */
int frozen; /* Is sb frozen? */
wait_queue_head_t wait_unfrozen; /* queue for waiting for
sb to be thawed */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map lock_map[SB_FREEZE_LEVELS];
#endif
}; };
struct super_block { struct super_block {
...@@ -1393,9 +1386,9 @@ void __sb_end_write(struct super_block *sb, int level); ...@@ -1393,9 +1386,9 @@ void __sb_end_write(struct super_block *sb, int level);
int __sb_start_write(struct super_block *sb, int level, bool wait); int __sb_start_write(struct super_block *sb, int level, bool wait);
#define __sb_writers_acquired(sb, lev) \ #define __sb_writers_acquired(sb, lev) \
rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_writers_release(sb, lev) \ #define __sb_writers_release(sb, lev) \
rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
/** /**
* sb_end_write - drop write access to a superblock * sb_end_write - drop write access to a superblock
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment