Commit 15cf3b7a authored by Al Viro


Merge branch 'sb_writers_pcpu_rwsem' of git://git.kernel.org/pub/scm/linux/kernel/git/oleg/misc into for-next
parents 2c6625cd 8129ed29
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -87,7 +87,6 @@ config KPROBES_ON_FTRACE
 
 config UPROBES
 	def_bool n
-	select PERCPU_RWSEM
 	help
 	  Uprobes is the user-space counterpart to kprobes: they
 	  enable instrumentation applications (such as 'perf probe')
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1638,9 +1638,7 @@ static void do_async_commit(struct work_struct *work)
 	 * Tell lockdep about it.
 	 */
 	if (ac->newtrans->type & __TRANS_FREEZABLE)
-		rwsem_acquire_read(
-			&ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			0, 1, _THIS_IP_);
+		__sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS);
 
 	current->journal_info = ac->newtrans;
@@ -1679,9 +1677,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	 * async commit thread will be the one to unlock it.
 	 */
 	if (ac->newtrans->type & __TRANS_FREEZABLE)
-		rwsem_release(
-			&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			1, _THIS_IP_);
+		__sb_writers_release(root->fs_info->sb, SB_FREEZE_FS);
 
 	schedule_work(&ac->work);
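
These two btrfs hunks are a matched pair: the task queuing the async commit disowns the SB_FREEZE_FS protection it holds, and the worker re-asserts ownership before committing. A minimal sketch of the same handoff, with hypothetical my_commit/queue_commit/commit_worker names:

#include <linux/fs.h>
#include <linux/workqueue.h>

struct my_commit {
	struct super_block *sb;
	struct work_struct work;
};

static void commit_worker(struct work_struct *work)
{
	struct my_commit *c = container_of(work, struct my_commit, work);

	/* The worker becomes the lockdep owner of the freeze protection. */
	__sb_writers_acquired(c->sb, SB_FREEZE_FS);
	/* ... commit the transaction ... */
	sb_end_intwrite(c->sb);		/* really drop SB_FREEZE_FS */
}

static void queue_commit(struct my_commit *c)
{
	/* We hold SB_FREEZE_FS (e.g. from sb_start_intwrite()); tell
	 * lockdep that the worker, not us, will release it. */
	__sb_writers_release(c->sb, SB_FREEZE_FS);
	INIT_WORK(&c->work, commit_worker);
	schedule_work(&c->work);
}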
--- a/fs/super.c
+++ b/fs/super.c
@@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	return total_objects;
 }
 
+static void destroy_super_work(struct work_struct *work)
+{
+	struct super_block *s = container_of(work, struct super_block,
+					     destroy_work);
+	int i;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++)
+		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
+	kfree(s);
+}
+
+static void destroy_super_rcu(struct rcu_head *head)
+{
+	struct super_block *s = container_of(head, struct super_block, rcu);
+	INIT_WORK(&s->destroy_work, destroy_super_work);
+	schedule_work(&s->destroy_work);
+}
+
 /**
  *	destroy_super	-	frees a superblock
  *	@s: superblock to free
@@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink,
  */
 static void destroy_super(struct super_block *s)
 {
-	int i;
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
-	for (i = 0; i < SB_FREEZE_LEVELS; i++)
-		percpu_counter_destroy(&s->s_writers.counter[i]);
 	security_sb_free(s);
 	WARN_ON(!list_empty(&s->s_mounts));
 	kfree(s->s_subtype);
 	kfree(s->s_options);
-	kfree_rcu(s, rcu);
+	call_rcu(&s->rcu, destroy_super_rcu);
}
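
The detour through destroy_super_work() exists because percpu_free_rwsem() is a blocking call, while RCU callbacks such as destroy_super_rcu() run in softirq context where blocking is forbidden; the callback therefore only schedules process-context work. The same bounce pattern in isolation (illustrative struct obj, not kernel code):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
	struct rcu_head rcu;
	struct work_struct work;
};

static void obj_free_work(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, work);

	/* Process context: blocking teardown is allowed here. */
	kfree(o);
}

static void obj_free_rcu(struct rcu_head *head)
{
	struct obj *o = container_of(head, struct obj, rcu);

	/* Softirq context: must not sleep, so punt to a workqueue. */
	INIT_WORK(&o->work, obj_free_work);
	schedule_work(&o->work);
}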
 /**
@@ -178,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		goto fail;
 
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-		if (percpu_counter_init(&s->s_writers.counter[i], 0,
-					GFP_KERNEL) < 0)
+		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+					sb_writers_name[i],
+					&type->s_writers_key[i]))
 			goto fail;
-		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
-				 &type->s_writers_key[i], 0);
 	}
-	init_waitqueue_head(&s->s_writers.wait);
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
@@ -1146,72 +1159,46 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
  */
 void __sb_end_write(struct super_block *sb, int level)
 {
-	percpu_counter_dec(&sb->s_writers.counter[level-1]);
-	/*
-	 * Make sure s_writers are updated before we wake up waiters in
-	 * freeze_super().
-	 */
-	smp_mb();
-	if (waitqueue_active(&sb->s_writers.wait))
-		wake_up(&sb->s_writers.wait);
-	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+	percpu_up_read(sb->s_writers.rw_sem + level-1);
 }
 EXPORT_SYMBOL(__sb_end_write);
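
After the conversion, freeze protection is taken and dropped through the per-level semaphores; filesystem write paths pair them through the sb_start_write()/sb_end_write() wrappers. A minimal sketch, with a hypothetical my_write_begin() helper:

#include <linux/fs.h>

/* Illustrative only: writers hold the SB_FREEZE_WRITE level for the
 * duration of the modification; freeze_super() blocks in
 * percpu_down_write() until all of them have called sb_end_write(). */
static int my_write_begin(struct super_block *sb)
{
	sb_start_write(sb);	/* percpu_down_read(rw_sem + SB_FREEZE_WRITE - 1) */
	/* ... dirty pages, journal the change ... */
	sb_end_write(sb);	/* percpu_up_read(rw_sem + SB_FREEZE_WRITE - 1) */
	return 0;
}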
 
-#ifdef CONFIG_LOCKDEP
-/*
- * We want lockdep to tell us about possible deadlocks with freezing but
- * it's a bit tricky to properly instrument it. Getting a freeze protection
- * works as getting a read lock but there are subtle problems. XFS for example
- * gets freeze protection on internal level twice in some cases, which is OK
- * only because we already hold a freeze protection also on higher level. Due
- * to these cases we have to tell lockdep we are doing trylock when we
- * already hold a freeze protection for a higher freeze level.
- */
-static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
-				unsigned long ip)
-{
-	int i;
-
-	if (!trylock) {
-		for (i = 0; i < level - 1; i++)
-			if (lock_is_held(&sb->s_writers.lock_map[i])) {
-				trylock = true;
-				break;
-			}
-	}
-	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
-}
-#endif
-
 /*
  * This is an internal function, please use sb_start_{write,pagefault,intwrite}
  * instead.
  */
 int __sb_start_write(struct super_block *sb, int level, bool wait)
 {
-retry:
-	if (unlikely(sb->s_writers.frozen >= level)) {
-		if (!wait)
-			return 0;
-		wait_event(sb->s_writers.wait_unfrozen,
-			   sb->s_writers.frozen < level);
-	}
+	bool force_trylock = false;
+	int ret = 1;
 
 #ifdef CONFIG_LOCKDEP
-	acquire_freeze_lock(sb, level, !wait, _RET_IP_);
-#endif
-	percpu_counter_inc(&sb->s_writers.counter[level-1]);
 	/*
-	 * Make sure counter is updated before we check for frozen.
-	 * freeze_super() first sets frozen and then checks the counter.
+	 * We want lockdep to tell us about possible deadlocks with freezing
+	 * but it's a bit tricky to properly instrument it. Getting a freeze
+	 * protection works as getting a read lock but there are subtle
+	 * problems. XFS for example gets freeze protection on internal level
+	 * twice in some cases, which is OK only because we already hold a
+	 * freeze protection also on a higher level. Due to these cases we
+	 * have to use wait == false (trylock mode) which must not fail.
 	 */
-	smp_mb();
-	if (unlikely(sb->s_writers.frozen >= level)) {
-		__sb_end_write(sb, level);
-		goto retry;
+	if (wait) {
+		int i;
+
+		for (i = 0; i < level - 1; i++)
+			if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
+				force_trylock = true;
+				break;
+			}
 	}
-	return 1;
+#endif
+	if (wait && !force_trylock)
+		percpu_down_read(sb->s_writers.rw_sem + level-1);
+	else
+		ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
+	WARN_ON(force_trylock && !ret);
+	return ret;
 }
 EXPORT_SYMBOL(__sb_start_write);
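
The XFS case the comment describes is a writer that already holds a lower freeze level and then takes an inner one, e.g. SB_FREEZE_FS from inside an SB_FREEZE_WRITE section. A sketch of the nesting that force_trylock has to tolerate:

#include <linux/fs.h>

static void nested_freeze_protection(struct super_block *sb)
{
	sb_start_write(sb);	/* holds rw_sem[SB_FREEZE_WRITE-1] */

	/*
	 * __sb_start_write(sb, SB_FREEZE_FS, true) sees that a lower
	 * level is already held and silently switches to the trylock
	 * path; the trylock cannot fail, because holding SB_FREEZE_WRITE
	 * prevents freeze_super() from ever reaching SB_FREEZE_FS.
	 */
	sb_start_intwrite(sb);

	/* ... internal transaction ... */

	sb_end_intwrite(sb);
	sb_end_write(sb);
}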
@@ -1221,37 +1208,33 @@ EXPORT_SYMBOL(__sb_start_write);
  * @level: type of writers we wait for (normal vs page fault)
  *
  * This function waits until there are no writers of given type to given file
- * system. Caller of this function should make sure there can be no new writers
- * of type @level before calling this function. Otherwise this function can
- * livelock.
+ * system.
  */
 static void sb_wait_write(struct super_block *sb, int level)
 {
-	s64 writers;
+	percpu_down_write(sb->s_writers.rw_sem + level-1);
 	/*
-	 * We just cycle-through lockdep here so that it does not complain
-	 * about returning with lock to userspace
+	 * We are going to return to userspace and forget about this lock, the
+	 * ownership goes to the caller of thaw_super() which does unlock.
+	 *
+	 * FIXME: we should do this before return from freeze_super() after we
+	 * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super()
+	 * should re-acquire these locks before s_op->unfreeze_fs(sb). However
+	 * this leads to lockdep false-positives, so currently we do the early
+	 * release right after acquire.
 	 */
-	rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
-	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
-
-	do {
-		DEFINE_WAIT(wait);
+	percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
+}
 
-		/*
-		 * We use a barrier in prepare_to_wait() to separate setting
-		 * of frozen and checking of the counter
-		 */
-		prepare_to_wait(&sb->s_writers.wait, &wait,
-				TASK_UNINTERRUPTIBLE);
+static void sb_freeze_unlock(struct super_block *sb)
+{
+	int level;
 
-		writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
-		if (writers)
-			schedule();
+	for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+		percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
 
-		finish_wait(&sb->s_writers.wait, &wait);
-	} while (writers);
+	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+		percpu_up_write(sb->s_writers.rw_sem + level);
 }
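
Note that percpu_rwsem_release()/percpu_rwsem_acquire() touch only lockdep state: between freeze_super() and thaw_super() every rw_sem stays genuinely write-locked, while ownership (in lockdep's eyes) migrates between tasks. In isolation the protocol looks roughly like this (illustrative helpers, not the real functions):

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/percpu-rwsem.h>

/* Task A freezes: lock for real, then disown for lockdep. */
static void freeze_one_level(struct super_block *sb, int level)
{
	percpu_down_write(sb->s_writers.rw_sem + level-1);
	percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
	/* Task A may return to userspace, or even exit, with the lock held. */
}

/* Task B thaws: re-own for lockdep, then unlock for real. */
static void thaw_one_level(struct super_block *sb, int level)
{
	percpu_rwsem_acquire(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
	percpu_up_write(sb->s_writers.rw_sem + level-1);
}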
 
 /**
@@ -1310,20 +1293,14 @@ int freeze_super(struct super_block *sb)
 		return 0;
 	}
 
 	/* From now on, no new normal writers can start */
 	sb->s_writers.frozen = SB_FREEZE_WRITE;
-	smp_wmb();
 	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
 	up_write(&sb->s_umount);
 	sb_wait_write(sb, SB_FREEZE_WRITE);
+	down_write(&sb->s_umount);
 
 	/* Now we go and block page faults... */
-	down_write(&sb->s_umount);
 	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
-	smp_wmb();
 	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
@@ -1331,7 +1308,6 @@ int freeze_super(struct super_block *sb)
 
 	/* Now wait for internal filesystem counter */
 	sb->s_writers.frozen = SB_FREEZE_FS;
-	smp_wmb();
 	sb_wait_write(sb, SB_FREEZE_FS);
 
 	if (sb->s_op->freeze_fs) {
@@ -1340,7 +1316,7 @@ int freeze_super(struct super_block *sb)
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
 			sb->s_writers.frozen = SB_UNFROZEN;
-			smp_wmb();
+			sb_freeze_unlock(sb);
 			wake_up(&sb->s_writers.wait_unfrozen);
 			deactivate_locked_super(sb);
 			return ret;
@@ -1372,8 +1348,10 @@ int thaw_super(struct super_block *sb)
 		return -EINVAL;
 	}
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & MS_RDONLY) {
+		sb->s_writers.frozen = SB_UNFROZEN;
 		goto out;
+	}
 
 	if (sb->s_op->unfreeze_fs) {
 		error = sb->s_op->unfreeze_fs(sb);
@@ -1385,12 +1363,11 @@ int thaw_super(struct super_block *sb)
 		}
 	}
 
-out:
 	sb->s_writers.frozen = SB_UNFROZEN;
-	smp_wmb();
+	sb_freeze_unlock(sb);
+out:
 	wake_up(&sb->s_writers.wait_unfrozen);
 	deactivate_locked_super(sb);
 	return 0;
 }
 EXPORT_SYMBOL(thaw_super);
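
freeze_super() and thaw_super() are reachable from user space through the FIFREEZE/FITHAW ioctls on a mounted filesystem (CAP_SYS_ADMIN required); a minimal demonstration, error handling trimmed:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* a mount point */
	if (fd < 0 || ioctl(fd, FIFREEZE, 0) != 0) {
		perror("freeze");
		return 1;
	}
	puts("frozen: new writers now block in sb_start_write()");
	ioctl(fd, FITHAW, 0);		/* thaw_super() releases the rw_sems */
	close(fd);
	return 0;
}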
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc(
 	 * We may pass freeze protection with a transaction. So tell lockdep
 	 * we released it.
 	 */
-	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-		      1, _THIS_IP_);
+	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
 	/*
 	 * We hand off the transaction to the completion thread now, so
 	 * clear the flag here.
@@ -171,8 +170,7 @@ xfs_setfilesize_ioend(
 	 * Similarly for freeze protection.
 	 */
 	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
-	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			   0, 1, _THIS_IP_);
+	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
 	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 }
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
 #ifndef _LINUX_FS_H
 #define _LINUX_FS_H
 
 #include <linux/linkage.h>
 #include <linux/wait.h>
 #include <linux/kdev_t.h>
@@ -30,6 +29,8 @@
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/blk_types.h>
+#include <linux/workqueue.h>
+#include <linux/percpu-rwsem.h>
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1274,16 +1275,9 @@ enum {
 #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
 
 struct sb_writers {
-	/* Counters for counting writers at each level */
-	struct percpu_counter	counter[SB_FREEZE_LEVELS];
-	wait_queue_head_t	wait;		/* queue for waiting for
-						   writers / faults to finish */
-	int			frozen;		/* Is sb frozen? */
-	wait_queue_head_t	wait_unfrozen;	/* queue for waiting for
-						   sb to be thawed */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	lock_map[SB_FREEZE_LEVELS];
-#endif
+	int				frozen;		/* Is sb frozen? */
+	wait_queue_head_t		wait_unfrozen;	/* for get_super_thawed() */
+	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
 
 struct super_block {
@@ -1375,7 +1369,7 @@ struct super_block {
 	struct list_lru		s_dentry_lru ____cacheline_aligned_in_smp;
 	struct list_lru		s_inode_lru ____cacheline_aligned_in_smp;
 	struct rcu_head		rcu;
+	struct work_struct	destroy_work;
 
 	/*
 	 * Indicates how deep in a filesystem stack this SB is
 	 */
@@ -1391,6 +1385,11 @@ extern struct timespec current_fs_time(struct super_block *sb);
 void __sb_end_write(struct super_block *sb, int level);
 int __sb_start_write(struct super_block *sb, int level, bool wait);
 
+#define __sb_writers_acquired(sb, lev)	\
+	percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
+#define __sb_writers_release(sb, lev)	\
+	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
+
 /**
  * sb_end_write - drop write access to a superblock
  * @sb: the super we wrote to
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -16,6 +16,7 @@ struct percpu_rw_semaphore {
 };
 
 extern void percpu_down_read(struct percpu_rw_semaphore *);
+extern int  percpu_down_read_trylock(struct percpu_rw_semaphore *);
 extern void percpu_up_read(struct percpu_rw_semaphore *);
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
@@ -31,4 +32,23 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 	__percpu_init_rwsem(brw, #brw, &rwsem_key);		\
 })
 
+#define percpu_rwsem_is_held(sem)	lockdep_is_held(&(sem)->rw_sem)
+
+static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
+					bool read, unsigned long ip)
+{
+	lock_release(&sem->rw_sem.dep_map, 1, ip);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+	if (!read)
+		sem->rw_sem.owner = NULL;
+#endif
+}
+
+static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
+					bool read, unsigned long ip)
+{
+	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
+}
+
 #endif
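
For readers unfamiliar with the API this header now exports, a minimal life-cycle sketch (module-style code; my_sem and the helper names are illustrative):

#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore my_sem;

static int my_setup(void)
{
	return percpu_init_rwsem(&my_sem);	/* allocates per-cpu state, can fail */
}

static void my_read_side(void)
{
	percpu_down_read(&my_sem);	/* fast path: per-cpu increment only */
	/* ... read-side critical section ... */
	percpu_up_read(&my_sem);
}

static bool my_read_side_try(void)
{
	if (!percpu_down_read_trylock(&my_sem))
		return false;		/* a writer is active: refuse to block */
	percpu_up_read(&my_sem);
	return true;
}

static void my_write_side(void)
{
	percpu_down_write(&my_sem);	/* drains all per-cpu readers first */
	/* ... exclusive section ... */
	percpu_up_write(&my_sem);
}

static void my_teardown(void)
{
	percpu_free_rwsem(&my_sem);	/* only once no users remain */
}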
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -925,7 +925,6 @@ config NUMA_BALANCING_DEFAULT_ENABLED
 menuconfig CGROUPS
 	bool "Control Group support"
 	select KERNFS
-	select PERCPU_RWSEM
 	help
 	  This option adds support for grouping sets of processes together, for
 	  use with process control subsystems such as Cpusets, CFS, memory
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
-obj-y += mutex.o semaphore.o rwsem.o
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -25,6 +25,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
-obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
 obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw)
 	__up_read(&brw->rw_sem);
 }
 
+int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
+{
+	if (unlikely(!update_fast_ctr(brw, +1))) {
+		if (!__down_read_trylock(&brw->rw_sem))
+			return 0;
+		atomic_inc(&brw->slow_read_ctr);
+		__up_read(&brw->rw_sem);
+	}
+
+	rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+	return 1;
+}
+
 void percpu_up_read(struct percpu_rw_semaphore *brw)
 {
 	rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
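
The new trylock mirrors the existing blocking reader in the same file: if the per-cpu fast path fails because a writer is around, it falls back to briefly taking the real rwsem, registering itself in slow_read_ctr, and dropping the rwsem again. For comparison, the blocking reader looks roughly like this at the time of the merge (quoted from the pre-merge source, may differ in detail):

void percpu_down_read(struct percpu_rw_semaphore *brw)
{
	might_sleep();
	if (likely(update_fast_ctr(brw, +1))) {
		rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
		return;
	}

	down_read(&brw->rw_sem);
	atomic_inc(&brw->slow_read_ctr);
	/* avoid up_read()->rwsem_release() */
	__up_read(&brw->rw_sem);
}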
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -53,9 +53,6 @@ config GENERIC_IO
 config STMP_DEVICE
 	bool
 
-config PERCPU_RWSEM
-	bool
-
 config ARCH_USE_CMPXCHG_LOCKREF
 	bool