Commit ca748c39 authored by Jason Gunthorpe, committed by Doug Ledford

RDMA/umem: Get rid of per_mm->notifier_count

This is intrinsically racy and the scheme is simply unnecessary. New MR
registration can wait for any ongoing invalidation to fully complete.

       CPU0                                  CPU1
                                             if (atomic_read())
       if (atomic_dec_and_test() &&
           !list_empty())
         { /* not taken */ }
                                             list_add()

This leaves the new UMEM sitting in a kind of purgatory until another
invalidate rolls through.

Instead, hold the read side of the umem_rwsem across the paired start/end
callbacks and get rid of the racy 'deferred add' approach (sketched below).

Since all umems in the rbtree are always ready to go, also get rid of the
mn_counters_active machinery.
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
parent f27a0d50
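
Before the diff, here is a minimal userspace sketch of the locking pattern this
commit switches to. It is an analogue only, not the kernel code: pthread_rwlock
stands in for the kernel rw_semaphore, and all function names below are
illustrative. The invalidation path keeps the read side held from its start
callback until the paired end callback, so a new registration, which needs the
write side, simply waits for any in-flight invalidation instead of being parked
on a deferred list.

/*
 * Userspace analogue of the scheme described above (illustrative names only;
 * pthread_rwlock_t stands in for the kernel rw_semaphore). Build with
 * "cc -pthread".
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t umem_rwsem = PTHREAD_RWLOCK_INITIALIZER;

/* Invalidation start: take the read side and keep holding it ... */
static void invalidate_range_start(void)
{
	pthread_rwlock_rdlock(&umem_rwsem);
	printf("invalidate: start, walking interval tree\n");
}

/* ... until the paired end callback drops it again. */
static void invalidate_range_end(void)
{
	printf("invalidate: end\n");
	pthread_rwlock_unlock(&umem_rwsem);
}

/*
 * New registration takes the write side, so it naturally waits for any
 * in-flight start/end pair to finish; no notifier_count or deferred-add
 * list is needed.
 */
static void register_new_umem(void)
{
	pthread_rwlock_wrlock(&umem_rwsem);
	printf("register: inserting umem into the tree\n");
	pthread_rwlock_unlock(&umem_rwsem);
}

static void *invalidator(void *arg)
{
	(void)arg;
	invalidate_range_start();
	usleep(10000);		/* pretend to zap page tables */
	invalidate_range_end();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, invalidator, NULL);
	usleep(1000);		/* give the invalidation a head start */
	register_new_umem();	/* typically blocks until invalidate_range_end() */
	pthread_join(t, NULL);
	return 0;
}

The timing sleeps only make the interleaving likely; they are not part of the
scheme. Because registration can no longer race with an in-flight start/end
pair, every umem in the tree is always ready, which is what allows
mn_counters_active and the no_private_counters list to be deleted in the diff
below.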
@@ -80,83 +80,29 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
 static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
 {
 	mutex_lock(&umem_odp->umem_mutex);
-	/* Only update private counters for this umem if it has them.
-	 * Otherwise skip it. All page faults will be delayed for this umem. */
-	if (umem_odp->mn_counters_active) {
-		int notifiers_count = umem_odp->notifiers_count++;
-
-		if (notifiers_count == 0)
-			/* Initialize the completion object for waiting on
-			 * notifiers. Since notifier_count is zero, no one
-			 * should be waiting right now. */
-			reinit_completion(&umem_odp->notifier_completion);
-	}
+	if (umem_odp->notifiers_count++ == 0)
+		/*
+		 * Initialize the completion object for waiting on
+		 * notifiers. Since notifier_count is zero, no one should be
+		 * waiting right now.
+		 */
+		reinit_completion(&umem_odp->notifier_completion);
 	mutex_unlock(&umem_odp->umem_mutex);
 }
 
 static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
 {
 	mutex_lock(&umem_odp->umem_mutex);
-	/* Only update private counters for this umem if it has them.
-	 * Otherwise skip it. All page faults will be delayed for this umem. */
-	if (umem_odp->mn_counters_active) {
-		/*
-		 * This sequence increase will notify the QP page fault that
-		 * the page that is going to be mapped in the spte could have
-		 * been freed.
-		 */
-		++umem_odp->notifiers_seq;
-		if (--umem_odp->notifiers_count == 0)
-			complete_all(&umem_odp->notifier_completion);
-	}
+	/*
+	 * This sequence increase will notify the QP page fault that the page
+	 * that is going to be mapped in the spte could have been freed.
+	 */
+	++umem_odp->notifiers_seq;
+	if (--umem_odp->notifiers_count == 0)
+		complete_all(&umem_odp->notifier_completion);
 	mutex_unlock(&umem_odp->umem_mutex);
 }
 
-/* Account for a new mmu notifier in an ib_ucontext. */
-static void
-ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm)
-{
-	atomic_inc(&per_mm->notifier_count);
-}
-
-/* Account for a terminating mmu notifier in an ib_ucontext.
- *
- * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
- * the function takes the semaphore itself. */
-static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm)
-{
-	int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count);
-
-	if (zero_notifiers &&
-	    !list_empty(&per_mm->no_private_counters)) {
-		/* No currently running mmu notifiers. Now is the chance to
-		 * add private accounting to all previously added umems. */
-		struct ib_umem_odp *odp_data, *next;
-
-		/* Prevent concurrent mmu notifiers from working on the
-		 * no_private_counters list. */
-		down_write(&per_mm->umem_rwsem);
-
-		/* Read the notifier_count again, with the umem_rwsem
-		 * semaphore taken for write. */
-		if (!atomic_read(&per_mm->notifier_count)) {
-			list_for_each_entry_safe(odp_data, next,
-						 &per_mm->no_private_counters,
-						 no_private_counters) {
-				mutex_lock(&odp_data->umem_mutex);
-				odp_data->mn_counters_active = true;
-				list_del(&odp_data->no_private_counters);
-				complete_all(&odp_data->notifier_completion);
-				mutex_unlock(&odp_data->umem_mutex);
-			}
-		}
-
-		up_write(&per_mm->umem_rwsem);
-	}
-}
-
 static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
 					       u64 start, u64 end, void *cookie)
 {
@@ -186,7 +132,6 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn,
 	if (!per_mm->context->invalidate_range)
 		return;
 
-	ib_ucontext_notifier_start_account(per_mm);
 	down_read(&per_mm->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0,
 				      ULLONG_MAX,
@@ -231,14 +176,9 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	else if (!down_read_trylock(&per_mm->umem_rwsem))
 		return -EAGAIN;
 
-	ib_ucontext_notifier_start_account(per_mm);
-	ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
-					    end,
+	return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
 					     invalidate_range_start_trampoline,
 					     blockable, NULL);
-	up_read(&per_mm->umem_rwsem);
-
-	return ret;
 }
 
 static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
@@ -259,17 +199,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	if (!per_mm->context->invalidate_range)
 		return;
 
-	/*
-	 * TODO: we currently bail out if there is any sleepable work to be done
-	 * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
-	 * here. But this is ugly and fragile.
-	 */
-	down_read(&per_mm->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
 				      end,
 				      invalidate_range_end_trampoline, true, NULL);
 	up_read(&per_mm->umem_rwsem);
-	ib_ucontext_notifier_end_account(per_mm);
 }
 
 static const struct mmu_notifier_ops ib_umem_notifiers = {
@@ -287,12 +220,6 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
 	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
 		rbt_ib_umem_insert(&umem_odp->interval_tree,
 				   &per_mm->umem_tree);
-
-	if (likely(!atomic_read(&per_mm->notifier_count)))
-		umem_odp->mn_counters_active = true;
-	else
-		list_add(&umem_odp->no_private_counters,
-			 &per_mm->no_private_counters);
 	up_write(&per_mm->umem_rwsem);
 }
@@ -305,10 +232,7 @@ static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
 	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
 		rbt_ib_umem_remove(&umem_odp->interval_tree,
 				   &per_mm->umem_tree);
-	if (!umem_odp->mn_counters_active) {
-		list_del(&umem_odp->no_private_counters);
-		complete_all(&umem_odp->notifier_completion);
-	}
+	complete_all(&umem_odp->notifier_completion);
 	up_write(&per_mm->umem_rwsem);
 }
@@ -327,7 +251,6 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
 	per_mm->mm = mm;
 	per_mm->umem_tree = RB_ROOT_CACHED;
 	init_rwsem(&per_mm->umem_rwsem);
-	INIT_LIST_HEAD(&per_mm->no_private_counters);
 
 	rcu_read_lock();
 	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
...
@@ -67,15 +67,9 @@ struct ib_umem_odp {
 	struct mutex		umem_mutex;
 	void			*private; /* for the HW driver to use. */
 
-	/* When false, use the notifier counter in the ucontext struct. */
-	bool mn_counters_active;
 	int notifiers_seq;
 	int notifiers_count;
-
-	/* A linked list of umems that don't have private mmu notifier
-	 * counters yet. */
-	struct list_head no_private_counters;
 
 	/* Tree tracking */
 	struct umem_odp_node	interval_tree;
@@ -99,11 +93,8 @@ struct ib_ucontext_per_mm {
 	struct rb_root_cached umem_tree;
 	/* Protects umem_tree */
 	struct rw_semaphore umem_rwsem;
-	atomic_t notifier_count;
 
 	struct mmu_notifier mn;
-	/* A list of umems that don't have private mmu notifier counters yet. */
-	struct list_head no_private_counters;
 	unsigned int odp_mrs_count;
 
 	struct list_head ucontext_list;
@@ -162,12 +153,6 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
 	 * and the ucontext umem_mutex semaphore locked for read).
 	 */
-	/* Do not allow page faults while the new ib_umem hasn't seen a state
-	 * with zero notifiers yet, and doesn't have its own valid set of
-	 * private counters. */
-	if (!umem_odp->mn_counters_active)
-		return 1;
-
 	if (unlikely(umem_odp->notifiers_count))
 		return 1;
 	if (umem_odp->notifiers_seq != mmu_seq)
...