Commit 99cb252f authored by Jason Gunthorpe

mm/mmu_notifier: add an interval tree notifier

Of the 13 users of mmu_notifiers, 8 of them use only
invalidate_range_start/end() and immediately intersect the
mmu_notifier_range with some kind of internal list of VAs.  Of those 8,
4 use an interval tree (i915_gem, radeon_mn, umem_odp, hfi1) and 4 use a
linked list of some kind (scif_dma, vhost, gntdev, hmm).

The remaining 5 either don't use invalidate_range_start() or do
something special with it.

It turns out that building a correct scheme with an interval tree is
pretty complicated, particularly if the use case is synchronizing against
another thread doing get_user_pages().  Many of these implementations have
various subtle and difficult-to-fix races.

This approach puts the interval tree as common code at the top of the mmu
notifier call tree and implements a shareable locking scheme.

It includes:
 - An interval tree tracking VA ranges, with per-range callbacks
 - A read/write locking scheme for the interval tree that avoids
   sleeping in the notifier path (for OOM killer)
 - A sequence counter based collision-retry locking scheme to tell the
   device page fault handler that a VA range is being concurrently
   invalidated (a consumer-side sketch follows this list).
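
To make the collision-retry scheme concrete, here is a minimal
consumer-side sketch, not taken from this patch: struct driver,
my_interval, driver_lock, driver_fill_ptes() and driver_program_hw() are
hypothetical names, while mmu_interval_read_begin() and
mmu_interval_read_retry() are the interfaces added by this commit.

/* Hypothetical consumer-side sketch; only the mmu_interval_* calls are real */
static int driver_update_mapping(struct driver *drv, unsigned long start,
                                 unsigned long length)
{
        unsigned long seq;
        int ret;

again:
        seq = mmu_interval_read_begin(&drv->my_interval);

        /* Sleeping work (e.g. get_user_pages()) runs with no locks held */
        ret = driver_fill_ptes(drv, start, length);
        if (ret)
                return ret;

        mutex_lock(&drv->driver_lock);
        if (mmu_interval_read_retry(&drv->my_interval, seq)) {
                /* An invalidation raced with us; discard the work and retry */
                mutex_unlock(&drv->driver_lock);
                goto again;
        }
        driver_program_hw(drv, start, length);
        mutex_unlock(&drv->driver_lock);
        return 0;
}

Only the final check and the hardware update run under driver_lock; the
sleeping work happens outside of it, so an invalidation only waits for the
short programming step rather than for get_user_pages().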

This is based on various ideas:
- hmm accumulates invalidated VA ranges and releases them when all
  invalidates are done, via active_invalidate_ranges count.
  This approach avoids having to intersect the interval tree twice (as
  umem_odp does) at the potential cost of a longer device page fault.

- kvm/umem_odp use a sequence counter to drive the collision retry,
  via invalidate_seq

- a deferred 'work to do on unlock' list, as in RTNL, via deferred_list.
  This makes adding/removing interval tree members more deterministic.

- seqlock, except this version makes the seqlock idea multi-holder on the
  write side by protecting it with active_invalidate_ranges and a spinlock
  (a matching invalidate() callback sketch follows this list).
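
The write side pairs with the consumer sketch above. The following is
again a hedged sketch with hypothetical driver names; the contract it
follows (take the same driver_lock, call mmu_interval_set_seq() under it,
and return false only in the non-blockable case) is the one documented in
the header changes below.

/* Hypothetical invalidate() sketch; only the mmu_* calls are real */
static bool driver_invalidate(struct mmu_interval_notifier *mni,
                              const struct mmu_notifier_range *range,
                              unsigned long cur_seq)
{
        struct driver *drv = container_of(mni, struct driver, my_interval);

        /* Refusing to sleep is only allowed when the range is non-blockable */
        if (!mmu_notifier_range_blockable(range))
                return false;

        mutex_lock(&drv->driver_lock);
        mmu_interval_set_seq(mni, cur_seq);     /* publish the new sequence */
        driver_unprogram_hw(drv, range->start, range->end); /* hypothetical */
        mutex_unlock(&drv->driver_lock);
        return true;
}

static const struct mmu_interval_notifier_ops driver_interval_ops = {
        .invalidate = driver_invalidate,
};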

To minimize MM overhead when only the interval tree is being used, the
entire SRCU and hlist overheads are dropped using some simple
branches. Similarly the interval tree overhead is dropped when in hlist
mode.

The overhead from the mandatory spinlock is broadly the same as for most
of the existing users, which already had a lock (or two) of some sort on
the invalidation path.

Link: https://lore.kernel.org/r/20191112202231.3856-3-jgg@ziepe.ca
Acked-by: Christian König <christian.koenig@amd.com>
Tested-by: Philip Yang <Philip.Yang@amd.com>
Tested-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent 56f434f4
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -6,10 +6,12 @@
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
 #include <linux/srcu.h>
+#include <linux/interval_tree.h>
 
 struct mmu_notifier_mm;
 struct mmu_notifier;
 struct mmu_notifier_range;
+struct mmu_interval_notifier;
 
 /**
  * enum mmu_notifier_event - reason for the mmu notifier callback
@@ -32,6 +34,9 @@ struct mmu_notifier_range;
  * access flags). User should soft dirty the page in the end callback to make
  * sure that anyone relying on soft dirtyness catch pages that might be written
  * through non CPU mappings.
+ *
+ * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
+ * that the mm refcount is zero and the range is no longer accessible.
  */
 enum mmu_notifier_event {
 	MMU_NOTIFY_UNMAP = 0,
@@ -39,6 +44,7 @@ enum mmu_notifier_event {
 	MMU_NOTIFY_PROTECTION_VMA,
 	MMU_NOTIFY_PROTECTION_PAGE,
 	MMU_NOTIFY_SOFT_DIRTY,
+	MMU_NOTIFY_RELEASE,
 };
 
 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
@@ -222,6 +228,26 @@ struct mmu_notifier {
 	unsigned int users;
 };
 
+/**
+ * struct mmu_interval_notifier_ops
+ * @invalidate: Upon return the caller must stop using any SPTEs within this
+ *              range. This function can sleep. Return false only if sleeping
+ *              was required but mmu_notifier_range_blockable(range) is false.
+ */
+struct mmu_interval_notifier_ops {
+	bool (*invalidate)(struct mmu_interval_notifier *mni,
+			   const struct mmu_notifier_range *range,
+			   unsigned long cur_seq);
+};
+
+struct mmu_interval_notifier {
+	struct interval_tree_node interval_tree;
+	const struct mmu_interval_notifier_ops *ops;
+	struct mm_struct *mm;
+	struct hlist_node deferred_item;
+	unsigned long invalidate_seq;
+};
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 #ifdef CONFIG_LOCKDEP
@@ -263,6 +289,81 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn,
 				   struct mm_struct *mm);
 extern void mmu_notifier_unregister(struct mmu_notifier *mn,
 				    struct mm_struct *mm);
+
+unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni);
+int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
+				 struct mm_struct *mm, unsigned long start,
+				 unsigned long length,
+				 const struct mmu_interval_notifier_ops *ops);
+int mmu_interval_notifier_insert_locked(
+	struct mmu_interval_notifier *mni, struct mm_struct *mm,
+	unsigned long start, unsigned long length,
+	const struct mmu_interval_notifier_ops *ops);
+void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni);
+
+/**
+ * mmu_interval_set_seq - Save the invalidation sequence
+ * @mni - The mni passed to invalidate
+ * @cur_seq - The cur_seq passed to the invalidate() callback
+ *
+ * This must be called unconditionally from the invalidate callback of a
+ * struct mmu_interval_notifier_ops under the same lock that is used to call
+ * mmu_interval_read_retry(). It updates the sequence number for later use by
+ * mmu_interval_read_retry(). The provided cur_seq will always be odd.
+ *
+ * If the caller does not call mmu_interval_read_begin() or
+ * mmu_interval_read_retry() then this call is not required.
+ */
+static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni,
+					unsigned long cur_seq)
+{
+	WRITE_ONCE(mni->invalidate_seq, cur_seq);
+}
+
+/**
+ * mmu_interval_read_retry - End a read side critical section against a VA range
+ * mni: The range
+ * seq: The return of the paired mmu_interval_read_begin()
+ *
+ * This MUST be called under a user provided lock that is also held
+ * unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
+ *
+ * Each call should be paired with a single mmu_interval_read_begin() and
+ * should be used to conclude the read side.
+ *
+ * Returns true if an invalidation collided with this critical section, and
+ * the caller should retry.
+ */
+static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni,
+					   unsigned long seq)
+{
+	return mni->invalidate_seq != seq;
+}
+
+/**
+ * mmu_interval_check_retry - Test if a collision has occurred
+ * mni: The range
+ * seq: The return of the matching mmu_interval_read_begin()
+ *
+ * This can be used in the critical section between mmu_interval_read_begin()
+ * and mmu_interval_read_retry(). A return of true indicates an invalidation
+ * has collided with this critical region and a future
+ * mmu_interval_read_retry() will return true.
+ *
+ * False is not reliable and only suggests a collision may not have
+ * occurred. It can be called many times and does not have to hold the user
+ * provided lock.
+ *
+ * This call can be used as part of loops and other expensive operations to
+ * expedite a retry.
+ */
+static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *mni,
+					    unsigned long seq)
+{
+	/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
+	return READ_ONCE(mni->invalidate_seq) != seq;
+}
+
 extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
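
For completeness, a small registration and teardown sketch against the
declarations above; struct driver and my_interval are hypothetical, and
driver_interval_ops refers to the invalidate() sketch earlier on this
page.

/* Hypothetical setup/teardown sketch; only the mmu_interval_* calls are real */
static int driver_track_buffer(struct driver *drv, unsigned long start,
                               unsigned long length)
{
        return mmu_interval_notifier_insert(&drv->my_interval, current->mm,
                                            start, length,
                                            &driver_interval_ops);
}

static void driver_untrack_buffer(struct driver *drv)
{
        /* Must pair with insert; cannot be called from invalidate() itself */
        mmu_interval_notifier_remove(&drv->my_interval);
}
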
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -284,6 +284,7 @@ config VIRT_TO_BUS
 config MMU_NOTIFIER
 	bool
 	select SRCU
+	select INTERVAL_TREE
 
 config KSM
 	bool "Enable KSM for page merging"