Commit 93065ac7 authored by Michal Hocko and committed by Linus Torvalds

mm, oom: distinguish blockable mode for mmu notifiers

There are several blockable mmu notifiers which might sleep in
mmu_notifier_invalidate_range_start, and that is a problem for the
oom_reaper because it needs to guarantee forward progress and therefore
cannot depend on any sleepable locks.

Currently we simply back off and mark an oom victim with blockable mmu
notifiers as done after a short sleep.  That can result in selecting a new
oom victim prematurely because the previous one still hasn't torn its
memory down yet.

We can do much better though.  Even if mmu notifiers use sleepable locks,
there is no reason to automatically assume those locks are held.  Moreover,
the majority of notifiers only care about a portion of the address space,
and there is absolutely zero reason to fail when we are unmapping an
unrelated range.  Many notifiers do really block and wait for HW, which is
harder to handle, and in those cases we have to bail out.

This patch handles the low-hanging fruit.
__mmu_notifier_invalidate_range_start gets a blockable flag and callbacks
are not allowed to sleep if the flag is set to false.  This is achieved by
using trylock instead of the sleepable lock for most callbacks and
continuing as long as we do not block down the call chain.
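
Purely for illustration, this is a minimal sketch of the callback shape most
converted notifiers end up with; it is not code from this patch, and the
example_mn / example_invalidate_range_start names are made up:

#include <linux/errno.h>
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

/* Hypothetical driver state protected by a sleepable lock. */
struct example_mn {
	struct mmu_notifier mn;
	struct mutex lock;
};

static int example_invalidate_range_start(struct mmu_notifier *mn,
					  struct mm_struct *mm,
					  unsigned long start,
					  unsigned long end,
					  bool blockable)
{
	struct example_mn *emn = container_of(mn, struct example_mn, mn);

	/* Blockable callers may sleep; !blockable callers only get a trylock. */
	if (blockable)
		mutex_lock(&emn->lock);
	else if (!mutex_trylock(&emn->lock))
		return -EAGAIN;

	/* ... tear down mappings in [start, end) without sleeping ... */

	mutex_unlock(&emn->lock);
	return 0;
}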

I think we can improve that even further because there is a common pattern:
do a range lookup first and then act on the result.  The first part can be
done without a sleeping lock in most cases AFAICS.
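
As a purely illustrative sketch of that direction (not something this patch
implements), the lookup could avoid the sleepable lock entirely and the
callback would only fail when the range genuinely overlaps tracked objects;
every name below (example_state, example_teardown_range, ...) is made up:

#include <linux/errno.h>
#include <linux/interval_tree.h>
#include <linux/spinlock.h>

struct example_state {
	spinlock_t objects_lock;
	struct rb_root_cached objects;	/* interval tree of tracked ranges */
};

/* Sketch only: fail in !blockable mode just when real, possibly sleeping
 * teardown work exists for the range. */
static int example_range_start(struct example_state *es, unsigned long start,
			       unsigned long end, bool blockable)
{
	struct interval_tree_node *it;

	/* The range lookup itself does not need to sleep. */
	spin_lock(&es->objects_lock);
	it = interval_tree_iter_first(&es->objects, start, end - 1);
	spin_unlock(&es->objects_lock);

	if (!it)
		return 0;	/* unrelated range, nothing to do */

	if (!blockable)
		return -EAGAIN;	/* the real teardown would have to sleep */

	return example_teardown_range(es, start, end);	/* may sleep */
}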

The oom_reaper side then simply retries if there is at least one notifier
which couldn't make any progress in !blockable mode.  A retry loop is
already implemented to wait for the mmap_sem, and this is basically the
same thing.
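
Roughly, that existing retry loop in oom_reap_task() already has the shape
sketched below (simplified, not a verbatim quote), so a false return from
oom_reap_task_mm() naturally turns into "try again on the next pass"
(MAX_OOM_REAP_RETRIES is the existing retry cap in mm/oom_kill.c):

/* Simplified from mm/oom_kill.c's oom_reap_task(): a false return from
 * oom_reap_task_mm() (e.g. because some notifier answered -EAGAIN) just
 * means another attempt on the next iteration. */
static void example_reap_with_retries(struct task_struct *tsk,
				      struct mm_struct *mm)
{
	int attempts = 0;

	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ / 10);
}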

The simplest way for driver developers to test this code path is to wrap
userspace code which uses these notifiers into a memcg and set the hard
limit to hit the oom.  This can be done e.g. by letting the test fault in
all the mmu-notifier-managed memory first and then setting the hard limit
to something really small.  Then we are looking for a proper process tear
down.

[akpm@linux-foundation.org: coding style fixes]
[akpm@linux-foundation.org: minor code simplification]
Link: http://lkml.kernel.org/r/20180716115058.5559-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Christian König <christian.koenig@amd.com> # AMD notifiers
Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx and umem_odp
Reported-by: David Rientjes <rientjes@google.com>
Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Dennis Dalessandro <dennis.dalessandro@intel.com>
Cc: Sudeep Dutt <sudeep.dutt@intel.com>
Cc: Ashutosh Dixit <ashutosh.dixit@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent c2343d27
@@ -7305,8 +7305,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end)
+int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end,
+		bool blockable)
 {
 	unsigned long apic_address;
 
@@ -7317,6 +7318,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 	if (start <= apic_address && apic_address < end)
 		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
+	return 0;
 }
 
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
...
@@ -178,12 +178,18 @@ void amdgpu_mn_unlock(struct amdgpu_mn *mn)
  *
  * @amn: our notifier
  */
-static void amdgpu_mn_read_lock(struct amdgpu_mn *amn)
+static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable)
 {
-	mutex_lock(&amn->read_lock);
+	if (blockable)
+		mutex_lock(&amn->read_lock);
+	else if (!mutex_trylock(&amn->read_lock))
+		return -EAGAIN;
+
 	if (atomic_inc_return(&amn->recursion) == 1)
 		down_read_non_owner(&amn->lock);
 	mutex_unlock(&amn->read_lock);
+
+	return 0;
 }
 
 /**
@@ -239,10 +245,11 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
  * Block for operations on BOs to finish and mark pages as accessed and
  * potentially dirty.
  */
-static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
 						 struct mm_struct *mm,
 						 unsigned long start,
-						 unsigned long end)
+						 unsigned long end,
+						 bool blockable)
 {
 	struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
 	struct interval_tree_node *it;
@@ -250,17 +257,28 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	amdgpu_mn_read_lock(amn);
+	/* TODO we should be able to split locking for interval tree and
+	 * amdgpu_mn_invalidate_node
+	 */
+	if (amdgpu_mn_read_lock(amn, blockable))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&amn->objects, start, end);
 	while (it) {
 		struct amdgpu_mn_node *node;
 
+		if (!blockable) {
+			amdgpu_mn_read_unlock(amn);
+			return -EAGAIN;
+		}
+
 		node = container_of(it, struct amdgpu_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
 		amdgpu_mn_invalidate_node(node, start, end);
 	}
+
+	return 0;
 }
 
 /**
@@ -275,10 +293,11 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
  * necessitates evicting all user-mode queues of the process. The BOs
  * are restorted in amdgpu_mn_invalidate_range_end_hsa.
  */
-static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 						 struct mm_struct *mm,
 						 unsigned long start,
-						 unsigned long end)
+						 unsigned long end,
+						 bool blockable)
 {
 	struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
 	struct interval_tree_node *it;
@@ -286,13 +305,19 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	amdgpu_mn_read_lock(amn);
+	if (amdgpu_mn_read_lock(amn, blockable))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&amn->objects, start, end);
 	while (it) {
 		struct amdgpu_mn_node *node;
 		struct amdgpu_bo *bo;
 
+		if (!blockable) {
+			amdgpu_mn_read_unlock(amn);
+			return -EAGAIN;
+		}
+
 		node = container_of(it, struct amdgpu_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
@@ -304,6 +329,8 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 			amdgpu_amdkfd_evict_userptr(mem, mm);
 		}
 	}
+
+	return 0;
 }
 
 /**
...
@@ -112,10 +112,11 @@ static void del_object(struct i915_mmu_object *mo)
 	mo->attached = false;
 }
 
-static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
+static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 						       struct mm_struct *mm,
 						       unsigned long start,
-						       unsigned long end)
+						       unsigned long end,
+						       bool blockable)
 {
 	struct i915_mmu_notifier *mn =
 		container_of(_mn, struct i915_mmu_notifier, mn);
@@ -124,7 +125,7 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 	LIST_HEAD(cancelled);
 
 	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
-		return;
+		return 0;
 
 	/* interval ranges are inclusive, but invalidate range is exclusive */
 	end--;
@@ -132,6 +133,10 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 	spin_lock(&mn->lock);
 	it = interval_tree_iter_first(&mn->objects, start, end);
 	while (it) {
+		if (!blockable) {
+			spin_unlock(&mn->lock);
+			return -EAGAIN;
+		}
 		/* The mmu_object is released late when destroying the
 		 * GEM object so it is entirely possible to gain a
 		 * reference on an object in the process of being freed
@@ -154,6 +159,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 
 	if (!list_empty(&cancelled))
 		flush_workqueue(mn->wq);
+
+	return 0;
 }
 
 static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
...
@@ -118,19 +118,27 @@ static void radeon_mn_release(struct mmu_notifier *mn,
  * We block for all BOs between start and end to be idle and
  * unmap them by move them into system domain again.
  */
-static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
+static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 					     struct mm_struct *mm,
 					     unsigned long start,
-					     unsigned long end)
+					     unsigned long end,
+					     bool blockable)
 {
 	struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
 	struct ttm_operation_ctx ctx = { false, false };
 	struct interval_tree_node *it;
+	int ret = 0;
 
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	mutex_lock(&rmn->lock);
+	/* TODO we should be able to split locking for interval tree and
+	 * the tear down.
+	 */
+	if (blockable)
+		mutex_lock(&rmn->lock);
+	else if (!mutex_trylock(&rmn->lock))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&rmn->objects, start, end);
 	while (it) {
@@ -138,6 +146,11 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 		struct radeon_bo *bo;
 		long r;
 
+		if (!blockable) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+
 		node = container_of(it, struct radeon_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
@@ -166,7 +179,10 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 		}
 	}
 
+out_unlock:
 	mutex_unlock(&rmn->lock);
+
+	return ret;
 }
 
 static const struct mmu_notifier_ops radeon_mn_ops = {
...
@@ -186,6 +186,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn,
 	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
 				      ULLONG_MAX,
 				      ib_umem_notifier_release_trampoline,
+				      true,
 				      NULL);
 	up_read(&context->umem_rwsem);
 }
@@ -207,22 +208,31 @@ static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
 	return 0;
 }
 
-static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						    struct mm_struct *mm,
 						    unsigned long start,
-						    unsigned long end)
+						    unsigned long end,
+						    bool blockable)
 {
 	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+	int ret;
 
 	if (!context->invalidate_range)
-		return;
+		return 0;
+
+	if (blockable)
+		down_read(&context->umem_rwsem);
+	else if (!down_read_trylock(&context->umem_rwsem))
+		return -EAGAIN;
 
 	ib_ucontext_notifier_start_account(context);
-	down_read(&context->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+	ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
 				      end,
-				      invalidate_range_start_trampoline, NULL);
+				      invalidate_range_start_trampoline,
+				      blockable, NULL);
 	up_read(&context->umem_rwsem);
+
+	return ret;
 }
 
 static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
@@ -242,10 +252,15 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	if (!context->invalidate_range)
 		return;
 
+	/*
+	 * TODO: we currently bail out if there is any sleepable work to be done
+	 * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
+	 * here. But this is ugly and fragile.
+	 */
 	down_read(&context->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
 				      end,
-				      invalidate_range_end_trampoline, NULL);
+				      invalidate_range_end_trampoline, true, NULL);
 	up_read(&context->umem_rwsem);
 	ib_ucontext_notifier_end_account(context);
 }
@@ -798,6 +813,7 @@ EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
 int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 				  u64 start, u64 last,
 				  umem_call_back cb,
+				  bool blockable,
 				  void *cookie)
 {
 	int ret_val = 0;
@@ -809,6 +825,9 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 
 	for (node = rbt_ib_umem_iter_first(root, start, last - 1);
 			node; node = next) {
+		/* TODO move the blockable decision up to the callback */
+		if (!blockable)
+			return -EAGAIN;
 		next = rbt_ib_umem_iter_next(node, start, last - 1);
 		umem = container_of(node, struct ib_umem_odp, interval_tree);
 		ret_val = cb(umem->umem, start, last, cookie) || ret_val;
...
@@ -67,9 +67,9 @@ struct mmu_rb_handler {
 static unsigned long mmu_node_start(struct mmu_rb_node *);
 static unsigned long mmu_node_last(struct mmu_rb_node *);
-static void mmu_notifier_range_start(struct mmu_notifier *,
+static int mmu_notifier_range_start(struct mmu_notifier *,
 				     struct mm_struct *,
-				     unsigned long, unsigned long);
+				     unsigned long, unsigned long, bool);
 static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
 					   unsigned long, unsigned long);
 static void do_remove(struct mmu_rb_handler *handler,
@@ -284,10 +284,11 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
 	handler->ops->remove(handler->ops_arg, node);
 }
 
-static void mmu_notifier_range_start(struct mmu_notifier *mn,
+static int mmu_notifier_range_start(struct mmu_notifier *mn,
 				     struct mm_struct *mm,
 				     unsigned long start,
-				     unsigned long end)
+				     unsigned long end,
+				     bool blockable)
 {
 	struct mmu_rb_handler *handler =
 		container_of(mn, struct mmu_rb_handler, mn);
@@ -313,6 +314,8 @@ static void mmu_notifier_range_start(struct mmu_notifier *mn,
 
 	if (added)
 		queue_work(handler->wq, &handler->del_work);
+
+	return 0;
 }
 
 /*
...
@@ -488,7 +488,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 
 	down_read(&ctx->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
-				      mr_leaf_free, imr);
+				      mr_leaf_free, true, imr);
 	up_read(&ctx->umem_rwsem);
 
 	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
...
@@ -200,15 +200,18 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn,
 	schedule_work(&scif_info.misc_work);
 }
 
-static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						     struct mm_struct *mm,
 						     unsigned long start,
-						     unsigned long end)
+						     unsigned long end,
+						     bool blockable)
 {
 	struct scif_mmu_notif	*mmn;
 
 	mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
 	scif_rma_destroy_tcw(mmn, start, end - start);
+
+	return 0;
 }
 
 static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
...
@@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state *gru)
 /*
  * MMUOPS notifier callout functions
  */
-static void gru_invalidate_range_start(struct mmu_notifier *mn,
+static int gru_invalidate_range_start(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
-				       unsigned long start, unsigned long end)
+				       unsigned long start, unsigned long end,
+				       bool blockable)
 {
 	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
 						 ms_notifier);
@@ -231,6 +232,8 @@ static void gru_invalidate_range_start(struct mmu_notifier *mn,
 	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
 		start, end, atomic_read(&gms->ms_range_active));
 	gru_flush_tlb_range(gms, start, end - start);
+
+	return 0;
 }
 
 static void gru_invalidate_range_end(struct mmu_notifier *mn,
...
@@ -479,18 +479,25 @@ static const struct vm_operations_struct gntdev_vmops = {
 
 /* ------------------------------------------------------------------ */
 
+static bool in_range(struct gntdev_grant_map *map,
+		      unsigned long start, unsigned long end)
+{
+	if (!map->vma)
+		return false;
+	if (map->vma->vm_start >= end)
+		return false;
+	if (map->vma->vm_end <= start)
+		return false;
+
+	return true;
+}
+
 static void unmap_if_in_range(struct gntdev_grant_map *map,
 			      unsigned long start, unsigned long end)
 {
 	unsigned long mstart, mend;
 	int err;
 
-	if (!map->vma)
-		return;
-	if (map->vma->vm_start >= end)
-		return;
-	if (map->vma->vm_end <= start)
-		return;
 	mstart = max(start, map->vma->vm_start);
 	mend = min(end, map->vma->vm_end);
 	pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
@@ -503,21 +510,40 @@ static void unmap_if_in_range(struct gntdev_grant_map *map,
 	WARN_ON(err);
 }
 
-static void mn_invl_range_start(struct mmu_notifier *mn,
+static int mn_invl_range_start(struct mmu_notifier *mn,
 				struct mm_struct *mm,
-				unsigned long start, unsigned long end)
+				unsigned long start, unsigned long end,
+				bool blockable)
 {
 	struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
 	struct gntdev_grant_map *map;
+	int ret = 0;
+
+	/* TODO do we really need a mutex here? */
+	if (blockable)
+		mutex_lock(&priv->lock);
+	else if (!mutex_trylock(&priv->lock))
+		return -EAGAIN;
 
-	mutex_lock(&priv->lock);
 	list_for_each_entry(map, &priv->maps, next) {
+		if (in_range(map, start, end)) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
 		unmap_if_in_range(map, start, end);
 	}
 	list_for_each_entry(map, &priv->freeable_maps, next) {
+		if (in_range(map, start, end)) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
 		unmap_if_in_range(map, start, end);
 	}
+
+out_unlock:
 	mutex_unlock(&priv->lock);
+
+	return ret;
 }
 
 static void mn_release(struct mmu_notifier *mn,
...
@@ -1289,8 +1289,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
 
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end);
+int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end, bool blockable);
 
 #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
...
@@ -151,13 +151,15 @@ struct mmu_notifier_ops {
 	 * address space but may still be referenced by sptes until
 	 * the last refcount is dropped.
 	 *
-	 * If both of these callbacks cannot block, and invalidate_range
-	 * cannot block, mmu_notifier_ops.flags should have
-	 * MMU_INVALIDATE_DOES_NOT_BLOCK set.
+	 * If blockable argument is set to false then the callback cannot
+	 * sleep and has to return with -EAGAIN. 0 should be returned
+	 * otherwise.
+	 *
 	 */
-	void (*invalidate_range_start)(struct mmu_notifier *mn,
+	int (*invalidate_range_start)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
-				       unsigned long start, unsigned long end);
+				       unsigned long start, unsigned long end,
+				       bool blockable);
 	void (*invalidate_range_end)(struct mmu_notifier *mn,
 				     struct mm_struct *mm,
 				     unsigned long start, unsigned long end);
@@ -229,8 +231,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
-extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end);
+extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+				  unsigned long start, unsigned long end,
+				  bool blockable);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end,
 				  bool only_end);
@@ -281,7 +284,15 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
 {
 	if (mm_has_notifiers(mm))
-		__mmu_notifier_invalidate_range_start(mm, start, end);
+		__mmu_notifier_invalidate_range_start(mm, start, end, true);
+}
+
+static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_invalidate_range_start(mm, start, end, false);
+	return 0;
 }
 
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -461,6 +472,12 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 {
 }
 
+static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
 {
...
@@ -95,7 +95,7 @@ static inline int check_stable_address_space(struct mm_struct *mm)
 	return 0;
 }
 
-void __oom_reap_task_mm(struct mm_struct *mm);
+bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
...
@@ -119,7 +119,8 @@ typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
  */
 int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 				  u64 start, u64 end,
-				  umem_call_back cb, void *cookie);
+				  umem_call_back cb,
+				  bool blockable, void *cookie);
 
 /*
  * Find first region intersecting with address range.
...
@@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 	up_write(&hmm->mirrors_sem);
 }
 
-static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+static int hmm_invalidate_range_start(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long start,
-				       unsigned long end)
+				       unsigned long end,
+				       bool blockable)
 {
 	struct hmm *hmm = mm->hmm;
 
 	VM_BUG_ON(!hmm);
 
 	atomic_inc(&hmm->sequence);
+
+	return 0;
 }
 
 static void hmm_invalidate_range_end(struct mmu_notifier *mn,
...
@@ -3064,7 +3064,7 @@ void exit_mmap(struct mm_struct *mm)
 		 * reliably test it.
 		 */
 		mutex_lock(&oom_lock);
-		__oom_reap_task_mm(mm);
+		(void)__oom_reap_task_mm(mm);
 		mutex_unlock(&oom_lock);
 
 		set_bit(MMF_OOM_SKIP, &mm->flags);
...
@@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 	srcu_read_unlock(&srcu, id);
 }
 
-void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
-				  unsigned long start, unsigned long end)
+int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+				  unsigned long start, unsigned long end,
+				  bool blockable)
 {
 	struct mmu_notifier *mn;
+	int ret = 0;
 	int id;
 
 	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-		if (mn->ops->invalidate_range_start)
-			mn->ops->invalidate_range_start(mn, mm, start, end);
+		if (mn->ops->invalidate_range_start) {
+			int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable);
+
+			if (_ret) {
+				pr_info("%pS callback failed with %d in %sblockable context.\n",
+						mn->ops->invalidate_range_start, _ret,
+						!blockable ? "non-" : "");
+				ret = _ret;
+			}
+		}
 	}
 	srcu_read_unlock(&srcu, id);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
...
@@ -487,9 +487,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-void __oom_reap_task_mm(struct mm_struct *mm)
+bool __oom_reap_task_mm(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
+	bool ret = true;
 
 	/*
 	 * Tell all users of get_user/copy_from_user etc... that the content
@@ -519,12 +520,17 @@ void __oom_reap_task_mm(struct mm_struct *mm)
 			struct mmu_gather tlb;
 
 			tlb_gather_mmu(&tlb, mm, start, end);
-			mmu_notifier_invalidate_range_start(mm, start, end);
+			if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
+				ret = false;
+				continue;
+			}
 			unmap_page_range(&tlb, vma, start, end, NULL);
 			mmu_notifier_invalidate_range_end(mm, start, end);
 			tlb_finish_mmu(&tlb, start, end);
 		}
 	}
+
+	return ret;
 }
 
 static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
@@ -553,18 +559,6 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		goto unlock_oom;
 	}
 
-	/*
-	 * If the mm has invalidate_{start,end}() notifiers that could block,
-	 * sleep to give the oom victim some more time.
-	 * TODO: we really want to get rid of this ugly hack and make sure that
-	 * notifiers cannot block for unbounded amount of time
-	 */
-	if (mm_has_blockable_invalidate_notifiers(mm)) {
-		up_read(&mm->mmap_sem);
-		schedule_timeout_idle(HZ);
-		goto unlock_oom;
-	}
-
 	/*
 	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
 	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
@@ -579,7 +573,12 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
 	trace_start_task_reaping(tsk->pid);
 
-	__oom_reap_task_mm(mm);
+	/* failed to reap part of the address space. Try again later */
+	if (!__oom_reap_task_mm(mm)) {
+		up_read(&mm->mmap_sem);
+		ret = false;
+		goto unlock_oom;
+	}
 
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
 			task_pid_nr(tsk), tsk->comm,
...
@@ -140,9 +140,10 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 static unsigned long long kvm_createvm_count;
 static unsigned long long kvm_active_vms;
 
-__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end)
+__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end, bool blockable)
 {
+	return 0;
 }
 
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
@@ -360,13 +361,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
-static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						    struct mm_struct *mm,
 						    unsigned long start,
-						    unsigned long end)
+						    unsigned long end,
+						    bool blockable)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int need_tlb_flush = 0, idx;
+	int ret;
 
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
@@ -384,9 +387,11 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 
 	spin_unlock(&kvm->mmu_lock);
 
-	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
+	ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable);
 
 	srcu_read_unlock(&kvm->srcu, idx);
+
+	return ret;
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
...