Commit f25a546e authored by Jason Gunthorpe

RDMA/odp: Use mmu_interval_notifier_insert()

Replace the internal interval tree based mmu notifier with the new common
mmu_interval_notifier_insert() API. This removes a lot of code and fixes a
deadlock that can be triggered in ODP:

 zap_page_range()
  mmu_notifier_invalidate_range_start()
   [..]
    ib_umem_notifier_invalidate_range_start()
       down_read(&per_mm->umem_rwsem)
  unmap_single_vma()
    [..]
      __split_huge_page_pmd()
        mmu_notifier_invalidate_range_start()
        [..]
           ib_umem_notifier_invalidate_range_start()
              down_read(&per_mm->umem_rwsem)   // DEADLOCK

        mmu_notifier_invalidate_range_end()
           up_read(&per_mm->umem_rwsem)
  mmu_notifier_invalidate_range_end()
     up_read(&per_mm->umem_rwsem)

The umem_rwsem is held across the range_start/end as the ODP algorithm for
invalidate_range_end cannot tolerate changes to the interval
tree. However, due to the nested invalidation regions the second
down_read() can deadlock if there are competing writers. The new core code
provides an alternative scheme to solve this problem.

Fixes: ca748c39 ("RDMA/umem: Get rid of per_mm->notifier_count")
Link: https://lore.kernel.org/r/20191112202231.3856-6-jgg@ziepe.ca
Tested-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent 107e8998
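
For context, the collision-retry scheme the new core API provides works roughly as in the driver-side sketch below. This is illustrative only and not part of this commit: struct my_umem, my_mn_ops, my_invalidate(), my_fault(), my_umem_register() and the my_*() helper prototypes are hypothetical stand-ins for driver state and hardware programming. Instead of holding a semaphore from invalidate_range_start() to invalidate_range_end(), the invalidate callback bumps a per-range sequence under a driver lock, and the page-fault path re-checks that sequence after pinning pages and retries if it changed.

/*
 * Illustrative sketch only (not part of this commit). All my_* names are
 * hypothetical; only the mmu_interval_* calls are the real core API.
 */
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

struct my_umem {
	struct mmu_interval_notifier notifier;	/* registered below */
	struct mutex lock;			/* serializes HW mapping updates */
	/* ... driver page/DMA bookkeeping ... */
};

/* Hypothetical helpers standing in for the driver's real work. */
void my_zap_hw_mappings(struct my_umem *umem, unsigned long start, unsigned long end);
int my_pin_and_map_pages(struct my_umem *umem, unsigned long va, size_t len);
void my_program_hw(struct my_umem *umem, unsigned long va, size_t len);

static bool my_invalidate(struct mmu_interval_notifier *mni,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	struct my_umem *umem = container_of(mni, struct my_umem, notifier);

	if (!mmu_notifier_range_blockable(range))
		return false;	/* core retries in a blockable context */

	mutex_lock(&umem->lock);
	/* Publish the new sequence so concurrent faulters know to retry. */
	mmu_interval_set_seq(mni, cur_seq);
	my_zap_hw_mappings(umem, range->start, range->end);
	mutex_unlock(&umem->lock);
	return true;
}

static const struct mmu_interval_notifier_ops my_mn_ops = {
	.invalidate = my_invalidate,
};

static int my_umem_register(struct my_umem *umem, struct mm_struct *mm,
			    unsigned long start, unsigned long length)
{
	mutex_init(&umem->lock);
	return mmu_interval_notifier_insert(&umem->notifier, mm, start, length,
					    &my_mn_ops);
}

static int my_fault(struct my_umem *umem, unsigned long va, size_t len)
{
	unsigned long seq;
	int ret;

again:
	seq = mmu_interval_read_begin(&umem->notifier);
	ret = my_pin_and_map_pages(umem, va, len);	/* may sleep, no locks held */
	if (ret)
		return ret;

	mutex_lock(&umem->lock);
	if (mmu_interval_read_retry(&umem->notifier, seq)) {
		/*
		 * An invalidation ran concurrently; retry with a fresh
		 * sequence (a real driver would also release what it pinned).
		 */
		mutex_unlock(&umem->lock);
		goto again;
	}
	my_program_hw(umem, va, len);	/* pages are known-current under the lock */
	mutex_unlock(&umem->lock);
	return 0;
}

In the mlx5 conversion below, umem_odp->umem_mutex plays the role of umem->lock in this sketch, so the per-mm umem_rwsem, and with it the nested down_read(), goes away.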
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2617,7 +2617,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, get_vf_config);
 	SET_DEVICE_OP(dev_ops, get_vf_stats);
 	SET_DEVICE_OP(dev_ops, init_port);
-	SET_DEVICE_OP(dev_ops, invalidate_range);
 	SET_DEVICE_OP(dev_ops, iw_accept);
 	SET_DEVICE_OP(dev_ops, iw_add_ref);
 	SET_DEVICE_OP(dev_ops, iw_connect);
This diff is collapsed.
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1263,8 +1263,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
-			      unsigned long end);
 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
 			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
@@ -1294,11 +1292,10 @@ mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 {
 	return -EOPNOTSUPP;
 }
-static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
-					    unsigned long start,
-					    unsigned long end){};
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
+extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
+
 /* Needed for rep profile */
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
 		      const struct mlx5_ib_profile *profile,
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -743,7 +743,8 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	if (access_flags & IB_ACCESS_ON_DEMAND) {
 		struct ib_umem_odp *odp;
 
-		odp = ib_umem_odp_get(udata, start, length, access_flags);
+		odp = ib_umem_odp_get(udata, start, length, access_flags,
+				      &mlx5_mn_ops);
 		if (IS_ERR(odp)) {
 			mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
 				    PTR_ERR(odp));
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -241,17 +241,26 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
 	xa_unlock(&imr->implicit_children);
 }
 
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
-			      unsigned long end)
+static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
+				     const struct mmu_notifier_range *range,
+				     unsigned long cur_seq)
 {
+	struct ib_umem_odp *umem_odp =
+		container_of(mni, struct ib_umem_odp, notifier);
 	struct mlx5_ib_mr *mr;
 	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
 				    sizeof(struct mlx5_mtt)) - 1;
 	u64 idx = 0, blk_start_idx = 0;
+	unsigned long start;
+	unsigned long end;
 	int in_block = 0;
 	u64 addr;
 
+	if (!mmu_notifier_range_blockable(range))
+		return false;
+
 	mutex_lock(&umem_odp->umem_mutex);
+	mmu_interval_set_seq(mni, cur_seq);
 	/*
 	 * If npages is zero then umem_odp->private may not be setup yet. This
 	 * does not complete until after the first page is mapped for DMA.
@@ -260,8 +269,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 		goto out;
 	mr = umem_odp->private;
 
-	start = max_t(u64, ib_umem_start(umem_odp), start);
-	end = min_t(u64, ib_umem_end(umem_odp), end);
+	start = max_t(u64, ib_umem_start(umem_odp), range->start);
+	end = min_t(u64, ib_umem_end(umem_odp), range->end);
 
 	/*
 	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -312,8 +321,13 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 		destroy_unused_implicit_child_mr(mr);
 out:
 	mutex_unlock(&umem_odp->umem_mutex);
+	return true;
 }
 
+const struct mmu_interval_notifier_ops mlx5_mn_ops = {
+	.invalidate = mlx5_ib_invalidate_range,
+};
+
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
 	struct ib_odp_caps *caps = &dev->odp_caps;
@@ -414,7 +428,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 
 	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
 				      idx * MLX5_IMR_MTT_SIZE,
-				      MLX5_IMR_MTT_SIZE);
+				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
 	if (IS_ERR(odp))
 		return ERR_CAST(odp);
 
@@ -600,8 +614,9 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 			u64 user_va, size_t bcnt, u32 *bytes_mapped,
 			u32 flags)
 {
-	int current_seq, page_shift, ret, np;
+	int page_shift, ret, np;
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
+	unsigned long current_seq;
 	u64 access_mask;
 	u64 start_idx, page_mask;
 
@@ -613,12 +628,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 	if (odp->umem.writable && !downgrade)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
 
-	current_seq = READ_ONCE(odp->notifiers_seq);
-	/*
-	 * Ensure the sequence number is valid for some time before we call
-	 * gup.
-	 */
-	smp_rmb();
+	current_seq = mmu_interval_read_begin(&odp->notifier);
 
 	np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask,
 				       current_seq);
@@ -626,7 +636,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 		return np;
 
 	mutex_lock(&odp->umem_mutex);
-	if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
+	if (!mmu_interval_read_retry(&odp->notifier, current_seq)) {
 		/*
 		 * No need to check whether the MTTs really belong to
 		 * this MR, since ib_umem_odp_map_dma_pages already
@@ -656,19 +666,6 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 	return np << (page_shift - PAGE_SHIFT);
 
 out:
-	if (ret == -EAGAIN) {
-		unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
-
-		if (!wait_for_completion_timeout(&odp->notifier_completion,
-						 timeout)) {
-			mlx5_ib_warn(
-				mr->dev,
-				"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
-				current_seq, odp->notifiers_seq,
-				odp->notifiers_count);
-		}
-	}
-
 	return ret;
 }
 
@@ -1609,7 +1606,6 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
 
 static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
 	.advise_mr = mlx5_ib_advise_mr,
-	.invalidate_range = mlx5_ib_invalidate_range,
 };
 
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -35,11 +35,11 @@
 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
-#include <linux/interval_tree.h>
 
 struct ib_umem_odp {
 	struct ib_umem umem;
-	struct ib_ucontext_per_mm *per_mm;
+	struct mmu_interval_notifier notifier;
+	struct pid *tgid;
 
 	/*
 	 * An array of the pages included in the on-demand paging umem.
@@ -62,13 +62,8 @@ struct ib_umem_odp {
 	struct mutex		umem_mutex;
 	void			*private; /* for the HW driver to use. */
 
-	int notifiers_seq;
-	int notifiers_count;
 	int npages;
 
-	/* Tree tracking */
-	struct interval_tree_node interval_tree;
-
 	/*
 	 * An implicit odp umem cannot be DMA mapped, has 0 length, and serves
	 * only as an anchor for the driver to hold onto the per_mm. FIXME:
@@ -77,7 +72,6 @@ struct ib_umem_odp {
 	 */
 	bool is_implicit_odp;
 
-	struct completion	notifier_completion;
 	unsigned int		page_shift;
 };
 
@@ -89,13 +83,13 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
 /* Returns the first page of an ODP umem. */
 static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp)
 {
-	return umem_odp->interval_tree.start;
+	return umem_odp->notifier.interval_tree.start;
 }
 
 /* Returns the address of the page after the last one of an ODP umem. */
 static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp)
 {
-	return umem_odp->interval_tree.last + 1;
+	return umem_odp->notifier.interval_tree.last + 1;
 }
 
 static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
@@ -119,21 +113,15 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 
-struct ib_ucontext_per_mm {
-	struct mmu_notifier mn;
-	struct pid *tgid;
-
-	struct rb_root_cached umem_tree;
-	/* Protects umem_tree */
-	struct rw_semaphore umem_rwsem;
-};
-
-struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
-				    size_t size, int access);
+struct ib_umem_odp *
+ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
+		int access, const struct mmu_interval_notifier_ops *ops);
 struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
 					       int access);
-struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem,
-					    unsigned long addr, size_t size);
+struct ib_umem_odp *
+ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr,
+			size_t size,
+			const struct mmu_interval_notifier_ops *ops);
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp);
 
 int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
@@ -143,39 +131,11 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
 				 u64 bound);
 
-typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end,
-			      void *cookie);
-
-/*
- * Call the callback on each ib_umem in the range. Returns the logical or of
- * the return values of the functions called.
- */
-int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
-				  u64 start, u64 end,
-				  umem_call_back cb,
-				  bool blockable, void *cookie);
-
-static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
-					     unsigned long mmu_seq)
-{
-	/*
-	 * This code is strongly based on the KVM code from
-	 * mmu_notifier_retry. Should be called with
-	 * the relevant locks taken (umem_odp->umem_mutex
-	 * and the ucontext umem_mutex semaphore locked for read).
-	 */
-	if (unlikely(umem_odp->notifiers_count))
-		return 1;
-	if (umem_odp->notifiers_seq != mmu_seq)
-		return 1;
-	return 0;
-}
-
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
-static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata,
-						  unsigned long addr,
-						  size_t size, int access)
+static inline struct ib_umem_odp *
+ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
+		int access, const struct mmu_interval_notifier_ops *ops)
 {
 	return ERR_PTR(-EINVAL);
 }
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2422,8 +2422,6 @@ struct ib_device_ops {
 			  u64 iova);
 	int (*unmap_fmr)(struct list_head *fmr_list);
 	int (*dealloc_fmr)(struct ib_fmr *fmr);
-	void (*invalidate_range)(struct ib_umem_odp *umem_odp,
-				 unsigned long start, unsigned long end);
 	int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
 	int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
 	struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device,