Commit 02891844, authored by Axel Rasmussen, committed by Andrew Morton

mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs

UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new PTE
to resolve a missing fault, one can install a write-protected one.  This
is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in combination.

This was motivated by testing HugeTLB HGM [1], and in particular its
interaction with userfaultfd features.  Existing userfaultfd code supports
using WP and MINOR modes together (i.e.  you can register an area with
both enabled), but without this CONTINUE flag the combination is in
practice unusable.

So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing as
UFFDIO_COPY_MODE_WP, but for *minor* faults.

Update the selftest to do some very basic exercising of the new flag.

Update Documentation/ to describe how these flags are used (neither the
COPY nor the new CONTINUE versions of this mode flag were described there
before).

[1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/

Link: https://lkml.kernel.org/r/20230314221250.682452-5-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent d9712937
...@@ -236,6 +236,14 @@ newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED ...@@ -236,6 +236,14 @@ newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED
and set the feature bit in advance to make sure none ptes will also be and set the feature bit in advance to make sure none ptes will also be
write protected even upon anonymous memory. write protected even upon anonymous memory.
When using ``UFFDIO_REGISTER_MODE_WP`` in combination with either
``UFFDIO_REGISTER_MODE_MISSING`` or ``UFFDIO_REGISTER_MODE_MINOR``, when
resolving missing / minor faults with ``UFFDIO_COPY`` or ``UFFDIO_CONTINUE``
respectively, it may be desirable for the new page / mapping to be
write-protected (so future writes will also result in a WP fault). These ioctls
support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP``
respectively) to configure the mapping this way.
QEMU/KVM QEMU/KVM
======== ========
......
...@@ -1893,6 +1893,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) ...@@ -1893,6 +1893,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
struct uffdio_continue uffdio_continue; struct uffdio_continue uffdio_continue;
struct uffdio_continue __user *user_uffdio_continue; struct uffdio_continue __user *user_uffdio_continue;
struct userfaultfd_wake_range range; struct userfaultfd_wake_range range;
uffd_flags_t flags = 0;
user_uffdio_continue = (struct uffdio_continue __user *)arg; user_uffdio_continue = (struct uffdio_continue __user *)arg;
...@@ -1917,13 +1918,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) ...@@ -1917,13 +1918,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
uffdio_continue.range.start) { uffdio_continue.range.start) {
goto out; goto out;
} }
if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
UFFDIO_CONTINUE_MODE_WP))
goto out; goto out;
if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) { if (mmget_not_zero(ctx->mm)) {
ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len, uffdio_continue.range.len,
&ctx->mmap_changing); &ctx->mmap_changing, flags);
mmput(ctx->mm); mmput(ctx->mm);
} else { } else {
return -ESRCH; return -ESRCH;
......
...@@ -83,7 +83,8 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, ...@@ -83,7 +83,8 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
unsigned long len, unsigned long len,
atomic_t *mmap_changing); atomic_t *mmap_changing);
extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long len, atomic_t *mmap_changing); unsigned long len, atomic_t *mmap_changing,
uffd_flags_t flags);
extern int mwriteprotect_range(struct mm_struct *dst_mm, extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len, unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing); bool enable_wp, atomic_t *mmap_changing);
......
...@@ -305,6 +305,13 @@ struct uffdio_writeprotect { ...@@ -305,6 +305,13 @@ struct uffdio_writeprotect {
struct uffdio_continue { struct uffdio_continue {
struct uffdio_range range; struct uffdio_range range;
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
/*
* UFFDIO_CONTINUE_MODE_WP will map the page write protected on
* the fly. UFFDIO_CONTINUE_MODE_WP is available only if the
* write protected ioctl is implemented for the range
* according to the uffdio_register.ioctls.
*/
#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1)
__u64 mode; __u64 mode;
/* /*
......
...@@ -693,10 +693,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, ...@@ -693,10 +693,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
} }
ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, atomic_t *mmap_changing) unsigned long len, atomic_t *mmap_changing,
uffd_flags_t flags)
{ {
return mfill_atomic(dst_mm, start, 0, len, mmap_changing, return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
uffd_flags_set_mode(0, MFILL_ATOMIC_CONTINUE)); uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
} }
long uffd_wp_range(struct vm_area_struct *dst_vma, long uffd_wp_range(struct vm_area_struct *dst_vma,
......
...@@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len) ...@@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len)
req.range.start = start; req.range.start = start;
req.range.len = len; req.range.len = len;
req.mode = 0; req.mode = 0;
if (test_uffdio_wp)
req.mode |= UFFDIO_CONTINUE_MODE_WP;
if (ioctl(ufd, UFFDIO_CONTINUE, &req)) if (ioctl(ufd, UFFDIO_CONTINUE, &req))
err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
...@@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void) ...@@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void)
uffdio_register.range.start = (unsigned long)area_dst_alias; uffdio_register.range.start = (unsigned long)area_dst_alias;
uffdio_register.range.len = nr_pages * page_size; uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
if (test_uffdio_wp)
uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
err("register failure"); err("register failure");
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment