• David Hildenbrand's avatar
    mm/userfaultfd: propagate uffd-wp bit when PTE-mapping the huge zeropage · 42b2af2c
    David Hildenbrand authored
    Currently, we'd lose the userfaultfd-wp marker when PTE-mapping a huge
    zeropage, resulting in the next write faults in the PMD range not
    triggering uffd-wp events.
    
    Various actions (partial MADV_DONTNEED, partial mremap, partial munmap,
    partial mprotect) could trigger this.  However, most importantly,
    un-protecting a single sub-page from the userfaultfd-wp handler when
    processing a uffd-wp event will PTE-map the shared huge zeropage and lose
    the uffd-wp bit for the remainder of the PMD.
    
    Let's properly propagate the uffd-wp bit from the PMD to the PTEs.
    
     #define _GNU_SOURCE
     #include <stdio.h>
     #include <stdlib.h>
     #include <stdint.h>
     #include <stdbool.h>
     #include <inttypes.h>
     #include <fcntl.h>
     #include <unistd.h>
     #include <errno.h>
     #include <poll.h>
     #include <pthread.h>
     #include <sys/mman.h>
     #include <sys/syscall.h>
     #include <sys/ioctl.h>
     #include <linux/userfaultfd.h>
    
     /* System page size in bytes; set once in main() via getpagesize(). */
     static size_t pagesize;
     /* userfaultfd file descriptor; assigned in setup_uffd(). */
     static int uffd;
     /*
      * Set to true by the uffd handler thread when it sees a uffd-wp fault;
      * main() resets it to false before each test write.
      */
     static volatile bool uffd_triggered;
    
     /*
      * Compiler-only barrier: an empty asm statement with a "memory" clobber,
      * so the compiler must assume memory changed across it.
      */
     #define barrier() __asm__ __volatile__("": : :"memory")
    
     /*
      * Enable (wp == true) or remove (wp == false) userfaultfd write protection
      * on the range [start, start + size). Exits the process if the ioctl fails.
      */
     static void uffd_wp_range(char *start, size_t size, bool wp)
     {
     	struct uffdio_writeprotect req = {
     		.range = {
     			.start = (unsigned long) start,
     			.len = size,
     		},
     		.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
     	};
    
     	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &req) == 0)
     		return;
    
     	fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
     	exit(1);
     }
    
     /*
      * Handler thread: waits for events on the userfaultfd, checks that each one
      * is a write-protect page fault, records that it fired and then removes
      * write protection from the single faulting page. Never returns normally.
      */
     static void *uffd_thread_fn(void *arg)
     {
     	static struct uffd_msg msg;
    
     	for (;;) {
     		struct pollfd pfd = { .fd = uffd, .events = POLLIN };
     		ssize_t len;
     		bool is_wp_fault;
    
     		if (poll(&pfd, 1, -1) == -1) {
     			fprintf(stderr, "poll() failed: %d\n", errno);
     			exit(1);
     		}
    
     		/* No message was read: go back to polling. */
     		len = read(uffd, &msg, sizeof(msg));
     		if (len <= 0)
     			continue;
    
     		is_wp_fault = msg.event == UFFD_EVENT_PAGEFAULT &&
     			      (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP);
     		if (!is_wp_fault) {
     			printf("FAIL: wrong uffd-wp event fired\n");
     			exit(1);
     		}
    
     		/* Record the event first, then un-protect the faulting page. */
     		uffd_triggered = true;
     		uffd_wp_range((char *)(uintptr_t)msg.arg.pagefault.address,
     			      pagesize, false);
     	}
     	return arg;
     }
    
     /*
      * Create the userfaultfd, negotiate the API, register [map, map + size)
      * in write-protect mode and start the fault-handling thread.
      *
      * Returns 0 on success or a negative error number on failure.
      */
     static int setup_uffd(char *map, size_t size)
     {
     	struct uffdio_api uffdio_api;
     	struct uffdio_register uffdio_register;
     	pthread_t thread;
     	int err;
    
     	uffd = syscall(__NR_userfaultfd,
     		       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
     	if (uffd < 0) {
     		/* Save errno before fprintf(), which is allowed to change it. */
     		err = errno;
     		fprintf(stderr, "syscall() failed: %d\n", err);
     		return -err;
     	}
    
     	uffdio_api.api = UFFD_API;
     	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
     	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
     		err = errno;
     		fprintf(stderr, "UFFDIO_API failed: %d\n", err);
     		return -err;
     	}
    
     	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
     		fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
     		return -ENOSYS;
     	}
    
     	uffdio_register.range.start = (unsigned long) map;
     	uffdio_register.range.len = size;
     	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
     	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
     		err = errno;
     		fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", err);
     		return -err;
     	}
    
     	/*
     	 * pthread_create() returns the error number directly instead of
     	 * setting errno. Without a handler thread the first write fault
     	 * would block forever, so treat a failure as fatal.
     	 */
     	err = pthread_create(&thread, NULL, uffd_thread_fn, NULL);
     	if (err) {
     		fprintf(stderr, "pthread_create() failed: %d\n", err);
     		return -err;
     	}
    
     	return 0;
     }
    
     /*
      * Reproducer: map a THP-eligible anonymous range, populate it with the
      * (huge) zeropage by reading, write-protect it via userfaultfd and then
      * write to every page, checking that each write raises a uffd-wp event.
      *
      * Prints PASS/FAIL; returns 0 on PASS and non-zero otherwise.
      */
     int main(void)
     {
     	const size_t size = 4 * 1024 * 1024ull;
     	char *map, *cur;
     	int err;
    
     	pagesize = getpagesize();
    
     	map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
     	if (map == MAP_FAILED) {
     		/* Save errno before fprintf(), which is allowed to change it. */
     		err = errno;
     		fprintf(stderr, "mmap() failed\n");
     		return -err;
     	}
    
     	if (madvise(map, size, MADV_HUGEPAGE)) {
     		err = errno;
     		fprintf(stderr, "MADV_HUGEPAGE failed\n");
     		return -err;
     	}
    
     	if (setup_uffd(map, size))
     		return 1;
    
     	/*
     	 * Read the whole range, populating zeropages. If this fails the
     	 * range is not populated and the checks below would be meaningless,
     	 * so bail out instead of reporting a misleading result.
     	 */
     	if (madvise(map, size, MADV_POPULATE_READ)) {
     		err = errno;
     		fprintf(stderr, "MADV_POPULATE_READ failed\n");
     		return -err;
     	}
    
     	/* Write-protect the whole range. */
     	uffd_wp_range(map, size, true);
    
     	/* Make sure uffd-wp triggers on each page. */
     	for (cur = map; cur < map + size; cur += pagesize) {
     		uffd_triggered = false;
    
     		barrier();
     		/* Trigger a write fault. */
     		*cur = 1;
     		barrier();
    
     		if (!uffd_triggered) {
     			printf("FAIL: uffd-wp did not trigger\n");
     			return 1;
     		}
     	}
    
     	printf("PASS: uffd-wp triggered\n");
     	return 0;
     }
    
    Link: https://lkml.kernel.org/r/20230302175423.589164-1-david@redhat.com
    Fixes: e06f1e1d ("userfaultfd: wp: enabled write protection in userfaultfd API")
    Signed-off-by: David Hildenbrand <david@redhat.com>
    Acked-by: Peter Xu <peterx@redhat.com>
    Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
    Cc: Andrea Arcangeli <aarcange@redhat.com>
    Cc: Jerome Glisse <jglisse@redhat.com>
    Cc: Shaohua Li <shli@fb.com>
    Cc: <stable@vger.kernel.org>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    42b2af2c
huge_memory.c 88.3 KB