Commit 9a293c2d by Kirill Smelkov

bigfile/virtmem: Userspace Virtual Memory Manager

Does similar things to what kernel does - users can mmap file parts into
address space and access them read/write. The manager will be getting
invoked by hardware/OS kernel for cases when there is no page loaded for
read, or when a previously read-only page is being written to.

In addition to the features provided by the kernel, it supports storing
changes back in a transactional way (see fileh_dirty_writeout()) and can
potentially use huge pages for mappings (though this is currently a TODO).
1 parent 9065e2b9
......@@ -62,6 +62,19 @@ LOADLIBES=lib/bug.c lib/utils.c 3rdparty/ccan/ccan/tap/tap.c
TESTS := $(patsubst %.c,%,$(wildcard bigfile/tests/test_*.c))
test : test.t test.fault test.asan test.tsan test.vgmem test.vghel test.vgdrd
# TODO move XFAIL markers into *.c
# TSAN fails on test_virtmem (http://code.google.com/p/thread-sanitizer/issues/detail?id=75)
# NOTE the bug was fixed in compiler-rt 20140917 (6afe775d)
# -> we can remove this xfail when the fix propagates to gcc/clang release
XFAIL_bigfile/tests/test_virtmem.tsanrun := y
# Before calling our SIGSEGV handler, Memcheck first reports "invalid read|write" error.
# A solution could be to tell memcheck via VALGRIND_MAKE_MEM_DEFINED that VMA
# address space is ok to access _before_ handling pagefault.
# http://valgrind.org/docs/manual/mc-manual.html#mc-manual.clientreqs
XFAIL_bigfile/tests/test_virtmem.vgmemrun := y
# extract what goes after RUNWITH: marker from command source, or empty if no marker
runwith = $(shell grep -oP '(?<=^// RUNWITH: ).*' $(basename $1).c)
......
......@@ -24,6 +24,10 @@
* read/write, and tail to vma_on_pagefault().
*/
#include <wendelin/bigfile/virtmem.h>
#include <wendelin/bigfile/file.h>
#include <wendelin/bigfile/ram.h>
#include <wendelin/bigfile/pagemap.h>
#include <wendelin/bug.h>
#include <signal.h>
......@@ -44,6 +48,7 @@ static void on_pagefault(int sig, siginfo_t *si, void *_uc)
{
struct ucontext *uc = _uc;
unsigned write;
VMA *vma;
BUG_ON(sig != SIGSEGV);
BUG_ON(si->si_signo != SIGSEGV);
......@@ -63,8 +68,9 @@ static void on_pagefault(int sig, siginfo_t *si, void *_uc)
// XXX locking
/* (1) addr -> vma ;lookup VMA covering faulting memory address */
// TODO
goto dont_handle;
vma = virt_lookup_vma(si->si_addr);
if (!vma)
goto dont_handle; /* fault outside registered file slices */
/* now, since we found faulting address in registered memory areas, we know
* we should serve this pagefault. */
......@@ -76,7 +82,7 @@ static void on_pagefault(int sig, siginfo_t *si, void *_uc)
/* save/restore errno XXX & the like ? */
int save_errno = errno;
// TODO handle pagefault at si->si_addr / write
vma_on_pagefault(vma, (uintptr_t)si->si_addr, write);
errno = save_errno;
......
......@@ -21,7 +21,9 @@
*/
#include <wendelin/bigfile/ram.h>
#include <wendelin/bigfile/file.h>
#include <wendelin/bigfile/virtmem.h>
#include <wendelin/bigfile/pagemap.h>
#include <wendelin/utils.h>
#include <wendelin/bug.h>
......
......@@ -21,6 +21,7 @@
// XXX better link with it
#include "../ram.c"
#include "../pagemap.c"
#include "../virtmem.c"
#include "../ram_shmfs.c"
......
......@@ -26,6 +26,10 @@
*/
// XXX better link with it
#include "../virtmem.c"
#include "../pagemap.c"
#include "../ram.c"
#include "../ram_shmfs.c"
#include "../pagefault.c"
#include <ccan/tap/tap.h>
......@@ -33,6 +37,8 @@
#include <stdio.h>
#include <string.h>
#include "../../t/t_utils.h"
static void prefault()
{
......@@ -64,6 +70,115 @@ void fault_write()
}
/* fault in loadblk (= doublefault)
 *
 * Test scenario expected to terminate the process: a 2-page mapping is
 * created over a file whose loadblk callback itself touches another page of
 * the same mapping. Touching page[0] therefore enters loadblk, which touches
 * page[1] while the first fault is still being served.
 *
 * NOTE faulty_loadblk is a nested function (GNU C extension) - it captures
 * `vma` and `PS` from the enclosing scope.
 */
void fault_in_loadblk()
{
RAM *ram;
BigFileH fh;
VMA vma_struct, *vma = &vma_struct;
size_t PS;
int err;
diag("testing pagefault v.s. fault in loadblk");
// XXX save/restore sigaction ?
ok1(!pagefault_init());
ram = ram_new(NULL,NULL);
ok1(ram);
/* page size of the RAM backing the mapping; used to address page[1] below */
PS = ram->pagesize;
/* loadblk, simulating error in storage layer, touches memory in vma for
* another blk -> doublefault */
int faulty_loadblk(BigFile *file, blk_t blk, void *buf)
{
/* touch page[1] - should crash here */
b(vma, 1*PS);
return 0;
}
/* file operations providing only the faulty loader */
const struct bigfile_ops faulty_ops = {
.loadblk = faulty_loadblk,
};
BigFile f = {
.blksize = ram->pagesize, /* artificial */
.file_ops = &faulty_ops,
};
err = fileh_open(&fh, &f, ram);
ok1(!err);
/* map file pages [0, 2) so that both page[0] and page[1] belong to vma */
err = fileh_mmap(vma, &fh, 0, 2);
ok1(!err);
/* touch page[0] - should dive into loadblk and doublefault there */
/* NOTE(review): prefault() is defined earlier in this file; its body is not
 * visible here - presumably it prepares for the expected crash. */
prefault();
b(vma, 0);
}
/* fault in storeblk (single fault - but should die)
 *
 * Test scenario expected to terminate the process: page[0] of a 2-page
 * mapping is made dirty, then writeout is requested with WRITEOUT_STORE;
 * the storeblk callback reads page[1] of the same mapping, which is expected
 * to crash.
 *
 * NOTE empty_loadblk / faulty_storeblk are nested functions (GNU C
 * extension); faulty_storeblk captures `vma` and `PS` from the enclosing
 * scope.
 */
void fault_in_storeblk()
{
RAM *ram;
BigFileH fh;
VMA vma_struct, *vma = &vma_struct;
size_t PS;
int err;
diag("testing pagefault v.s. fault in storeblk");
// XXX save/restore sigaction ?
ok1(!pagefault_init());
ram = ram_new(NULL,NULL);
ok1(ram);
/* page size of the RAM backing the mapping; used to address page[1] below */
PS = ram->pagesize;
/* empty loadblk - memory will just stay as it is (all 0) */
int empty_loadblk(BigFile *file, blk_t blk, void *buf)
{ return 0; }
/* storeblk "incorrectly" accesses other protected memory which should be
* caught and result in SIGSEGV */
int faulty_storeblk(BigFile *file, blk_t blk, const void *buf)
{
/* read page[1] - should crash here */
b(vma, 1*PS);
return 0;
}
/* file operations: loading is a no-op, storing is the faulty one */
const struct bigfile_ops faulty_ops = {
.loadblk = empty_loadblk,
.storeblk = faulty_storeblk,
};
BigFile f = {
.blksize = ram->pagesize, /* artificial */
.file_ops = &faulty_ops,
};
err = fileh_open(&fh, &f, ram);
ok1(!err);
/* map file pages [0, 2) so that both page[0] and page[1] belong to vma */
err = fileh_mmap(vma, &fh, 0, 2);
ok1(!err);
/* write to page[0] -> page[0] becomes dirty */
b(vma, 0) = 1;
/* writeout calls storeblk which faults */
/* NOTE(review): prefault() is defined earlier in this file; its body is not
 * visible here - presumably it prepares for the expected crash. */
prefault();
fileh_dirty_writeout(&fh, WRITEOUT_STORE);
}
static const struct {
const char *name;
void (*test)(void);
......@@ -72,6 +187,8 @@ static const struct {
// name func-where-it-dies
{"faultr", fault_read}, // on_pagefault
{"faultw", fault_write}, // on_pagefault
{"fault_loadblk", fault_in_loadblk}, // faulty_loadblk
{"fault_storeblk", fault_in_storeblk}, // faulty_storeblk
};
int main(int argc, char *argv[])
......
......@@ -18,15 +18,62 @@
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*
* ~~~~~~~~
*
* Virtual memory connects BigFile content and RAM pages into file memory
* mappings.
*
* Read access to mapped pages cause their on-demand loading, and write access
* marks modified pages as dirty. Dirty pages then can be on request either
* written out back to file or discarded.
*/
#include <stdint.h>
#include <wendelin/list.h>
#include <wendelin/bigfile/types.h>
#include <wendelin/bigfile/pagemap.h>
#include <ccan/bitmap/bitmap.h> // XXX can't forward-decl for bitmap
typedef struct RAM RAM;
typedef struct RAMH RAMH;
typedef struct Page Page;
typedef struct BigFile BigFile;
/* BigFile Handle
*
* BigFile handle is a representation of file snapshot that could be locally
* modified in-memory. The changes could be later either discarded or stored
* back to file. One file can have many opened handles each with its own
* modifications and optionally ram.
*/
struct BigFileH {
/* file this handle was opened for */
BigFile *file;
/* ram handle, backing this fileh mappings */
RAMH *ramh;
/* fileh mappings (list of VMA)
* NOTE current design assumes there will be not many mappings
* so instead of backpointers from pages to vma mapping entries, we'll
* scan all page->fileh->mmaps to overlap with page.
*/
struct list_head mmaps; /* _ -> vma->same_fileh */
/* {} f_pgoffset -> page */
PageMap pagemap;
/* handle-level dirty flag
 * NOTE(review): presumably set while the handle has modified pages - confirm
 * against virtmem.c */
// XXX not sure we need this
// -> currently is used to know whether to join ZODB DataManager serving ZBigFile
// XXX maybe change into dirty_list in the future?
unsigned dirty : 1;
};
typedef struct BigFileH BigFileH;
/* Page - describes fixed-size item of physical RAM associated with content from fileh */
enum PageState {
PAGE_EMPTY = 0, /* file content has not been loaded yet */
PAGE_LOADED = 1, /* file content has been loaded and was not modified */
......@@ -37,6 +84,10 @@ typedef enum PageState PageState;
struct Page {
PageState state;
/* wrt fileh - associated with */
BigFileH *fileh;
pgoff_t f_pgoffset;
/* wrt ram - associated with */
RAMH* ramh;
pgoff_t ramh_pgoffset;
......@@ -49,6 +100,146 @@ struct Page {
typedef struct Page Page;
/* VMA - virtual memory area representing one fileh mapping
*
* NOTE areas may not overlap in virtual address space
* (in file space they can overlap).
*/
typedef struct VMA VMA;
struct VMA {
uintptr_t addr_start, addr_stop; /* [addr_start, addr_stop) */
BigFileH *fileh; /* for which fileh */
pgoff_t f_pgoffset; /* where starts, in pages */
/* FIXME For approximation 0, VMA(s) are kept in sorted doubly-linked
* list, which is not good for lookup/add/remove performance O(n), but easy to
* program. This should be ok for first draft, as there are not many fileh
* views taken simultaneously.
*
* TODO for better performance, some binary-search-tree should be used.
*/
struct list_head virt_list; /* (virtmem.c::vma_list -> _) */
/* VMA's for the same fileh (fileh->mmaps -> _) */
struct list_head same_fileh;
/* per-page bitmap: bit #(pgoffset - f_pgoffset) tells whether file page
 * `pgoffset` is currently mapped in this VMA */
bitmap *page_ismappedv; /* len ~ Δaddr / pagesize */
};
/*****************************
* API for clients *
*****************************/
/* open handle for a BigFile
*
* @fileh[out] BigFileH handle to initialize for this open
* @file
* @ram RAM that will back created fileh mappings
*
* @return 0 - ok, !0 - fail
*/
int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram);
/* close fileh
*
* it's an error to call fileh_close with existing mappings
*/
void fileh_close(BigFileH *fileh);
/* map fileh part into memory
*
* This "maps" fileh part [pgoffset, pglen) in pages into process address space.
*
* @vma[out] vma to initialize for this mmap
* @return 0 - ok, !0 - fail
*/
int fileh_mmap(VMA *vma, BigFileH *fileh, pgoff_t pgoffset, pgoff_t pglen);
/* unmap mapping created by fileh_mmap()
*
* This removes mapping created by fileh_mmap() from process address space.
* Changes made to fileh pages are preserved (e.g. for other mappings of the
* same fileh, or for a later commit/discard).
*/
void vma_unmap(VMA *vma);
/* what to do at writeout
 *
 * The values are distinct bits, so both actions can be requested in the same
 * fileh_dirty_writeout() call by OR-ing them, or in separate calls.
 */
enum WriteoutFlags {
/* store dirty pages back to file
*
* - call file.storeblk() for all dirty pages;
* - pages state remains PAGE_DIRTY.
*
* to "finish" the storage use WRITEOUT_MARKSTORED in the same or separate
* call.
*/
WRITEOUT_STORE = 1 << 0,
/* mark dirty pages as stored to file ok
*
* pages state becomes PAGE_LOADED and all mmaps are updated to map pages as
* R/O to track further writes.
*/
WRITEOUT_MARKSTORED = 1 << 1,
};
/* write changes made to fileh memory back to file
*
* Perform write-related actions according to flags (see WriteoutFlags).
*
* @return 0 - ok !0 - fail
* NOTE single WRITEOUT_MARKSTORED can not fail.
*
* No guarantee is made about atomicity - e.g. if this call fails, some
* pages could be written and some left in memory in dirty state.
*/
int fileh_dirty_writeout(BigFileH *fileh, enum WriteoutFlags flags);
/* discard changes made to fileh memory
*
* For each fileh dirty page:
*
* - it is unmapped from all mmaps;
* - its content is discarded;
* - its backing memory is released to OS.
*/
void fileh_dirty_discard(BigFileH *fileh);
/* pagefault handler
*
* serves read/write access to protected memory: loads data from file on demand
* and tracks which pages were made dirty.
*
* (clients call this indirectly via triggering SIGSEGV on read/write to memory)
*/
void vma_on_pagefault(VMA *vma, uintptr_t addr, int write);
int pagefault_init(void); /* in pagefault.c */
/* release some non-dirty ram back to OS; protect PROT_NONE related mappings
*
* This should be called when system is low on memory - it will scan through
* RAM pages and release some LRU non-dirty pages ram memory back to OS.
*
* (this is usually done automatically under memory pressure)
*
* @return how many RAM pages were reclaimed
* XXX int -> size_t ?
*/
int ram_reclaim(RAM *ram);
/************
* Internal *
************/
......@@ -69,8 +260,18 @@ void page_incref(Page *page);
void page_decref(Page *page);
/* lookup VMA by addr
 *
 * @return VMA covering addr, or NULL if addr is outside of all registered
 * areas (the pagefault handler then does not handle the fault).
 */
VMA *virt_lookup_vma(void *addr);
/* NOTE(review): presumably add/remove vma to/from the global list of
 * registered areas (see VMA.virt_list) - confirm against virtmem.c */
void virt_register_vma(VMA *vma);
void virt_unregister_vma(VMA *vma);
/* allocate virtual memory address space */
/* NOTE(review): the difference between the two variants is not visible from
 * this header; the x-prefix presumably means "does not return on failure" -
 * confirm against the implementation */
void *mem_valloc(void *addr, size_t len);
void *mem_xvalloc(void *addr, size_t len);
// XXX is this needed? think more
/* what happens on out-of-memory */
void OOM(void);
#endif
/* Demo program, that shows 2 memory pages can be combined into 1 bigger
* _contiguous_ memory area via shm / mmap. The idea is that this way we'll
* combine array pages into larger slice on client __getslice__ requests and
* the result would be usual contiguous ndarray while pages of it could live in
* different places in memory.
*
* Unfortunately there is no way to mmap-duplicate pages for MAP_ANONYMOUS, so
* the way it is done is via a file in tmpfs (on /dev/shm/ via posix shm):
*
* https://groups.google.com/forum/#!topic/comp.os.linux.development.system/Prx7ExCzsv4
*/
#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <assert.h>
/* TRACE(fmt, ...) - print printf-style formatted message plus a trailing
 * newline to stderr.
 *
 * NOTE `##__VA_ARGS__` is a GNU extension: it drops the preceding comma when
 * no variadic arguments are given, so TRACE("text") also works.
 */
#define TRACE(msg, ...) do { \
fprintf(stderr, msg, ##__VA_ARGS__); \
fprintf(stderr, "\n"); \
} while (0)
/* die - report failure and terminate
 *
 * Prints msg followed by the description of current errno to stderr (via
 * perror) and exits the process with status 1. Does not return.
 */
void die(const char *msg)
{
perror(msg);
exit(1);
}
/* Demo entry point.
 *
 * Creates a one-page POSIX shm segment, maps it twice as separate R/W views
 * (page1, page2), then reserves a 2-page contiguous PROT_NONE area (page12)
 * and maps the same shm page read-only into both halves of it with
 * MAP_FIXED. Stores through page1/page2 must be visible through page12.
 *
 * The final store into the read-only page12 is intentional and is expected
 * to kill the process with SIGSEGV; `return 0` is reached only if it does
 * not.
 */
int main(void)
{
    uint8_t *page1, *page2, *page12, *p;
    size_t len;
    long pagesize;
    int f, err;

    /* Sub-mappings below are placed with MAP_FIXED at page12 + k*len, so len
     * has to be the real system page size - a hard-coded 4096 would give
     * misaligned addresses on systems with larger pages. */
    pagesize = sysconf(_SC_PAGESIZE);
    if (pagesize <= 0)
        die("sysconf(_SC_PAGESIZE)");
    len = 1 * (size_t)pagesize;     /* XXX = 1 page */

    /* TODO - choose name a-la mktemp and loop changing it if EEXIST */
    f = shm_open("/array", O_RDWR | O_CREAT | O_EXCL,
                 S_IRUSR | S_IWUSR);
    if (f < 0)
        die("shm_open");

    /*
     * unlink right away, so that the file goes away as soon as only memory
     * mapping(s) are left. All mappings will be released upon program exit
     * and so the memory resources would be released too.
     *
     * A failure here is only reported - the demo can still proceed.
     */
    err = shm_unlink("/array");
    if (err)
        perror("shm_unlink");

    /* whole memory-segment size */
    err = ftruncate(f, len);
    if (err < 0)
        die("ftruncate");

    /* page1 - memory view onto array page[0] */
    page1 = mmap(/*addr=*/NULL, len,
                 PROT_READ | PROT_WRITE,
                 MAP_SHARED, // | MAP_HUGETLB | MAP_UNINITIALIZED ?
                 f, 0);
    if (page1 == MAP_FAILED)
        die("mmap page1");
    TRACE("mmap page1 ok");

    page1[0] = 1;
    TRACE("store page1 ok (%i)", page1[0]);

    /* page2 - memory view onto array page[0] (content should be identical to page1) */
    page2 = mmap(/*addr=*/NULL, len,
                 PROT_READ | PROT_WRITE,
                 MAP_SHARED, // | MAP_HUGETLB | MAP_UNINITIALIZED ?
                 f, 0);
    if (page2 == MAP_FAILED)
        die("mmap page2");
    TRACE("mmap page2 ok (%i)", page2[0]);
    assert(page2[0] == 1);

    /* alloc 2*page contiguous VMA */
    page12 = mmap(NULL, 2*len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page12 == MAP_FAILED)
        die("mmap page12");
    TRACE("stub page12 ok");

    /* page12[0] -> array.page[0] */
    p = mmap(&page12[0*len], len, PROT_READ, MAP_SHARED | MAP_FIXED, f, 0);
    if (p == MAP_FAILED || (p != &page12[0*len]))
        die("mmap page12.0");

    /* page12[1] -> array.page[0] */
    p = mmap(&page12[1*len], len, PROT_READ, MAP_SHARED | MAP_FIXED, f, 0);
    if (p == MAP_FAILED || (p != &page12[1*len]))
        die("mmap page12.1");

    /* both halves of page12 alias the same shm page */
    TRACE("page12 ok (%i %i)", page12[0], page12[len]);
    assert(page12[0] == 1);
    assert(page12[len] == 1);

    /* store via page1 is visible through both halves */
    page1[0] = 33;
    TRACE("page12 ok (%i %i)", page12[0], page12[len]);
    assert(page12[0] == 33);
    assert(page12[len] == 33);

    /* ... and so is store via page2 */
    page2[0] = 45;
    TRACE("page12 ok (%i %i)", page12[0], page12[len]);
    assert(page12[0] == 45);
    assert(page12[len] == 45);

    /* should segfault - we only requested PROT_READ */
    TRACE("will segfault...");
    page12[0] = 55;

    return 0;
}
#include "t_utils.h"
#include <wendelin/utils.h>
static const struct ram_ops ram_limited_ops;
static const struct ramh_ops ramh_limited_ops;
/* Create RAM that forwards to `backend` but allows at most `alloc_max` pages
 * to be allocated.
 *
 * @return new RAMLimited, or NULL if memory for it could not be allocated
 */
RAMLimited *ram_limited_new(RAM *backend, size_t alloc_max)
{
    RAMLimited *limited = zalloc(sizeof(*limited));

    if (!limited)
        return NULL;

    /* wrap backend, advertising the same page size */
    limited->backend  = backend;
    limited->pagesize = backend->pagesize;

    /* NOTE allocated pages will be linked here (instead of backend->lru_list)
     * automatically, as upper code thinks _we_ allocated the page */
    INIT_LIST_HEAD(&limited->lru_list);

    /* allocation budget: nothing allocated yet */
    limited->alloc_max = alloc_max;
    limited->nalloc    = 0;

    limited->ram_ops = &ram_limited_ops;

    return limited;
}
/* RAM handle of RAMLimited - forwards all operations to a backend handle.
 *
 * NOTE the unnamed `RAMH;` member embeds base RAMH fields directly (they are
 * accessed as ramh->ram, ramh->ramh_ops); embedding a tagged struct this way
 * relies on a compiler extension (e.g. GCC -fms-extensions /
 * -fplan9-extensions).
 */
struct RAMHLimited {
RAMH;
RAMH *backend; /* handle opened on RAMLimited.backend; does the real work */
};
typedef struct RAMHLimited RAMHLimited;
/* ram_ops.get_current_maxsize for RAMLimited: ask the backend RAM */
size_t ram_limited_get_current_maxsize(RAM *ram0)
{
    RAMLimited *limited = upcast(RAMLimited *, ram0);
    RAM *backend = limited->backend;

    return ram_get_current_maxsize(backend);
}
/* ram_ops.ramh_open for RAMLimited
 *
 * Allocates a RAMHLimited handle and opens a handle on the backend RAM for
 * it.
 *
 * @return new handle, or NULL if either allocation or backend open failed
 */
RAMH *ram_limited_ramh_open(RAM *ram0)
{
    RAMLimited  *limited = upcast(RAMLimited *, ram0);
    RAMHLimited *handle  = zalloc(sizeof(*handle));

    if (!handle)
        return NULL;

    handle->backend = ramh_open(limited->backend);
    if (!handle->backend) {
        /* backend refused - do not leak the wrapper */
        free(handle);
        return NULL;
    }

    handle->ram      = limited;
    handle->ramh_ops = &ramh_limited_ops;
    return handle;
}
/* ram_ops.close for RAMLimited
 *
 * Currently a no-op: neither the backend RAM is closed nor the RAMLimited
 * object itself is freed (see open questions below).
 */
void ram_limited_close(RAM *ram0)
{
//RAMLimited *ram = upcast(RAMLimited *, ram0);
// XXX close if owning?
// ram_close(ram->backend);
// TODO free(self) ?
}
/* RAM operations table installed into RAMLimited by ram_limited_new() */
static const struct ram_ops ram_limited_ops = {
.get_current_maxsize = ram_limited_get_current_maxsize,
.ramh_open = ram_limited_ramh_open,
.close = ram_limited_close,
};
/* ramh_ops.alloc_page for RAMHLimited
 *
 * Forwards allocation to the backend handle unless the owning RAMLimited
 * already has alloc_max pages allocated; successful allocations are counted
 * in ram->nalloc.
 *
 * @return page offset from the backend, or RAMH_PGOFF_ALLOCFAIL when the
 *         limit is reached or the backend failed
 */
pgoff_t ramh_limited_alloc_page(RAMH *ramh0, pgoff_t pgoffset_hint)
{
    RAMHLimited *handle  = upcast(RAMHLimited *, ramh0);
    RAMLimited  *limited = upcast(RAMLimited *, handle->ram);
    pgoff_t allocated = RAMH_PGOFF_ALLOCFAIL;

    /* only go to the backend while there is still budget left */
    if (limited->nalloc < limited->alloc_max) {
        RAMH *backend = handle->backend;

        allocated = backend->ramh_ops->alloc_page(backend, pgoffset_hint);
        if (allocated != RAMH_PGOFF_ALLOCFAIL)
            limited->nalloc++;
    }

    return allocated;
}
/* ramh_ops.drop_memory for RAMHLimited
 *
 * Lets the backend handle drop memory of the page and gives one page back to
 * the allocation budget of the owning RAMLimited.
 */
void ramh_limited_drop_memory(RAMH *ramh0, pgoff_t ramh_pgoffset)
{
    RAMHLimited *handle  = upcast(RAMHLimited *, ramh0);
    RAMLimited  *limited = upcast(RAMLimited *, handle->ram);
    RAMH        *backend = handle->backend;

    backend->ramh_ops->drop_memory(backend, ramh_pgoffset);
    limited->nalloc--;
}
/* ramh_ops.mmap_page for RAMHLimited: plain pass-through to the backend handle */
void *ramh_limited_mmap_page(RAMH *ramh0, pgoff_t ramh_pgoffset, void *addr, int prot)
{
    RAMHLimited *handle  = upcast(RAMHLimited *, ramh0);
    RAMH        *backend = handle->backend;

    return backend->ramh_ops->mmap_page(backend, ramh_pgoffset, addr, prot);
}
/* ramh_ops.close for RAMHLimited: closes the backend handle.
 *
 * The RAMHLimited object itself is not freed here.
 */
void ramh_limited_close(RAMH *ramh0)
{
    RAMHLimited *handle  = upcast(RAMHLimited *, ramh0);
    RAMH        *backend = handle->backend;

    backend->ramh_ops->close(backend);

    // TODO free(self) ?
}
/* RAMH operations table installed into handles by ram_limited_ramh_open() */
static const struct ramh_ops ramh_limited_ops = {
.alloc_page = ramh_limited_alloc_page,
.drop_memory = ramh_limited_drop_memory,
.mmap_page = ramh_limited_mmap_page,
.close = ramh_limited_close,
};
#ifndef _WENDELIN_TESTING_UTILS_H_
#define _WENDELIN_TESTING_UTILS_H_
/* Wendelin.bigfile | various testing utilities
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*/
#include <wendelin/bigfile/ram.h>
/* access to vma memory as byte[] and blk_t[]
 *
 * b(vma, idx) / B(vma, idx) yield an lvalue for element #idx of the memory
 * starting at vma->addr_start, viewed as uint8_t[] / blk_t[] respectively.
 * The access is volatile, so even a bare `b(vma, i);` statement really
 * touches the memory.
 *
 * `vma` is any pointer expression to an object with an `addr_start` member;
 * it is parenthesized so that arguments like `&vma_struct` expand correctly.
 */
#define b(vma, idx) ( ((volatile uint8_t *)(vma)->addr_start) [ idx ] )
#define B(vma, idx) ( ((volatile blk_t   *)(vma)->addr_start) [ idx ] )
/* RAM with limit on #allocated pages
*
* NOTE allocated pages are linked to ->lru_list and backend->lru_list will be empty.
*
* NOTE the unnamed `RAM;` member embeds base RAM fields directly (pagesize,
* lru_list, ram_ops, ...); embedding a tagged struct this way relies on a
* compiler extension (e.g. GCC -fms-extensions / -fplan9-extensions).
*/
struct RAMLimited {
RAM;
RAM *backend; /* RAM that actually provides the pages */
size_t alloc_max; /* max #pages allowed to be allocated */
size_t nalloc; /* #pages currently allocated */
};
typedef struct RAMLimited RAMLimited;
/* create RAM which allocates pages from backend, but at most alloc_max of them
 *
 * @return new RAMLimited, or NULL if it could not be allocated
 */
RAMLimited *ram_limited_new(RAM *backend, size_t alloc_max);
#endif
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!