Commit fae045cc authored by Kirill Smelkov's avatar Kirill Smelkov

bigfile/virtmem: Introduce "mmap overlay" mode

with the intention to later use WCFS through it.

Before this patch virtmem had only one mode: a BigFile backend was
providing loadblk and storeblk methods, and on every block access
loadblk was called to load block data into allocated RAM page.

However, with WCFS virtmem won't need to do anything to load data -
because loading from head/bigfile/f mmaped through OS will be handled by
OS directly. Thus for wcfs, that leaves virtmem only to handle dirtying
and writeout.

-> Introduce "mmap overlay" mode into virtmem to handle WCFS-like
BigFile backends - that can provide read-only base layer suitable for
mmapping.

This patch is organized as follows:

- fileh_open gains a flags argument to indicate which mode to use for
  the opened fileh. BigFileH gains a .mmap_overlay bitfield correspondingly.
  (virtmem.h)

- struct bigfile_ops is extended with 3 optional methods that a BigFile
  backend might provide to support mmap-overlay mode:

  * mmap_setup_read,
  * remmap_blk_read, and
  * munmap

  (see file.h changes for documentation of this new interface)

- if opened with MMAP_OVERLAY flag, virtmem is using those methods to
  organize VMA views backed by read-only base mmap layer and writeout
  for such VMAs (virtmem.c)

- a test is added to exercise MMAP_OVERLAY virtmem mode (test_virtmem.c)

- everything else, including bigfile.py, is switched to use
  DONT_MMAP_OVERLAY unconditionally for now.

In internal comments inside virtmem the new mode is interchangeably called
"mmap overlay" and "wcfs", even though wcfs is not yet hooked up to use
mmap-overlaying.

Some preliminary history:

fb6932a2    X Split PAGE_LOADED -> PAGE_LOADED, PAGE_LOADED_FOR_WRITE
4a20a573    X Settled on what should happen after writeout for wcfs case
f084ff9b    X Transition to all VMA under 1 fileh to be either all based on wcfs or all based on !wcfs
parent 10f7153a
/* Wendelin.bigfile | Python interface to memory/files /* Wendelin.bigfile | Python interface to memory/files
* Copyright (C) 2014-2020 Nexedi SA and Contributors. * Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com> * Kirill Smelkov <kirr@nexedi.com>
* *
* This program is free software: you can Use, Study, Modify and Redistribute * This program is free software: you can Use, Study, Modify and Redistribute
...@@ -981,7 +981,7 @@ pyfileh_open(PyObject *pyfile0, PyObject *args) ...@@ -981,7 +981,7 @@ pyfileh_open(PyObject *pyfile0, PyObject *args)
return NULL; return NULL;
Py_INCREF(pyfile); Py_INCREF(pyfile);
err = fileh_open(&pyfileh->fileh, &pyfile->file, ram); err = fileh_open(&pyfileh->fileh, &pyfile->file, ram, DONT_MMAP_OVERLAY);
if (err) { if (err) {
XPyErr_SetFromErrno(); XPyErr_SetFromErrno();
Py_DECREF(pyfile); Py_DECREF(pyfile);
......
/* Wendelin.bigfile | virtual memory benchmarks /* Wendelin.bigfile | virtual memory benchmarks
* Copyright (C) 2017-2019 Nexedi SA and Contributors. * Copyright (C) 2017-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com> * Kirill Smelkov <kirr@nexedi.com>
* *
* This program is free software: you can Use, Study, Modify and Redistribute * This program is free software: you can Use, Study, Modify and Redistribute
...@@ -80,7 +80,7 @@ void bench_pagefault() { ...@@ -80,7 +80,7 @@ void bench_pagefault() {
}; };
/* setup f mapping */ /* setup f mapping */
err = fileh_open(fh, &f, ram); err = fileh_open(fh, &f, ram, DONT_MMAP_OVERLAY);
ok1(!err); ok1(!err);
err = fileh_mmap(vma, fh, 0, npage); err = fileh_mmap(vma, fh, 0, npage);
......
...@@ -393,7 +393,7 @@ void test_file_access_synthetic(void) ...@@ -393,7 +393,7 @@ void test_file_access_synthetic(void)
.file_ops = &x_ops, .file_ops = &x_ops,
}; };
err = fileh_open(fh, &fileid, ram); err = fileh_open(fh, &fileid, ram, DONT_MMAP_OVERLAY);
ok1(!err); ok1(!err);
ok1(list_empty(&fh->mmaps)); ok1(list_empty(&fh->mmaps));
...@@ -955,7 +955,7 @@ void test_file_access_pagefault() ...@@ -955,7 +955,7 @@ void test_file_access_pagefault()
.file_ops = &fileid_ops, .file_ops = &fileid_ops,
}; };
err = fileh_open(fh, &fileid, ram); err = fileh_open(fh, &fileid, ram, DONT_MMAP_OVERLAY);
ok1(!err); ok1(!err);
/* implicitly use fileh=fh */ /* implicitly use fileh=fh */
...@@ -1083,7 +1083,7 @@ void test_pagefault_savestate() ...@@ -1083,7 +1083,7 @@ void test_pagefault_savestate()
.file_ops = &badfile_ops, .file_ops = &badfile_ops,
}; };
err = fileh_open(fh, &f, ram); err = fileh_open(fh, &f, ram, DONT_MMAP_OVERLAY);
ok1(!err); ok1(!err);
err = fileh_mmap(vma, fh, 0, 1); err = fileh_mmap(vma, fh, 0, 1);
...@@ -1109,6 +1109,544 @@ void test_pagefault_savestate() ...@@ -1109,6 +1109,544 @@ void test_pagefault_savestate()
free(ram); free(ram);
} }
/* ---------------------------------------- */
/* test access to file mappings with file having .mmap* instead of .loadblk
*
* "mmap overlay" is virtmem mode used with wcfs: RAM pages are used only for
* dirtied data and everything else comes as read-only mmap from wcfs file.
*/
/* BigFileMMap is a BigFile whose block data is read by mmapping a regular file.
 *
 * Loads are served through the mmap_* methods; stores, in contrast, are done
 * with regular file writes (see mmapfile_storeblk).
 *
 * The n* counters record how many times the corresponding bigfile_ops method
 * was invoked, so that tests can assert on them. */
typedef struct BigFileMMap {
    BigFile;            /* base BigFile, embedded anonymously as the first member */

    int fd;             /* fd of file to mmap */
    int nstoreblk;      /* number of times storeblk was called */
    int nremmapblk;     /* number of times remmap_blk_read was called */
    int nmunmap;        /* number of times munmap was called */
} BigFileMMap;
/* mmapfile_release closes the file descriptor that backs the BigFileMMap.
 *
 * A failing close() is reported via BUG_ON. */
void mmapfile_release(BigFile *file) {
    BigFileMMap *f = upcast(BigFileMMap*, file);
    int err = close(f->fd);

    BUG_ON(err);
}
/* mmapfile_storeblk writes one block of data into the backing file.
 *
 * f->blksize bytes are taken from buf and written at file offset blk*blksize.
 * Short writes are continued until the whole block has been written.
 * Each call increments f->nstoreblk.
 *
 * Returns 0 on success, and -1 if pwrite fails or makes no progress. */
int mmapfile_storeblk(BigFile *file, blk_t blk, const void *buf) {
    BigFileMMap *f = upcast(BigFileMMap*, file);
    const char *p  = buf;   /* byte pointer: arithmetic on void* is only a GNU extension */
    size_t n       = f->blksize;
    off_t at       = blk*f->blksize;

    f->nstoreblk++;

    while (n > 0) {
        ssize_t wrote = pwrite(f->fd, p, n, at);

        if (wrote == -1)
            return -1;
        /* a zero-length write never advances n - fail instead of spinning forever */
        if (wrote == 0)
            return -1;

        /* pwrite must not report anything negative besides -1, nor more than requested */
        BUG_ON(wrote < 0 || (size_t)wrote > n);

        n  -= wrote;
        p  += wrote;
        at += wrote;
    }

    return 0;
}
/* mmapfile_mmap_setup_read maps file[blk +blklen) read-only and records the
 * resulting address range in vma->addr_start / vma->addr_stop.
 *
 * No address hint is given to mmap (NULL is passed).
 *
 * Returns 0 on success, -1 if mmap fails (vma is left untouched then). */
int mmapfile_mmap_setup_read(VMA *vma, BigFile *file, blk_t blk, size_t blklen) {
    BigFileMMap *f = upcast(BigFileMMap*, file);
    size_t nbytes  = blklen*f->blksize;
    void *base     = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, f->fd, blk*f->blksize);

    if (base == MAP_FAILED)
        return -1;

    vma->addr_start = (uintptr_t)base;
    vma->addr_stop  = vma->addr_start + nbytes;
    return 0;
}
/* mmapfile_remmap_blk_read maps block blk of the backing file read-only at the
 * address that block occupies inside vma (MAP_FIXED).
 *
 * blk must lie inside the file range covered by vma.
 * Each call increments f->nremmapblk.
 *
 * Returns 0 on success, -1 if mmap fails. */
int mmapfile_remmap_blk_read(VMA *vma, BigFile *file, blk_t blk) {
    BigFileMMap *f = upcast(BigFileMMap*, file);

    /* only blksize == pagesize is supported */
    TODO (f->blksize != vma->fileh->ramh->ram->pagesize);
    ASSERT(vma->f_pgoffset <= blk && blk < vma_addr_fpgoffset(vma, vma->addr_stop));

    /* locate blk inside vma */
    pgoff_t blk_invma = blk - vma->f_pgoffset;
    uintptr_t addr    = vma->addr_start + blk_invma*f->blksize;

    f->nremmapblk++;

    void *mapped = mmap((void *)addr, 1*f->blksize, PROT_READ, MAP_SHARED | MAP_FIXED,
                        f->fd, blk*f->blksize);
    if (mapped == MAP_FAILED)
        return -1;

    ASSERT(mapped == (void *)addr);
    return 0;
}
/* mmapfile_munmap unmaps the whole [addr_start, addr_stop) range of vma.
 *
 * Each call increments f->nmunmap. Always returns 0. */
int mmapfile_munmap(VMA *vma, BigFile *file) {
    BigFileMMap *f = upcast(BigFileMMap*, file);
    size_t nbytes  = vma->addr_stop - vma->addr_start;

    f->nmunmap++;

    xmunmap((void *)vma->addr_start, nbytes);
    return 0;
}
/* bigfile_ops of BigFileMMap: there is no loadblk - the mmap_* methods are
 * provided instead; stores go through storeblk. */
static const struct bigfile_ops mmapfile_ops = {
    /* load side: read-only mmap of the backing file */
    .loadblk            = NULL,
    .mmap_setup_read    = mmapfile_mmap_setup_read,
    .remmap_blk_read    = mmapfile_remmap_blk_read,
    .munmap             = mmapfile_munmap,

    /* store side: regular file writes */
    .storeblk           = mmapfile_storeblk,

    /* teardown */
    .release            = mmapfile_release,
};
/* verify virtmem behaviour when it is given BigFile with .mmap_* to handle data load.
 *
 * Scenario, in order:
 *
 *   - a temporary file is created, blocks [100, 104) are filled with their
 *     own block number, and the file is opened in MMAP_OVERLAY mode;
 *   - reads are checked to leave no page in pagemap;
 *   - writes are checked to create PAGE_DIRTY pages that keep the rest of
 *     the block data;
 *   - dirty pages are checked to survive vma_unmap and to be mapped again by
 *     a following fileh_mmap, and to show up in every vma of the fileh;
 *   - fileh_dirty_discard is checked to drop the pages and call
 *     remmap_blk_read once per previously dirty page;
 *   - fileh_dirty_writeout is checked in STORE, MARKSTORED and
 *     STORE|MARKSTORED variants;
 *   - finally vma_unmap is checked to call the file's munmap exactly once.
 */
void test_file_access_mmapoverlay(void)
{
    RAM *ram;
    BigFileH fh_struct, *fh = &fh_struct;
    VMA vma_struct, *vma = &vma_struct;
    VMA vma2_struct, *vma2 = &vma2_struct;
    Page *page0, *page2, *page3;
    blk_t *b0, *b2;
    size_t PS, PSb;
    int fd, err;

    diag("Testing file access (mmap base)");

    // XXX save/restore sigaction ?
    ok1(!pagefault_init());

    ram = ram_new(NULL, NULL);
    ok1(ram);
    PS = ram->pagesize;
    PSb = PS / sizeof(blk_t);   /* page size in blk_t units */

    /* implicitly use ram=ram */
#define CHECK_MRU(...)  __CHECK_MRU(ram, __VA_ARGS__)

    /* ensure we are starting from new ram */
    CHECK_MRU(/*empty*/);

    /* setup mmaped file
     * (the path is unlinked right away; the file stays reachable through fd) */
    char path[] = "/tmp/bigfile_mmap.XXXXXX";
    fd = mkstemp(path);
    ok1(fd != -1);
    err = unlink(path);
    ok1(!err);

    BigFileMMap file = {
        .blksize    = ram->pagesize,    /* artificially blksize = pagesize */
        .file_ops   = &mmapfile_ops,
        .fd         = fd,
        .nstoreblk  = 0,
        .nremmapblk = 0,
        .nmunmap    = 0,
    };

    /* fstore stores data into file[blk]: every blk_t cell of the block is set to data */
    void fstore(blk_t blk, blk_t data) {
        blk_t *buf;
        int i;

        buf = malloc(file.blksize);
        BUG_ON(!buf);

        for (i=0; i < file.blksize/sizeof(*buf); i++)
            buf[i] = data;

        err = file.file_ops->storeblk(&file, blk, buf);
        BUG_ON(err);
        free(buf);
    }

    /* initialize file[100 +4) */
    fstore(100, 100);
    fstore(101, 101);
    fstore(102, 102);
    fstore(103, 103);

    err = fileh_open(fh, &file, ram, MMAP_OVERLAY);
    ok1(!err);

    /* implicitly use fileh=fh */
#define CHECK_PAGE(page, pgoffset, pgstate, pgrefcnt)   \
        __CHECK_PAGE(page, fh, pgoffset, pgstate, pgrefcnt)
#define CHECK_NOPAGE(pgoffset)  __CHECK_NOPAGE(fh, pgoffset)
#define CHECK_DIRTY(...)        __CHECK_DIRTY(fh, __VA_ARGS__)

    err = fileh_mmap(vma, fh, 100, 4);
    ok1(!err);

    ok1(fh->mmaps.next == &vma->same_fileh);
    ok1(vma->same_fileh.next == &fh->mmaps);

    /* all pages initially unmapped */
    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1(!M(vma, 2));    CHECK_NOPAGE(       102         );
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    CHECK_MRU   (/*empty*/);
    CHECK_DIRTY (/*empty*/);

    /* read page[0] - served from base mmap and no RAM page is loaded */
    ok1(B(vma, 0*PSb + 0) == 100);
    ok1(B(vma, 0*PSb + 1) == 100);
    ok1(B(vma, 0*PSb + PSb - 1) == 100);

    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1(!M(vma, 2));    CHECK_NOPAGE(       102         );
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    CHECK_MRU   (/*empty*/);
    CHECK_DIRTY (/*empty*/);

    /* write to page[2] - page2 is copy-on-write created in RAM */
    B(vma, 2*PSb) = 12;
    page2 = pagemap_get(&fh->pagemap, 102);

    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1( M(vma, 2));    CHECK_PAGE  (page2, 102, PAGE_DIRTY, 1);
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    ok1(B(vma, 2*PSb + 0) == 12);   /* set by write */
    ok1(B(vma, 2*PSb + 1) == 102);
    ok1(B(vma, 2*PSb + PSb - 1) == 102);

    CHECK_MRU   (page2);
    CHECK_DIRTY (page2);

    /* read page[3] - served from base mmap */
    ok1(B(vma, 3*PSb + 0) == 103);
    ok1(B(vma, 3*PSb + 1) == 103);
    ok1(B(vma, 3*PSb + PSb - 1) == 103);

    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1( M(vma, 2));    CHECK_PAGE  (page2, 102, PAGE_DIRTY, 1);
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    CHECK_MRU   (page2);
    CHECK_DIRTY (page2);

    /* write to page[0] - page COW'ed into RAM */
    B(vma, 0*PSb) = 10;
    page0 = pagemap_get(&fh->pagemap, 100);

    ok1( M(vma, 0));    CHECK_PAGE  (page0, 100, PAGE_DIRTY, 1);
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1( M(vma, 2));    CHECK_PAGE  (page2, 102, PAGE_DIRTY, 1);
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    ok1(B(vma, 0*PSb + 0) == 10);   /* set by write */
    ok1(B(vma, 0*PSb + 1) == 100);
    ok1(B(vma, 0*PSb + PSb - 1) == 100);

    CHECK_MRU   (page0, page2);
    CHECK_DIRTY (page0, page2);

    /* unmap vma - dirty pages should stay in fh->pagemap and memory should
     * not be forgotten */
    diag("vma_unmap");
    vma_unmap(vma);

    ok1(list_empty(&fh->mmaps));

    CHECK_PAGE  (page0, 100, PAGE_DIRTY, 0);
    CHECK_NOPAGE(       101         );
    CHECK_PAGE  (page2, 102, PAGE_DIRTY, 0);
    CHECK_NOPAGE(       103         );

    CHECK_MRU   (page0, page2);
    CHECK_DIRTY (page0, page2);

    /* look at the page memory directly, without going through any vma */
    b0 = page_mmap(page0, NULL, PROT_READ);     ok1(b0);
    b2 = page_mmap(page2, NULL, PROT_READ);     ok1(b2);

    ok1(b0[0] == 10);
    ok1(b0[1] == 100);
    ok1(b0[PSb - 1] == 100);

    ok1(b2[0] == 12);
    ok1(b2[1] == 102);
    ok1(b2[PSb - 1] == 102);

    xmunmap(b0, PS);
    xmunmap(b2, PS);

    /* map vma back - dirty pages should be there _and_ mapped to vma.
     * (this differs from !wcfs case which does not mmap dirty pages until access) */
    diag("vma mmap again");
    err = fileh_mmap(vma, fh, 100, 4);
    ok1(!err);

    ok1(fh->mmaps.next == &vma->same_fileh);
    ok1(vma->same_fileh.next == &fh->mmaps);

    ok1( M(vma, 0));    CHECK_PAGE  (page0, 100, PAGE_DIRTY, 1);
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1( M(vma, 2));    CHECK_PAGE  (page2, 102, PAGE_DIRTY, 1);
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    CHECK_MRU   (page0, page2);
    CHECK_DIRTY (page0, page2);

    /* dirtying a page in one mapping should automatically mmap the dirty page
     * in all other wcfs mappings */
    diag("dirty page in vma2 -> dirties vma1");
    err = fileh_mmap(vma2, fh, 100, 4);
    ok1(!err);

    ok1(fh->mmaps.next == &vma->same_fileh);
    ok1(vma->same_fileh.next == &vma2->same_fileh);
    ok1(vma2->same_fileh.next == &fh->mmaps);

    ok1( M(vma, 0));    CHECK_PAGE  (page0, 100, PAGE_DIRTY, 2);
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1( M(vma, 2));    CHECK_PAGE  (page2, 102, PAGE_DIRTY, 2);
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    ok1( M(vma2, 0));
    ok1(!M(vma2, 1));
    ok1( M(vma2, 2));
    ok1(!M(vma2, 3));

    CHECK_MRU   (page0, page2);
    CHECK_DIRTY (page0, page2);

    B(vma2, 3*PSb) = 13;    /* write to page[3] via vma2 */
    page3 = pagemap_get(&fh->pagemap, 103);

    ok1( M(vma, 0));    CHECK_PAGE  (page0, 100, PAGE_DIRTY, 2);
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1( M(vma, 2));    CHECK_PAGE  (page2, 102, PAGE_DIRTY, 2);
    ok1( M(vma, 3));    CHECK_PAGE  (page3, 103, PAGE_DIRTY, 2);

    ok1( M(vma2, 0));
    ok1(!M(vma2, 1));
    ok1( M(vma2, 2));
    ok1( M(vma2, 3));

    ok1(B(vma, 3*PSb + 0) == 13);   /* set by write */
    ok1(B(vma, 3*PSb + 1) == 103);
    ok1(B(vma, 3*PSb + PSb - 1) == 103);

    ok1(B(vma2, 3*PSb + 0) == 13);  /* set by write */
    ok1(B(vma2, 3*PSb + 1) == 103);
    ok1(B(vma2, 3*PSb + PSb - 1) == 103);

    CHECK_MRU   (page3, page0, page2);
    CHECK_DIRTY (page3, page0, page2);

    /* unmap vma2 */
    diag("unmap vma2");
    vma_unmap(vma2);

    ok1(fh->mmaps.next == &vma->same_fileh);
    ok1(vma->same_fileh.next == &fh->mmaps);

    ok1( M(vma, 0));    CHECK_PAGE  (page0, 100, PAGE_DIRTY, 1);
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1( M(vma, 2));    CHECK_PAGE  (page2, 102, PAGE_DIRTY, 1);
    ok1( M(vma, 3));    CHECK_PAGE  (page3, 103, PAGE_DIRTY, 1);

    CHECK_MRU   (page3, page0, page2);
    CHECK_DIRTY (page3, page0, page2);

    /* discard - changes should go away */
    diag("discard");
    ok1(file.nremmapblk == 0);
    fileh_dirty_discard(fh);
    ok1(file.nremmapblk == 3);  /* 3 previously dirty pages remmaped from base layer */

    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1(!M(vma, 2));    CHECK_NOPAGE(       102         );
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    CHECK_MRU   (/*empty*/);
    CHECK_DIRTY (/*empty*/);

    /* discarded pages should read from base layer again */
    ok1(B(vma, 0*PSb + 0) == 100);
    ok1(B(vma, 0*PSb + 1) == 100);
    ok1(B(vma, 0*PSb + PSb - 1) == 100);

    ok1(B(vma, 2*PSb + 0) == 102);
    ok1(B(vma, 2*PSb + 1) == 102);
    ok1(B(vma, 2*PSb + PSb - 1) == 102);

    ok1(B(vma, 3*PSb + 0) == 103);
    ok1(B(vma, 3*PSb + 1) == 103);
    ok1(B(vma, 3*PSb + PSb - 1) == 103);

    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1(!M(vma, 2));    CHECK_NOPAGE(       102         );
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    /* writeout in 3 variants - STORE, MARK, STORE+MARK */
    diag("writeout");

    /* mkdirty2 prepares state with 2 dirty pages only 1 of which is mapped:
     * blocks 100 and 102 are dirtied with gen+0 and gen+2, and then vma is
     * re-mmapped to cover only [100, 102). */
    void mkdirty2(int gen) {
        vma_unmap(vma);
        CHECK_NOPAGE(       100         );
        CHECK_NOPAGE(       101         );
        CHECK_NOPAGE(       102         );
        CHECK_NOPAGE(       103         );
        page0 = page2 = page3 = NULL;

        err = fileh_mmap(vma, fh, 100, 4);
        ok1(!err);

        B(vma, 2*PSb) = gen + 2;
        B(vma, 0*PSb) = gen + 0;
        vma_unmap(vma);

        page0 = pagemap_get(&fh->pagemap, 100); ok1(page0);
        page2 = pagemap_get(&fh->pagemap, 102); ok1(page2);

        err = fileh_mmap(vma, fh, 100, 2);  /* note - only 2 pages */
        ok1(!err);

        ok1( M(vma, 0));    CHECK_PAGE  (page0, 100, PAGE_DIRTY, 1);
        ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
                            CHECK_PAGE  (page2, 102, PAGE_DIRTY, 0);
                            CHECK_NOPAGE(       103         );

        CHECK_MRU   (page0, page2);
        CHECK_DIRTY (page0, page2);
    }

    diag("writeout (store)");
    file.nstoreblk  = 0;
    file.nremmapblk = 0;
    mkdirty2(10);
    ok1(!fileh_dirty_writeout(fh, WRITEOUT_STORE));
    ok1(file.nstoreblk  == 2);
    ok1(file.nremmapblk == 0);

    ok1( M(vma, 0));    CHECK_PAGE  (page0, 100, PAGE_DIRTY, 1);
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
                        CHECK_PAGE  (page2, 102, PAGE_DIRTY, 0);
                        CHECK_NOPAGE(       103         );

    CHECK_MRU   (page0, page2);
    CHECK_DIRTY (page2, page0);     /* note becomes sorted by f_pgoffset */

    ok1(B(vma, 0*PSb + 0) == 10);
    ok1(B(vma, 0*PSb + 1) == 100);
    ok1(B(vma, 0*PSb + PSb - 1) == 100);

    b0 = page_mmap(page0, NULL, PROT_READ);     ok1(b0);
    b2 = page_mmap(page2, NULL, PROT_READ);     ok1(b2);

    ok1(b0[0] == 10);
    ok1(b0[1] == 100);
    ok1(b0[PSb - 1] == 100);

    ok1(b2[0] == 12);
    ok1(b2[1] == 102);
    ok1(b2[PSb - 1] == 102);

    xmunmap(b0, PS);
    xmunmap(b2, PS);

    diag("writeout (mark)");
    file.nstoreblk  = 0;
    file.nremmapblk = 0;
    ok1(!fileh_dirty_writeout(fh, WRITEOUT_MARKSTORED));
    ok1(file.nstoreblk  == 0);
    ok1(file.nremmapblk == 1);  /* only 1 (not 2) page was mmaped */

    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
                        CHECK_NOPAGE(       102         );
                        CHECK_NOPAGE(       103         );

    CHECK_MRU   (/*empty*/);
    CHECK_DIRTY (/*empty*/);

    vma_unmap(vma);
    err = fileh_mmap(vma, fh, 100, 4);
    ok1(!err);  /* the reads below dereference vma - make sure it was really mapped */

    /* data saved; served from base layer */
    ok1(B(vma, 0*PSb + 0) == 10);
    ok1(B(vma, 0*PSb + 1) == 100);
    ok1(B(vma, 0*PSb + PSb - 1) == 100);

    ok1(B(vma, 2*PSb + 0) == 12);
    ok1(B(vma, 2*PSb + 1) == 102);
    ok1(B(vma, 2*PSb + PSb - 1) == 102);

    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1(!M(vma, 2));    CHECK_NOPAGE(       102         );
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    diag("writeout (store+mark)");
    mkdirty2(1000);
    file.nstoreblk  = 0;
    file.nremmapblk = 0;
    ok1(!fileh_dirty_writeout(fh, WRITEOUT_STORE | WRITEOUT_MARKSTORED));
    ok1(file.nstoreblk  == 2);
    ok1(file.nremmapblk == 1);  /* only 1 (not 2) page was mmaped */

    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
                        CHECK_NOPAGE(       102         );
                        CHECK_NOPAGE(       103         );

    CHECK_MRU   (/*empty*/);
    CHECK_DIRTY (/*empty*/);

    vma_unmap(vma);
    err = fileh_mmap(vma, fh, 100, 4);
    ok1(!err);  /* the reads below dereference vma - make sure it was really mapped */

    /* data saved; served from base layer */
    ok1(B(vma, 0*PSb + 0) == 1000);
    ok1(B(vma, 0*PSb + 1) == 100);
    ok1(B(vma, 0*PSb + PSb - 1) == 100);

    ok1(B(vma, 2*PSb + 0) == 1002);
    ok1(B(vma, 2*PSb + 1) == 102);
    ok1(B(vma, 2*PSb + PSb - 1) == 102);

    ok1(!M(vma, 0));    CHECK_NOPAGE(       100         );
    ok1(!M(vma, 1));    CHECK_NOPAGE(       101         );
    ok1(!M(vma, 2));    CHECK_NOPAGE(       102         );
    ok1(!M(vma, 3));    CHECK_NOPAGE(       103         );

    /* no invalidation - fileh_invalidate_page is forbidden for "mmap overlay" mode */

    /* free resources */
    file.nmunmap = 0;
    vma_unmap(vma);
    ok1(file.nmunmap == 1);

    fileh_close(fh);
    ram_close(ram);
    free(ram);

#undef  CHECK_MRU
#undef  CHECK_PAGE
#undef  CHECK_NOPAGE
#undef  CHECK_DIRTY
}
// TODO test for loadblk that returns -1 // TODO test for loadblk that returns -1
...@@ -1121,5 +1659,6 @@ int main() ...@@ -1121,5 +1659,6 @@ int main()
test_file_access_synthetic(); test_file_access_synthetic();
test_file_access_pagefault(); test_file_access_pagefault();
test_pagefault_savestate(); test_pagefault_savestate();
test_file_access_mmapoverlay();
return 0; return 0;
} }
/* Wendelin.bigfile | tests for real faults leading to crash /* Wendelin.bigfile | tests for real faults leading to crash
* Copyright (C) 2014-2019 Nexedi SA and Contributors. * Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com> * Kirill Smelkov <kirr@nexedi.com>
* *
* This program is free software: you can Use, Study, Modify and Redistribute * This program is free software: you can Use, Study, Modify and Redistribute
...@@ -109,7 +109,7 @@ void fault_in_loadblk() ...@@ -109,7 +109,7 @@ void fault_in_loadblk()
.file_ops = &faulty_ops, .file_ops = &faulty_ops,
}; };
err = fileh_open(&fh, &f, ram); err = fileh_open(&fh, &f, ram, DONT_MMAP_OVERLAY);
ok1(!err); ok1(!err);
err = fileh_mmap(vma, &fh, 0, 2); err = fileh_mmap(vma, &fh, 0, 2);
...@@ -164,7 +164,7 @@ void fault_in_storeblk() ...@@ -164,7 +164,7 @@ void fault_in_storeblk()
.file_ops = &faulty_ops, .file_ops = &faulty_ops,
}; };
err = fileh_open(&fh, &f, ram); err = fileh_open(&fh, &f, ram, DONT_MMAP_OVERLAY);
ok1(!err); ok1(!err);
err = fileh_mmap(vma, &fh, 0, 2); err = fileh_mmap(vma, &fh, 0, 2);
......
/* Wendelin.bigfile | Virtual memory /* Wendelin.bigfile | Virtual memory
* Copyright (C) 2014-2020 Nexedi SA and Contributors. * Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com> * Kirill Smelkov <kirr@nexedi.com>
* *
* This program is free software: you can Use, Study, Modify and Redistribute * This program is free software: you can Use, Study, Modify and Redistribute
...@@ -138,10 +138,25 @@ static void sigsegv_restore(const sigset_t *save_sigset) ...@@ -138,10 +138,25 @@ static void sigsegv_restore(const sigset_t *save_sigset)
* OPEN / CLOSE * * OPEN / CLOSE *
****************/ ****************/
int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram) int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram, FileHOpenFlags flags)
{ {
int err = 0; int err = 0;
sigset_t save_sigset; sigset_t save_sigset;
const bigfile_ops *fops = file->file_ops;
if (!(flags == 0 || flags == MMAP_OVERLAY || flags == DONT_MMAP_OVERLAY))
return -EINVAL;
if (flags == 0)
flags = fops->mmap_setup_read ? MMAP_OVERLAY : DONT_MMAP_OVERLAY;
if (flags & MMAP_OVERLAY && flags & DONT_MMAP_OVERLAY)
return -EINVAL;
if (flags == MMAP_OVERLAY) {
ASSERT(fops->mmap_setup_read);
ASSERT(fops->remmap_blk_read);
ASSERT(fops->munmap);
}
if (flags == DONT_MMAP_OVERLAY)
ASSERT(fops->loadblk);
sigsegv_block(&save_sigset); sigsegv_block(&save_sigset);
virt_lock(); virt_lock();
...@@ -159,6 +174,8 @@ int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram) ...@@ -159,6 +174,8 @@ int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram)
fileh->writeout_inprogress = 0; fileh->writeout_inprogress = 0;
pagemap_init(&fileh->pagemap, ilog2_exact(ram->pagesize)); pagemap_init(&fileh->pagemap, ilog2_exact(ram->pagesize));
fileh->mmap_overlay = (flags == MMAP_OVERLAY);
out: out:
virt_unlock(); virt_unlock();
sigsegv_restore(&save_sigset); sigsegv_restore(&save_sigset);
...@@ -212,8 +229,9 @@ void fileh_close(BigFileH *fileh) ...@@ -212,8 +229,9 @@ void fileh_close(BigFileH *fileh)
int fileh_mmap(VMA *vma, BigFileH *fileh, pgoff_t pgoffset, pgoff_t pglen) int fileh_mmap(VMA *vma, BigFileH *fileh, pgoff_t pgoffset, pgoff_t pglen)
{ {
void *addr;
size_t len = pglen * fileh->ramh->ram->pagesize; size_t len = pglen * fileh->ramh->ram->pagesize;
BigFile *file = fileh->file;
const bigfile_ops *fops = file->file_ops;
int err = 0; int err = 0;
sigset_t save_sigset; sigset_t save_sigset;
...@@ -230,15 +248,40 @@ int fileh_mmap(VMA *vma, BigFileH *fileh, pgoff_t pgoffset, pgoff_t pglen) ...@@ -230,15 +248,40 @@ int fileh_mmap(VMA *vma, BigFileH *fileh, pgoff_t pgoffset, pgoff_t pglen)
if (!vma->page_ismappedv) if (!vma->page_ismappedv)
goto fail; goto fail;
/* allocate address space somewhere */ if (fileh->mmap_overlay) {
addr = mem_valloc(NULL, len); /* wcfs: mmap(base, READ)
* vma->addr_{start,stop} are initialized by mmap_setup_read */
TODO (file->blksize != fileh->ramh->ram->pagesize);
err = fops->mmap_setup_read(vma, file, pgoffset, pglen);
if (err)
goto fail;
} else {
/* !wcfs: allocate address space somewhere */
void *addr = mem_valloc(NULL, len);
if (!addr) if (!addr)
goto fail; goto fail;
/* vma address range known */
/* everything allocated - link it up */
vma->addr_start = (uintptr_t)addr; vma->addr_start = (uintptr_t)addr;
vma->addr_stop = vma->addr_start + len; vma->addr_stop = vma->addr_start + len;
}
/* wcfs: mmap(fileh->dirty_pages) over base */
if (fileh->mmap_overlay) {
Page* page;
struct list_head *hpage;
list_for_each(hpage, &fileh->dirty_pages) {
page = list_entry(hpage, typeof(*page), in_dirty);
BUG_ON(page->state != PAGE_DIRTY);
if (!vma_page_infilerange(vma, page))
continue; /* page is out of requested mmap coverage */
vma_mmap_page(vma, page);
}
}
/* everything allocated - link it up */
// XXX need to init vma->virt_list first? // XXX need to init vma->virt_list first?
/* hook vma to fileh->mmaps */ /* hook vma to fileh->mmaps */
...@@ -282,7 +325,12 @@ void vma_unmap(VMA *vma) ...@@ -282,7 +325,12 @@ void vma_unmap(VMA *vma)
/* unmap whole vma at once - the kernel unmaps each mapping in turn. /* unmap whole vma at once - the kernel unmaps each mapping in turn.
* NOTE error here would mean something is broken */ * NOTE error here would mean something is broken */
if (fileh->mmap_overlay) {
int err = fileh->file->file_ops->munmap(vma, fileh->file);
BUG_ON(err);
} else {
xmunmap((void *)vma->addr_start, len); xmunmap((void *)vma->addr_start, len);
}
/* scan through mapped-to-this-vma pages and release them */ /* scan through mapped-to-this-vma pages and release them */
for (i=0; i < pglen; ++i) { for (i=0; i < pglen; ++i) {
...@@ -384,16 +432,48 @@ int fileh_dirty_writeout(BigFileH *fileh, enum WriteoutFlags flags) ...@@ -384,16 +432,48 @@ int fileh_dirty_writeout(BigFileH *fileh, enum WriteoutFlags flags)
goto out; goto out;
} }
/* page.state -> PAGE_LOADED and correct mappings RW -> R */ /* wcfs: remmap RW pages to base layer
* !wcfs: page.state -> PAGE_LOADED and correct mappings RW -> R
*
* NOTE for transactional storage (ZODB and ZBigFile) storeblk creates
* new transaction on database side, but does not update current DB
* connection to view that transaction. Thus if loadblk will be loaded
* with not-yet-resynced DB connection, it will return old - not stored
* - data. For !wcfs case this is partly mitigated by the fact that
* stored pages are kept as PAGE_LOADED in ram, but it cannot be
* relied as ram_reclaim can drop those pages and read access to them
* will trigger loadblk from database which will return old data.
* For wcfs case remapping to base layer will always return old data
* until wcfs mapping is updated to view database at newer state.
*
* In general it is a bug to access data pages in between transactions,
* so we accept this corner-case difference between wcfs and !wcfs.
*/
if (flags & WRITEOUT_MARKSTORED) { if (flags & WRITEOUT_MARKSTORED) {
page->state = PAGE_LOADED; page->state = PAGE_LOADED;
list_del_init(&page->in_dirty); list_del_init(&page->in_dirty);
list_for_each(hmmap, &fileh->mmaps) { list_for_each(hmmap, &fileh->mmaps) {
VMA *vma = list_entry(hmmap, typeof(*vma), same_fileh); VMA *vma = list_entry(hmmap, typeof(*vma), same_fileh);
if (fileh->mmap_overlay) {
/* wcfs: RW -> base layer */
vma_page_ensure_unmapped(vma, page);
} else {
/* !wcfs: RW -> R*/
vma_page_ensure_notmappedrw(vma, page); vma_page_ensure_notmappedrw(vma, page);
} }
} }
/* wcfs: all vmas are using base layer now - drop page completely
* without unnecessarily growing RSS and relying on reclaim.
* !wcfs: keep the page in RAM cache, even if it is not mapped anywhere */
if (fileh->mmap_overlay) {
ASSERT(page->refcnt == 0);
pagemap_del(&fileh->pagemap, page->f_pgoffset);
page_drop_memory(page);
page_del(page);
}
}
} }
...@@ -428,6 +508,11 @@ void fileh_dirty_discard(BigFileH *fileh) ...@@ -428,6 +508,11 @@ void fileh_dirty_discard(BigFileH *fileh)
BUG_ON(page->state != PAGE_DIRTY); BUG_ON(page->state != PAGE_DIRTY);
page_drop_memory(page); page_drop_memory(page);
// TODO consider doing pagemap_del + page_del unconditionally
if (fileh->mmap_overlay) {
pagemap_del(&fileh->pagemap, page->f_pgoffset);
page_del(page);
}
} }
BUG_ON(!list_empty(&fileh->dirty_pages)); BUG_ON(!list_empty(&fileh->dirty_pages));
...@@ -452,6 +537,15 @@ void fileh_invalidate_page(BigFileH *fileh, pgoff_t pgoffset) ...@@ -452,6 +537,15 @@ void fileh_invalidate_page(BigFileH *fileh, pgoff_t pgoffset)
/* it's an error to invalidate fileh while writeout is in progress */ /* it's an error to invalidate fileh while writeout is in progress */
BUG_ON(fileh->writeout_inprogress); BUG_ON(fileh->writeout_inprogress);
/* wcfs: even though the operation to invalidate a page is well defined (it
* is subset of discard), we forbid it since wcfs handles invalidations
* from ZODB by itself inside wcfs server.
*
* It was kind of mistake to expose in 92bfd03e (bigfile: ZODB -> BigFileH
* invalidate propagation) fileh_invalidate_page as public API, since such
* invalidation should be handled by a BigFile instance internally. */
BUG_ON(fileh->mmap_overlay);
page = pagemap_get(&fileh->pagemap, pgoffset); page = pagemap_get(&fileh->pagemap, pgoffset);
if (page) { if (page) {
/* for pages where loading is in progress, we just remove the page from /* for pages where loading is in progress, we just remove the page from
...@@ -588,6 +682,7 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write) ...@@ -588,6 +682,7 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
pgoff_t pagen; pgoff_t pagen;
Page *page; Page *page;
BigFileH *fileh; BigFileH *fileh;
struct list_head *hmmap;
/* continuing on_pagefault() - see (1) there ... */ /* continuing on_pagefault() - see (1) there ... */
...@@ -595,9 +690,52 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write) ...@@ -595,9 +690,52 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
fileh = vma->fileh; fileh = vma->fileh;
pagen = vma_addr_fpgoffset(vma, addr); pagen = vma_addr_fpgoffset(vma, addr);
/* wcfs: we should get into SIGSEGV handler only on write access */
if (fileh->mmap_overlay)
BUG_ON(!write);
/* (3) fileh, pagen -> page (via pagemap) */ /* (3) fileh, pagen -> page (via pagemap) */
page = pagemap_get(&fileh->pagemap, pagen); page = pagemap_get(&fileh->pagemap, pagen);
/* wcfs: all dirty pages are mmapped when vma is created.
* thus here, normally, if page is present in pagemap, it can be only either
* - a page we just loaded for dirtying, or
* - a page that is in progress of being loaded.
*
* however it can be also a *dirty* page due to simultaneous write
* access from 2 threads:
*
* T1 T2
*
* write pagefault write pagefault
* virt_lock
* page.state = PAGE_LOADING
* virt_unlock
* # start loading the page
* ...
* # loading completed
* virt_lock
* page.state = PAGE_LOADED_FOR_WRITE
* virt_unlock
* return VM_RETRY
* virt_lock
* # sees page.state = PAGE_LOADED_FOR_WRITE
* page.state = PAGE_DIRTY
* virt_unlock
*
* # retrying
* virt_lock
* # sees page.state = PAGE_DIRTY <--
*
*
* ( PAGE_LOADED_FOR_WRITE is used only to verify that in wcfs mode we
* always keep all dirty pages mmapped on fileh_open and so pagefault
* handler must not see a PAGE_LOADED page. )
*/
if (fileh->mmap_overlay && page)
ASSERT(page->state == PAGE_LOADED_FOR_WRITE || page->state == PAGE_LOADING ||
page->state == PAGE_DIRTY);
/* (4) no page found - allocate new from ram */ /* (4) no page found - allocate new from ram */
while (!page) { while (!page) {
page = ramh_alloc_page(fileh->ramh, pagen); page = ramh_alloc_page(fileh->ramh, pagen);
...@@ -666,18 +804,26 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write) ...@@ -666,18 +804,26 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
pageram = page_mmap(page, NULL, PROT_READ | PROT_WRITE); pageram = page_mmap(page, NULL, PROT_READ | PROT_WRITE);
TODO(!pageram); // XXX err TODO(!pageram); // XXX err
/* loadblk() -> pageram memory */ /* load block -> pageram memory */
blk = page->f_pgoffset; // NOTE because blksize = pagesize blk = page->f_pgoffset; // NOTE because blksize = pagesize
/* mark page as loading and unlock virtmem before calling loadblk() /* mark page as loading and unlock virtmem before doing actual load via
* loadblk() or wcfs.
* *
* that call is potentially slow and external code can take other * both calls are potentially slow and external code can take other
* locks. If that "other locks" are also taken before external code * locks. If that "other locks" are also taken before external code
* calls e.g. fileh_invalidate_page() in different codepath a deadlock * calls e.g. fileh_invalidate_page() in different codepath a deadlock
* can happen. (similar to storeblk case) */ * can happen. (similar to storeblk case) */
page->state = PAGE_LOADING; page->state = PAGE_LOADING;
virt_unlock(); virt_unlock();
if (fileh->mmap_overlay) {
/* wcfs: copy block data from read-only base mmap.
* NOTE we'll get SIGBUS here if wcfs returns EIO when loading block data */
memcpy(pageram, vma_page_addr(vma, page), page_size(page));
}
else {
/* !wcfs: call loadblk */
err = file->file_ops->loadblk(file, blk, pageram); err = file->file_ops->loadblk(file, blk, pageram);
/* TODO on error -> try to throw exception somehow to the caller, so /* TODO on error -> try to throw exception somehow to the caller, so
...@@ -687,6 +833,7 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write) ...@@ -687,6 +833,7 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
* kernel sends SIGBUS * kernel sends SIGBUS
*/ */
TODO (err); TODO (err);
}
/* relock virtmem */ /* relock virtmem */
virt_lock(); virt_lock();
...@@ -703,7 +850,7 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write) ...@@ -703,7 +850,7 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
/* else just mark the page as loaded ok */ /* else just mark the page as loaded ok */
else else
page->state = PAGE_LOADED; page->state = (write ? PAGE_LOADED_FOR_WRITE : PAGE_LOADED);
/* we have to retry the whole fault, because the vma could have been /* we have to retry the whole fault, because the vma could have been
* changed while we were loading page with virtmem lock released */ * changed while we were loading page with virtmem lock released */
...@@ -736,7 +883,7 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write) ...@@ -736,7 +883,7 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
/* (6) page data ready. Mmap it atomically into vma address space, or mprotect /* (6) page data ready. Mmap it atomically into vma address space, or mprotect
* appropriately if it was already mmaped. */ * appropriately if it was already mmaped. */
PageState newstate = PAGE_LOADED; PageState newstate = PAGE_LOADED;
if (write || page->state == PAGE_DIRTY) { if (write || page->state == PAGE_DIRTY || page->state == PAGE_LOADED_FOR_WRITE) {
newstate = PAGE_DIRTY; newstate = PAGE_DIRTY;
} }
...@@ -750,6 +897,19 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write) ...@@ -750,6 +897,19 @@ VMFaultResult vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
page->state = max(page->state, newstate); page->state = max(page->state, newstate);
vma_mmap_page(vma, page); vma_mmap_page(vma, page);
/* wcfs: also mmap the page to all wcfs-backed vmas. If we don't, the
* memory on those vmas will read with stale data */
if (fileh->mmap_overlay) {
list_for_each(hmmap, &fileh->mmaps) {
VMA *vma2 = list_entry(hmmap, typeof(*vma2), same_fileh);
if (vma2 == vma)
continue;
if (!vma_page_infilerange(vma2, page))
continue; /* page is out of vma2 file-range coverage */
vma_mmap_page(vma2, page);
}
}
/* mark page as used recently */ /* mark page as used recently */
// XXX = list_move_tail() // XXX = list_move_tail()
...@@ -785,8 +945,16 @@ static int __ram_reclaim(RAM *ram) ...@@ -785,8 +945,16 @@ static int __ram_reclaim(RAM *ram)
scanned++; scanned++;
/* can release ram only from loaded non-dirty pages /* can release ram only from loaded non-dirty pages
* NOTE PAGE_LOADING pages are not dropped - they just continue to load */ * NOTE PAGE_LOADING pages are not dropped - they just continue to load
if (page->state == PAGE_LOADED) { *
* NOTE PAGE_LOADED_FOR_WRITE are dropped too - even if normally they
* are going to be dirtied in a moment, due to VM_RETRY logic and so
* VMA might be changing simultaneously to pagefault handling, a
* page might remain in pagemap in PAGE_LOADED_FOR_WRITE state
* indefinitely unused and without actually being dirtied.
*
* TODO drop PAGE_LOADED_FOR_WRITE only after all PAGE_LOADED have been reclaimed. */
if (page->state == PAGE_LOADED || page->state == PAGE_LOADED_FOR_WRITE) {
page_drop_memory(page); page_drop_memory(page);
batch--; batch--;
} }
...@@ -934,6 +1102,7 @@ static void vma_mmap_page(VMA *vma, Page *page) { ...@@ -934,6 +1102,7 @@ static void vma_mmap_page(VMA *vma, Page *page) {
pgoff_t pgoff_invma; pgoff_t pgoff_invma;
int prot = (page->state == PAGE_DIRTY ? PROT_READ|PROT_WRITE : PROT_READ); int prot = (page->state == PAGE_DIRTY ? PROT_READ|PROT_WRITE : PROT_READ);
// NOTE: PAGE_LOADED_FOR_WRITE not passed here
ASSERT(page->state == PAGE_LOADED || page->state == PAGE_DIRTY); ASSERT(page->state == PAGE_LOADED || page->state == PAGE_DIRTY);
ASSERT(vma->f_pgoffset <= page->f_pgoffset && ASSERT(vma->f_pgoffset <= page->f_pgoffset &&
page->f_pgoffset < vma_addr_fpgoffset(vma, vma->addr_stop)); page->f_pgoffset < vma_addr_fpgoffset(vma, vma->addr_stop));
...@@ -980,8 +1149,19 @@ static void vma_page_ensure_unmapped(VMA *vma, Page *page) ...@@ -980,8 +1149,19 @@ static void vma_page_ensure_unmapped(VMA *vma, Page *page)
if (!vma_page_ismapped(vma, page)) if (!vma_page_ismapped(vma, page))
return; return;
/* mmap empty PROT_NONE address space instead of page memory */ if (vma->fileh->mmap_overlay) {
/* wcfs: remmap readonly to base image */
BigFile *file = vma->fileh->file;
int err;
TODO (file->blksize != page_size(page));
err = file->file_ops->remmap_blk_read(vma, file, /* blk = */page->f_pgoffset);
BUG_ON(err); /* must not fail */
}
else {
/* !wcfs: mmap empty PROT_NONE address space instead of page memory */
mem_xvalloc(vma_page_addr(vma, page), page_size(page)); mem_xvalloc(vma_page_addr(vma, page), page_size(page));
}
bitmap_clear_bit(vma->page_ismappedv, page->f_pgoffset - vma->f_pgoffset); bitmap_clear_bit(vma->page_ismappedv, page->f_pgoffset - vma->f_pgoffset);
page_decref(page); page_decref(page);
...@@ -1004,6 +1184,21 @@ static void vma_page_ensure_notmappedrw(VMA *vma, Page *page) ...@@ -1004,6 +1184,21 @@ static void vma_page_ensure_notmappedrw(VMA *vma, Page *page)
xmprotect(vma_page_addr(vma, page), page_size(page), PROT_READ); xmprotect(vma_page_addr(vma, page), page_size(page), PROT_READ);
} }
/* __fileh_page_isdirty reports whether the page of fileh at pgoffset is dirty.
 *
 * The page is looked up in fileh's pagemap; if the lookup yields no page,
 * the answer is "not dirty". Otherwise the answer is whether the page is
 * in PAGE_DIRTY state.
 *
 * must be called under virtmem lock.
 */
bool __fileh_page_isdirty(BigFileH *fileh, pgoff_t pgoffset)
{
    Page *entry = pagemap_get(&fileh->pagemap, pgoffset);

    /* no page in pagemap -> nothing that could be dirty */
    return (entry && entry->state == PAGE_DIRTY);
}
// XXX stub // XXX stub
void OOM(void) void OOM(void)
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define _WENDELIN_BIGFILE_FILE_H_ #define _WENDELIN_BIGFILE_FILE_H_
/* Wendelin.bigfile | Base file class /* Wendelin.bigfile | Base file class
* Copyright (C) 2014-2020 Nexedi SA and Contributors. * Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com> * Kirill Smelkov <kirr@nexedi.com>
* *
* This program is free software: you can Use, Study, Modify and Redistribute * This program is free software: you can Use, Study, Modify and Redistribute
...@@ -26,7 +26,8 @@ ...@@ -26,7 +26,8 @@
* particular BigFile implementations must provide. * particular BigFile implementations must provide.
* *
* The interfaces are described in `struct bigfile_ops`. * The interfaces are described in `struct bigfile_ops`.
* A particular BigFile implementation must provide loadblk/storeblk methods. * A particular BigFile implementation must provide loadblk/storeblk and
* optionally mmap_* methods.
* *
* Clients work with bigfiles via mapping files to memory - see * Clients work with bigfiles via mapping files to memory - see
* wendelin/bigfile/virtmem.h and BigFileH for client-level API details. * wendelin/bigfile/virtmem.h and BigFileH for client-level API details.
...@@ -39,11 +40,13 @@ ...@@ -39,11 +40,13 @@
extern "C" { extern "C" {
#endif #endif
typedef struct VMA VMA;
/* BigFile base class /* BigFile base class
* *
* BigFile is a file of fixed size blocks. It knows how to load/store blocks * BigFile is a file of fixed size blocks. It knows how to load/store blocks
* to/from memory. Nothing else. * to/from memory. It can be also optionally mmaped into memory.
* *
* Concrete file implementations subclass BigFile and define their file_ops. * Concrete file implementations subclass BigFile and define their file_ops.
*/ */
...@@ -79,6 +82,91 @@ struct bigfile_ops { ...@@ -79,6 +82,91 @@ struct bigfile_ops {
* The file is not otherwise used at the time of and past release call. * The file is not otherwise used at the time of and past release call.
*/ */
void (*release) (BigFile *file); void (*release) (BigFile *file);
/* Mmap overlaying
*
* Besides .loadblk and .storeblk a particular BigFile implementation can
* also optionally provide functions to setup read-only memory mappings
* with BigFile data. If such functions are provided, virtmem might use
* them to organize read access to BigFile data through the mappings and
* without allocating RAM for read pages. RAM will still be allocated for
* dirtied pages that are laid over base data layer provided by the
* mappings.
*
* The primary user of this functionality will be wcfs - virtual filesystem that
* provides access to ZBigFile data via OS-level files(*). The layering can
* be schematically depicted as follows
*
* ┌──┐ ┌──┐
* │RW│ │RW│ ← dirty pages
* └──┘ └──┘
* +
* ───────────────────────────────────────────── ← mmap'ed base data
*
* The functions to setup memory mappings are:
*
* - mmap_setup_read(vma, file[blk +blklen)) setup initial read-only mmap to serve vma
* - remmap_blk_read(vma, file[blk]) remmap blk into vma again, after e.g.
* RW dirty page was discarded
* - munmap(vma) before VMA is unmapped
*
*
* (*) see wcfs/client/wcfs.h and wcfs/wcfs.go
*/
/* mmap_setup_read is called to setup new read-only mapping of file[blk +blklen).
*
* The mapping will be used as the base read-only layer for vma.
*
* After setup bigfile backend manages the mapping and can change it dynamically
* e.g. due to changes to the file from outside. However before changing a page,
* the backend must check if that page was already dirtied by virtmem and if
* so don't change that page until virtmem calls .remmap_blk_read.
*
* The checking has to be done with virtmem lock held. A sketch of mapping
* update sequence is as below:
*
* // backend detects that block is changed from outside
* // fileh is vma->fileh - file handle with which the vma is associated
* virt_lock()
* for (pgoff : page_offsets_covered_by(blk))
* if (!__fileh_page_isdirty(fileh, pgoff)) {
* // update mappings for all fileh's vma that cover pgoff
* }
* virt_unlock()
*
* mmap_setup_read must set vma.addr_start and vma.addr_stop according to
* created memory mapping.
*
* mmap_setup_read can use vma.mmap_overlay_server to associate vma with
* object pointer specific to serving created mapping.
*
* Called under virtmem lock. TODO easy to rework to call with !virt_lock
*
* NOTE blk and blklen are in blocks, not pages.
*
* @addr NULL - mmap at anywhere, !NULL - mmap exactly at addr.
* @return 0 - ok !0 - fail
*/
int (*mmap_setup_read) (VMA *vma, BigFile *file, blk_t blk, size_t blklen);
/* remmap_blk_read is called to remmap a block into vma again, after e.g.
* RW dirty page was discarded.
*
* Called under virtmem lock. XXX hard to rework to call with !virt_lock
* Virtmem considers remmap_blk_read failure as fatal.
*/
int (*remmap_blk_read) (VMA *vma, BigFile *file, blk_t blk);
/* munmap is called when vma set up via mmap_setup_read is going to be unmapped.
*
* Called under virtmem lock. TODO easy to rework to call with !virt_lock
* Virtmem considers munmap failure as fatal.
*/
int (*munmap) (VMA *vma, BigFile *file);
}; };
typedef struct bigfile_ops bigfile_ops; typedef struct bigfile_ops bigfile_ops;
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define _WENDELIN_BIGFILE_VIRTMEM_H_ #define _WENDELIN_BIGFILE_VIRTMEM_H_
/* Wendelin.bigfile | Virtual memory /* Wendelin.bigfile | Virtual memory
* Copyright (C) 2014-2019 Nexedi SA and Contributors. * Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com> * Kirill Smelkov <kirr@nexedi.com>
* *
* This program is free software: you can Use, Study, Modify and Redistribute * This program is free software: you can Use, Study, Modify and Redistribute
...@@ -29,9 +29,24 @@ ...@@ -29,9 +29,24 @@
* Read access to mapped pages cause their on-demand loading, and write access * Read access to mapped pages cause their on-demand loading, and write access
* marks modified pages as dirty. Dirty pages then can be on request either * marks modified pages as dirty. Dirty pages then can be on request either
* written out back to file or discarded. * written out back to file or discarded.
*
*
* Mmap overlaying
*
* A particular BigFile implementation can optionally provide functionality to
* mmap its data into memory. For BigFile handles opened in such mode, virtmem
* does not allocate RAM for read access and will only allocate RAM when pages
* are dirtied. The mode in which BigFile handle is opened is specified via
* fileh_open(flags=...).
*
* The primary user of "mmap overlay" functionality will be wcfs - virtual
* filesystem that provides access to ZBigFile data via OS-level files(*).
*
* (*) see wcfs/client/wcfs.h and wcfs/wcfs.go
*/ */
#include <stdint.h> #include <stdint.h>
#include <stdbool.h>
#include <wendelin/list.h> #include <wendelin/list.h>
#include <wendelin/bigfile/types.h> #include <wendelin/bigfile/types.h>
#include <wendelin/bigfile/pagemap.h> #include <wendelin/bigfile/pagemap.h>
...@@ -77,6 +92,13 @@ struct BigFileH { ...@@ -77,6 +92,13 @@ struct BigFileH {
/* whether writeout is currently in progress */ /* whether writeout is currently in progress */
int writeout_inprogress; int writeout_inprogress;
/* whether base data for all VMAs of this fileh are taken as base-layer mmap
*
* ( we require all VMAs under one fileh to be of the same kind to easily
* make decision whether after writeout to keep a page in RAM or to
* completely drop it not to waste RSS unnecessarily ) */
unsigned mmap_overlay : 1;
}; };
typedef struct BigFileH BigFileH; typedef struct BigFileH BigFileH;
...@@ -89,7 +111,9 @@ enum PageState { ...@@ -89,7 +111,9 @@ enum PageState {
= 2, /* file content loading was in progress = 2, /* file content loading was in progress
while request to invalidate the page came in */ while request to invalidate the page came in */
PAGE_LOADED = 3, /* file content has been loaded and was not modified */ PAGE_LOADED = 3, /* file content has been loaded and was not modified */
PAGE_DIRTY = 4, /* file content has been loaded and was modified */ PAGE_LOADED_FOR_WRITE
= 4, /* file content has been loaded and is going to be modified */
PAGE_DIRTY = 5, /* file content has been loaded and was modified */
}; };
typedef enum PageState PageState; typedef enum PageState PageState;
...@@ -142,6 +166,16 @@ struct VMA { ...@@ -142,6 +166,16 @@ struct VMA {
/* whether corresponding to pgoffset-f_offset page is mapped in this VMA */ /* whether corresponding to pgoffset-f_offset page is mapped in this VMA */
bitmap *page_ismappedv; /* len ~ Δaddr / pagesize */ bitmap *page_ismappedv; /* len ~ Δaddr / pagesize */
/* BigFile-specific field used when VMA was created from fileh opened with
* MMAP_OVERLAY flag. bigfile_ops.mmap_setup_read can initialize this to
* object pointer specific to serving created base overlay mapping.
*
* For example WCFS will use this to link VMA -> wcfs.Mapping to know which
* wcfs-specific mapping is serving particular virtmem VMA.
*
* NULL for VMAs created from under DONT_MMAP_OVERLAY fileh. */
void *mmap_overlay_server;
}; };
...@@ -149,15 +183,34 @@ struct VMA { ...@@ -149,15 +183,34 @@ struct VMA {
* API for clients * * API for clients *
*****************************/ *****************************/
/* FileHOpenFlags - bit flags accepted by fileh_open.
 *
 * If neither MMAP_OVERLAY nor DONT_MMAP_OVERLAY is given, the behaviour is
 * to use mmap overlay if .mmap_* fops != NULL and regular loads otherwise.
 */
enum FileHOpenFlags {
    /* use "mmap overlay" mode for base file data of all mappings created
     * for this fileh.
     *
     * The file must have .mmap_setup_read & friends != NULL in file_ops. */
    MMAP_OVERLAY        = 0x1,

    /* don't use "mmap overlay" mode */
    DONT_MMAP_OVERLAY   = 0x2,
};
typedef enum FileHOpenFlags FileHOpenFlags;
/* open handle for a BigFile /* open handle for a BigFile
* *
* @fileh[out] BigFileH handle to initialize for this open * @fileh[out] BigFileH handle to initialize for this open
* @file * @file
* @ram RAM that will back created fileh mappings * @ram RAM that will back created fileh mappings
* @flags flags for this open - see FileHOpenFlags
* *
* @return 0 - ok, !0 - fail * @return 0 - ok, !0 - fail
*/ */
int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram); int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram, FileHOpenFlags flags);
/* close fileh /* close fileh
...@@ -201,8 +254,9 @@ enum WriteoutFlags { ...@@ -201,8 +254,9 @@ enum WriteoutFlags {
/* mark dirty pages as stored to file ok /* mark dirty pages as stored to file ok
* *
* pages state becomes PAGE_LOADED and all mmaps are updated to map pages as * wcfs: all mmaps are updated to map read-only to base layer.
* R/O to track further writes. * !wcfs: pages state becomes PAGE_LOADED and all mmaps are updated to map
* pages as R/O to track further writes.
*/ */
WRITEOUT_MARKSTORED = 1 << 1, WRITEOUT_MARKSTORED = 1 << 1,
}; };
...@@ -252,7 +306,7 @@ void fileh_dirty_discard(BigFileH *fileh); ...@@ -252,7 +306,7 @@ void fileh_dirty_discard(BigFileH *fileh);
* file was changed externally ) * file was changed externally )
* *
* it's an error to call fileh_invalidate_page() while writeout for fileh is in * it's an error to call fileh_invalidate_page() while writeout for fileh is in
* progress. * progress, or for fileh opened in MMAP_OVERLAY mode.
*/ */
void fileh_invalidate_page(BigFileH *fileh, pgoff_t pgoffset); void fileh_invalidate_page(BigFileH *fileh, pgoff_t pgoffset);
...@@ -332,6 +386,7 @@ typedef struct VirtGilHooks VirtGilHooks; ...@@ -332,6 +386,7 @@ typedef struct VirtGilHooks VirtGilHooks;
void virt_lock_hookgil(const VirtGilHooks *gilhooks); void virt_lock_hookgil(const VirtGilHooks *gilhooks);
bool __fileh_page_isdirty(BigFileH *fileh, pgoff_t pgoff);
// XXX is this needed? think more // XXX is this needed? think more
/* what happens on out-of-memory */ /* what happens on out-of-memory */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment