Commit fae045cc authored by Kirill Smelkov

bigfile/virtmem: Introduce "mmap overlay" mode

with the intention to later use WCFS through it.

Before this patch virtmem had only one mode: a BigFile backend was
providing loadblk and storeblk methods, and on every block access
loadblk was called to load block data into allocated RAM page.

However with WCFS virtmem won't need to do anything to load data -
because loading from head/bigfile/f mmaped through the OS will be handled by
the OS directly. Thus for wcfs, that leaves virtmem only to handle dirtying
and writeout.

-> Introduce "mmap overlay" mode into virtmem to handle WCFS-like
BigFile backends - that can provide read-only base layer suitable for
mmapping.

This patch is organized as follows:

- fileh_open gains a flags argument to indicate which mode to use for
  the opened fileh. BigFileH gains a .mmap_overlay bitfield correspondingly.
  (virtmem.h)

- struct bigfile_ops is extended with 3 optional methods that a BigFile
  backend might provide to support mmap-overlay mode:

  * mmap_setup_read,
  * remmap_blk_read, and
  * munmap

  (see file.h changes for documentation of this new interface)

- if opened with MMAP_OVERLAY flag, virtmem uses those methods to
  organize VMA views backed by a read-only base mmap layer, and to handle
  writeout for such VMAs (virtmem.c)

- a test is added to exercise MMAP_OVERLAY virtmem mode (test_virtmem.c)

- everything else, including bigfile.py, is switched to use
  DONT_MMAP_OVERLAY unconditionally for now.

In internal comments inside virtmem the new mode is interchangeably called
"mmap overlay" and "wcfs", even though wcfs is not yet hooked up to use
mmap-overlaying.

Some preliminary history:

kirr/wendelin.core@fb6932a2    X Split PAGE_LOADED -> PAGE_LOADED, PAGE_LOADED_FOR_WRITE
kirr/wendelin.core@4a20a573    X Settled on what should happen after writeout for wcfs case
kirr/wendelin.core@f084ff9b    X Transition to all VMA under 1 fileh to be either all based on wcfs or all based on !wcfs
parent 10f7153a
/* Wendelin.bigfile | Python interface to memory/files
* Copyright (C) 2014-2020 Nexedi SA and Contributors.
* Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
......@@ -981,7 +981,7 @@ pyfileh_open(PyObject *pyfile0, PyObject *args)
return NULL;
Py_INCREF(pyfile);
err = fileh_open(&pyfileh->fileh, &pyfile->file, ram);
err = fileh_open(&pyfileh->fileh, &pyfile->file, ram, DONT_MMAP_OVERLAY);
if (err) {
XPyErr_SetFromErrno();
Py_DECREF(pyfile);
......
/* Wendelin.bigfile | virtual memory benchmarks
* Copyright (C) 2017-2019 Nexedi SA and Contributors.
* Copyright (C) 2017-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
......@@ -80,7 +80,7 @@ void bench_pagefault() {
};
/* setup f mapping */
err = fileh_open(fh, &f, ram);
err = fileh_open(fh, &f, ram, DONT_MMAP_OVERLAY);
ok1(!err);
err = fileh_mmap(vma, fh, 0, npage);
......
This diff is collapsed.
/* Wendelin.bigfile | tests for real faults leading to crash
* Copyright (C) 2014-2019 Nexedi SA and Contributors.
* Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
......@@ -109,7 +109,7 @@ void fault_in_loadblk()
.file_ops = &faulty_ops,
};
err = fileh_open(&fh, &f, ram);
err = fileh_open(&fh, &f, ram, DONT_MMAP_OVERLAY);
ok1(!err);
err = fileh_mmap(vma, &fh, 0, 2);
......@@ -164,7 +164,7 @@ void fault_in_storeblk()
.file_ops = &faulty_ops,
};
err = fileh_open(&fh, &f, ram);
err = fileh_open(&fh, &f, ram, DONT_MMAP_OVERLAY);
ok1(!err);
err = fileh_mmap(vma, &fh, 0, 2);
......
This diff is collapsed.
......@@ -2,7 +2,7 @@
#define _WENDELIN_BIGFILE_FILE_H_
/* Wendelin.bigfile | Base file class
* Copyright (C) 2014-2020 Nexedi SA and Contributors.
* Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
......@@ -26,7 +26,8 @@
* particular BigFile implementations must provide.
*
* The interfaces are described in `struct bigfile_ops`.
* A particular BigFile implementation must provide loadblk/storeblk methods.
* A particular BigFile implementation must provide loadblk/storeblk and
* optionally mmap_* methods.
*
* Clients work with bigfiles via mapping files to memory - see
* wendelin/bigfile/virtmem.h and BigFileH for client-level API details.
......@@ -39,11 +40,13 @@
extern "C" {
#endif
typedef struct VMA VMA;
/* BigFile base class
*
* BigFile is a file of fixed size blocks. It knows how to load/store blocks
* to/from memory. Nothing else.
* to/from memory. It can also optionally be mmaped into memory.
*
* Concrete file implementations subclass BigFile and define their file_ops.
*/
......@@ -79,6 +82,91 @@ struct bigfile_ops {
* The file is not otherwise used at the time of and past release call.
*/
void (*release) (BigFile *file);
/* Mmap overlaying
*
* Besides .loadblk and .storeblk a particular BigFile implementation can
* also optionally provide functions to setup read-only memory mappings
* with BigFile data. If such functions are provided, virtmem might use
* them to organize read access to BigFile data through the mappings and
* without allocating RAM for read pages. RAM will still be allocated for
* dirtied pages that are laid over the base data layer provided by the
* mappings.
*
* The primary user of this functionality will be wcfs - virtual filesystem that
* provides access to ZBigFile data via OS-level files(*). The layering can
* be schematically depicted as follows
*
* ┌──┐ ┌──┐
* │RW│ │RW│ ← dirty pages
* └──┘ └──┘
* +
* ───────────────────────────────────────────── ← mmap'ed base data
*
* The functions to setup memory mappings are:
*
* - mmap_setup_read(vma, file[blk +blklen)) setup initial read-only mmap to serve vma
* - remmap_blk_read(vma, file[blk]) remmap blk into vma again, after e.g.
* RW dirty page was discarded
* - munmap(vma) before VMA is unmapped
*
*
* (*) see wcfs/client/wcfs.h and wcfs/wcfs.go
*/
/* mmap_setup_read is called to setup new read-only mapping of file[blk +blklen).
*
* The mapping will be used as the base read-only layer for vma.
*
* After setup the bigfile backend manages the mapping and can change it dynamically,
* e.g. due to changes to the file from outside. However, before changing a page,
* the backend must check whether that page was already dirtied by virtmem and, if
* so, must not change that page until virtmem calls .remmap_blk_read.
*
* The checking has to be done with virtmem lock held. A sketch of mapping
* update sequence is as below:
*
* // backend detects that block is changed from outside
* // fileh is vma->fileh - file handle with which the vma is associated
* virt_lock()
* for (pgoff : page_offsets_covered_by(blk))
* if (!__fileh_page_isdirty(fileh, pgoff)) {
* // update mappings for all fileh's vma that cover pgoff
* }
* virt_unlock()
*
* mmap_setup_read must set vma.addr_start and vma.addr_stop according to
* created memory mapping.
*
* mmap_setup_read can use vma.mmap_overlay_server to associate vma with
* object pointer specific to serving created mapping.
*
* Called under virtmem lock. TODO easy to rework to call with !virt_lock
*
* NOTE blk and blklen are in blocks, not pages.
*
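* @addr NULL - mmap at anywhere, !NULL - mmap exactly at addr.
* NOTE(review): the signature below takes no `addr` parameter - confirm
* whether this @addr line is stale.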
* @return 0 - ok !0 - fail
*/
int (*mmap_setup_read) (VMA *vma, BigFile *file, blk_t blk, size_t blklen);
/* remmap_blk_read is called to remmap a block into vma again, after e.g.
* RW dirty page was discarded.
*
* Called under virtmem lock. XXX hard to rework to call with !virt_lock
* Virtmem considers remmap_blk_read failure as fatal.
*/
int (*remmap_blk_read) (VMA *vma, BigFile *file, blk_t blk);
/* munmap is called when vma set up via mmap_setup_read is going to be unmapped.
*
* Called under virtmem lock. TODO easy to rework to call with !virt_lock
* Virtmem considers munmap failure as fatal.
*/
int (*munmap) (VMA *vma, BigFile *file);
};
typedef struct bigfile_ops bigfile_ops;
......
......@@ -2,7 +2,7 @@
#define _WENDELIN_BIGFILE_VIRTMEM_H_
/* Wendelin.bigfile | Virtual memory
* Copyright (C) 2014-2019 Nexedi SA and Contributors.
* Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
......@@ -29,9 +29,24 @@
* Read access to mapped pages cause their on-demand loading, and write access
* marks modified pages as dirty. Dirty pages then can be on request either
* written out back to file or discarded.
*
*
* Mmap overlaying
*
* A particular BigFile implementation can optionally provide functionality to
* mmap its data into memory. For BigFile handles opened in such mode, virtmem
* does not allocate RAM for read access and will only allocate RAM when pages
* are dirtied. The mode in which BigFile handle is opened is specified via
* fileh_open(flags=...).
*
* The primary user of "mmap overlay" functionality will be wcfs - virtual
* filesystem that provides access to ZBigFile data via OS-level files(*).
*
* (*) see wcfs/client/wcfs.h and wcfs/wcfs.go
*/
#include <stdint.h>
#include <stdbool.h>
#include <wendelin/list.h>
#include <wendelin/bigfile/types.h>
#include <wendelin/bigfile/pagemap.h>
......@@ -77,6 +92,13 @@ struct BigFileH {
/* whether writeout is currently in progress */
int writeout_inprogress;
/* whether base data for all VMAs of this fileh is taken as base-layer mmap
*
* ( we require all VMAs under one fileh to be of the same kind, so that it is
* easy to decide whether, after writeout, to keep a page in RAM or to
* drop it completely in order not to waste RSS unnecessarily ) */
unsigned mmap_overlay : 1;
};
typedef struct BigFileH BigFileH;
......@@ -89,7 +111,9 @@ enum PageState {
= 2, /* file content loading was in progress
while request to invalidate the page came in */
PAGE_LOADED = 3, /* file content has been loaded and was not modified */
PAGE_DIRTY = 4, /* file content has been loaded and was modified */
PAGE_LOADED_FOR_WRITE
= 4, /* file content has been loaded and is going to be modified */
PAGE_DIRTY = 5, /* file content has been loaded and was modified */
};
typedef enum PageState PageState;
......@@ -142,6 +166,16 @@ struct VMA {
/* whether corresponding to pgoffset-f_offset page is mapped in this VMA */
bitmap *page_ismappedv; /* len ~ Δaddr / pagesize */
/* BigFile-specific field used when VMA was created from fileh opened with
* MMAP_OVERLAY flag. bigfile_ops.mmap_setup_read can initialize this to
* object pointer specific to serving created base overlay mapping.
*
* For example WCFS will use this to link VMA -> wcfs.Mapping to know which
* wcfs-specific mapping is serving particular virtmem VMA.
*
* NULL for VMAs created from under DONT_MMAP_OVERLAY fileh. */
void *mmap_overlay_server;
};
......@@ -149,15 +183,34 @@ struct VMA {
* API for clients *
*****************************/
/* FileHOpenFlags - bit flags accepted by fileh_open.
 *
 * When neither MMAP_OVERLAY nor DONT_MMAP_OVERLAY is given, the behaviour is
 * to use mmap overlay if the .mmap_* file_ops are != NULL, and regular loads
 * otherwise.
 */
typedef enum FileHOpenFlags {
    /* Use "mmap overlay" mode for the base file data of every mapping
     * created for this fileh.
     *
     * The file must have .mmap_setup_read & friends != NULL in file_ops. */
    MMAP_OVERLAY        = 1 << 0,

    /* Do not use "mmap overlay" mode. */
    DONT_MMAP_OVERLAY   = 1 << 1,
} FileHOpenFlags;
/* open handle for a BigFile
*
* @fileh[out] BigFileH handle to initialize for this open
* @file
* @ram RAM that will back created fileh mappings
* @flags flags for this open - see FileHOpenFlags
*
* @return 0 - ok, !0 - fail
*/
int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram);
int fileh_open(BigFileH *fileh, BigFile *file, RAM *ram, FileHOpenFlags flags);
/* close fileh
......@@ -201,8 +254,9 @@ enum WriteoutFlags {
/* mark dirty pages as stored to file ok
*
* pages state becomes PAGE_LOADED and all mmaps are updated to map pages as
* R/O to track further writes.
* wcfs: all mmaps are updated to map read-only to base layer.
* !wcfs: pages state becomes PAGE_LOADED and all mmaps are updated to map
* pages as R/O to track further writes.
*/
WRITEOUT_MARKSTORED = 1 << 1,
};
......@@ -252,7 +306,7 @@ void fileh_dirty_discard(BigFileH *fileh);
* file was changed externally )
*
* it's an error to call fileh_invalidate_page() while writeout for fileh is in
* progress.
* progress, or for fileh opened in MMAP_OVERLAY mode.
*/
void fileh_invalidate_page(BigFileH *fileh, pgoff_t pgoffset);
......@@ -332,6 +386,7 @@ typedef struct VirtGilHooks VirtGilHooks;
void virt_lock_hookgil(const VirtGilHooks *gilhooks);
bool __fileh_page_isdirty(BigFileH *fileh, pgoff_t pgoff);
// XXX is this needed? think more
/* what happens on out-of-memory */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment