// Copyright (C) 2018-2020  Nexedi SA and Contributors.
//                          Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.

// Package wcfs provides WCFS client integrated with user-space virtual memory manager.
//
// This client package takes care of WCFS isolation protocol details and
// provides clients with a simple interface to isolated file data similar to
// regular files: given a particular revision of database @at, it provides
// synthetic read-only bigfile mappings with data corresponding to @at state,
// but using /head/bigfile/* most of the time to build and maintain the mappings.
//
// For its data a mapping to bigfile X mostly reuses kernel cache for
// /head/bigfile/X with amount of data not associated with kernel cache for
// /head/bigfile/X being proportional to δ(bigfile/X, at..head). In the usual
// case where many client workers simultaneously serve requests, their database
// views are a bit outdated, but close to head, which means that in practice
// the kernel cache for /head/bigfile/* is being used almost 100% of the time.
//
// A mapping for bigfile X @at is built from OS-level memory mappings of
// on-WCFS files as
//
//                                           ___        /@revA/bigfile/X
//         __                                           /@revB/bigfile/X
//                _                                     /@revC/bigfile/X
//                            +                         ...
//      ───  ───── ──────────────────────────   ─────   /head/bigfile/X
//
// where @revR mmaps are being dynamically added/removed by this client package
// to maintain X@at data view according to WCFS invalidation protocol(*).
//
//
// Integration with wendelin.core virtmem layer
//
// This client package can be used standalone, but additionally provides
// integration with wendelin.core userspace virtual memory manager: when a
// Mapping is created it can be associated as serving base layer for a
// particular virtmem VMA via FileH.mmap(vma=...). In that case, since virtmem
// itself adds another layer of dirty pages over read-only base provided by
// Mapping
//
//                  ┌──┐                      ┌──┐
//                  │RW│                      │RW│    ← virtmem VMA dirty pages
//                  └──┘                      └──┘
//                            +
//                                                    VMA base = X@at view provided by Mapping:
//
//                                           ___        /@revA/bigfile/X
//         __                                           /@revB/bigfile/X
//                _                                     /@revC/bigfile/X
//                            +                         ...
//      ───  ───── ──────────────────────────   ─────   /head/bigfile/X
//
// the Mapping will interact with virtmem layer to coordinate
// mapping memory updates.
//
//
// API overview
//
//  - `WCFS` represents filesystem-level connection to wcfs server.
//  - `Conn` represents logical connection that provides view of data on wcfs
//    filesystem as of particular database state.
//  - `FileH` represents isolated file view under Conn.
//  - `Mapping` represents one memory mapping of FileH.
//
//
// --------
//
// (*) see wcfs.go documentation for overview and details of WCFS isolation protocol.
#ifndef _NXD_WCFS_H_ #define _NXD_WCFS_H_ #include <golang/libgolang.h> #include <golang/cxx.h> #include <golang/sync.h> using namespace golang; using cxx::dict; using cxx::set; #include <string> using std::string; #include <tuple> using std::tuple; #include <utility> using std::pair; #include "wcfs_misc.h" #include <wendelin/bug.h> // from wendelin/bigfile/virtmem.h extern "C" { struct VMA; } // wcfs:: namespace wcfs { typedef refptr<struct _Conn> Conn; typedef refptr<struct _Mapping> Mapping; typedef refptr<struct _FileH> FileH; typedef refptr<struct _WatchLink> WatchLink; struct PinReq; // WCFS represents filesystem-level connection to wcfs server. // // XXX Use join to create it? // // The primary way to access wcfs is to open logical connection viewing on-wcfs // data as of particular database state, and use that logical connection to // create base-layer mappings. See .connect and Conn for details. // // XXX raw files? // // WCFS logically mirrors ZODB.DB . struct WCFS { string mountpoint; pair<Conn, error> connect(zodb::Tid at); string _path(const string &obj); tuple<os::File, error> _open(const string &path, int flags=O_RDONLY); pair<WatchLink, error> _openwatch(); }; // Conn represents logical connection that provides view of data on wcfs // filesystem as of particular database state. // // It uses /head/bigfile/* and notifications received from /head/watch to // maintain isolated database view while at the same time sharing most of data // cache in OS pagecache of /head/bigfile/*. // // Use WCFS.connect(at) to create Conn. // Use .open to create new FileH. // Use .resync to resync Conn onto different database view. // // Conn logically mirrors ZODB.Connection . typedef refptr<struct _Conn> Conn; struct _Conn : object { WCFS *_wc; zodb::Tid at; WatchLink _wlink; // watch/receive pins for mappings created under this conn // XXX kill downMu? (move under filehmu so that e.g. 
.open() can check downErr without race) sync::Mutex _downMu; error _downErr; // !nil if connection is closed or no longer operational sync::Mutex _filehmu; dict<zodb::Oid, FileH> _filehtab; // {} foid -> fileh sync::WorkGroup _pinWG; func<void()> _pinCancel; // don't new - create via WCFS.connect private: _Conn(); ~_Conn(); friend pair<Conn, error> WCFS::connect(zodb::Tid at); public: void decref(); public: pair<FileH, error> open(zodb::Oid foid); error close(); error resync(zodb::Tid at); private: error _pinner(context::Context ctx); error __pinner(context::Context ctx); error _pin1(PinReq *req); error __pin1(PinReq *req); }; // FileH represent isolated file view under Conn. // // The file view is maintained to be as of @Conn.at database state even in the // presence of simultaneous database changes. The file view uses // /head/<file>/data primarily and @revX/<file>/data pin overrides. // // Use .mmap to map file view into memory. typedef refptr<struct _FileH> FileH; struct _FileH : object { Conn wconn; zodb::Oid foid; // ZBigFile root object ID size_t blksize; // block size of this file XXX -> off_t ? os::File _headf; // file object of head/file off_t _headfsize; // head/file size is known to be at least headfsize (size ↑=) dict<int64_t, zodb::Tid> _pinned; // {} blk -> rev that wcfs already sent us for this file vector<Mapping> _mmaps; // []Mapping ↑blk_start mappings of this file // don't new - create via Conn.open private: _FileH(); ~_FileH(); friend pair<FileH, error> _Conn::open(zodb::Oid foid); public: void decref(); public: error close(); pair<Mapping, error> mmap(int64_t blk_start, int64_t blk_len, VMA *vma=nil); }; // Mapping represents one memory mapping of FileH. 
typedef refptr<struct _Mapping> Mapping; struct _Mapping : object { FileH fileh; int64_t blk_start; // offset of this mapping in file uint8_t *mem_start; // mmapped memory [mem_start, mem_stop) uint8_t *mem_stop; VMA *vma; // mmapped under this virtmem VMA | nil if created standalone from virtmem int64_t blk_stop() const { ASSERT((mem_stop - mem_start) % fileh->blksize == 0); return blk_start + (mem_stop - mem_start) / fileh->blksize; } error _remmapblk(int64_t blk, zodb::Tid at); void remmap_blk(int64_t blk); void unmap(); // don't new - create via FileH.mmap private: _Mapping(); ~_Mapping(); friend pair<Mapping, error> _FileH::mmap(int64_t blk_start, int64_t blk_len, VMA *vma); public: void decref(); }; // for testing dict<int64_t, zodb::Tid> _tfileh_pinned(FileH fileh); } // wcfs:: #endif