Commit 8c935a5f authored by Kirill Smelkov's avatar Kirill Smelkov

bigfile: RAM subsystem

This thing allows to get aliasable RAM from OS kernel and to manage it.
Currently we get memory from a tmpfs mount, and hugetlbfs should also
work, but is TODO because hugetlbfs in the kernel needs to be improved.

We need aliasing because we'll need to be able to memory map the same
page into several places in address space, e.g. for taking two
overlapping slices of the same array at different times.

Comes with test programs that show that aliasing does not work for
anonymous memory.
parent 77d61533
/* Wendelin.bigfile | Interfaces to work with RAM
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*
*
* TODO write why this needed (shmfs, hugetlbfs, ...)
*/
#include <wendelin/bigfile/ram.h>
#include <wendelin/bigfile/virtmem.h>
#include <wendelin/utils.h>
#include <wendelin/bug.h>
/* ramh_ops */
/* Allocate a new page of ramh memory.
 *
 * A Page descriptor is heap-allocated and backing memory is requested from
 * the handle's backend at pgoffset_hint.
 *
 * @return new page | NULL (descriptor or backend allocation failed)
 */
Page *ramh_alloc_page(RAMH *ramh, pgoff_t pgoffset_hint)
{
    struct Page *newpage;
    pgoff_t pgoff_in_ramh;

    // XXX ok to malloc, or better do more structured allocations?
    newpage = zalloc(sizeof(*newpage));
    if (!newpage)
        return NULL;

    pgoff_in_ramh = ramh->ramh_ops->alloc_page(ramh, pgoffset_hint);
    if (pgoff_in_ramh == RAMH_PGOFF_ALLOCFAIL) {
        free(newpage);
        return NULL;
    }

    newpage->state         = PAGE_EMPTY;
    /* ->file & ->f_pgoffset are left unset */
    newpage->ramh          = ramh;
    newpage->ramh_pgoffset = pgoff_in_ramh;
    INIT_LIST_HEAD(&newpage->lru);      /* NOTE ->lru left unlinked */
    newpage->refcnt        = 0;

    return newpage;
}
/* Release ramh memory-page at ramh_pgoffset back to the backend / OS. */
void ramh_drop_memory(RAMH *ramh, pgoff_t ramh_pgoffset)
{
    const struct ramh_ops *ops = ramh->ramh_ops;
    ops->drop_memory(ramh, ramh_pgoffset);
}
/* Close RAM handle - dispatches to the backend's close handler. */
void ramh_close(RAMH *ramh)
{
    const struct ramh_ops *ops = ramh->ramh_ops;
    ops->close(ramh);
}
/* ram_ops */
/* Query the backend for RAM's current whole size, in pages. */
size_t ram_get_current_maxsize(RAM *ram)
{
    const struct ram_ops *ops = ram->ram_ops;
    return ops->get_current_maxsize(ram);
}
/* Open a new RAM handle on ram - dispatches to the backend constructor. */
RAMH *ramh_open(RAM *ram)
{
    const struct ram_ops *ops = ram->ram_ops;
    return ops->ramh_open(ram);
}
/* close RAM
 *
 * NOTE(review): not implemented yet - it only warns; the backend ->close() is
 * never invoked, so whatever the RAM object holds is currently leaked.
 */
void ram_close(RAM *ram)
{
    WARN("TODO ram_close()"); // XXX
    //return ram->ram_ops->close(ram);
}
/* RAM types */
/* registry of all RAM types known to the process (-> ram_type_entry.list) */
static LIST_HEAD(ram_types);

/* one registered RAM type + its lazily-created default RAM instance */
struct ram_type_entry {
    const struct ram_type *ram_type;    /* type description & constructor */
    RAM *default_ram;                   /* created on first ram_get_default() */

    struct list_head list;              /* linked into ram_types */
};
/* Register a RAM backend type so ram_new()/ram_get_default() can find it by name. */
void ram_register_type(const struct ram_type *ram_type)
{
    struct ram_type_entry *entry;

    entry = xzalloc(sizeof(*entry));
    entry->ram_type = ram_type;
    list_add_tail(&entry->list, &ram_types);
}
static const char ram_type_default[] = "shmfs"; /* default type for !ram_type */
/* Create new RAM instance of type @ram_type (NULL -> default type), passing
 * @arg to the type's constructor.
 *
 * @return new RAM | NULL if the type is not registered.
 */
RAM *ram_new(const char *ram_type, const char *arg)
{
    struct list_head *pos;

    if (ram_type == NULL)
        ram_type = ram_type_default;

    list_for_each(pos, &ram_types) {
        struct ram_type_entry *rte = list_entry(pos, typeof(*rte), list);
        const struct ram_type *rt  = rte->ram_type;

        if (strcmp(rt->name, ram_type) == 0)
            return rt->ram_new(arg);
    }

    return NULL;
}
/* Get default RAM instance for @ram_type (NULL -> default type), creating it
 * on first use.
 *
 * BUGs if the type is not registered or its default instance cannot be made.
 */
RAM *ram_get_default(const char *ram_type)
{
    struct list_head *pos;

    if (ram_type == NULL)
        ram_type = ram_type_default;

    list_for_each(pos, &ram_types) {
        struct ram_type_entry *rte = list_entry(pos, typeof(*rte), list);
        const struct ram_type *rt  = rte->ram_type;

        if (strcmp(rt->name, ram_type) != 0)
            continue;

        /* lazily construct the default instance for this type */
        if (rte->default_ram == NULL)
            rte->default_ram = rt->ram_new(NULL);
        BUG_ON(rte->default_ram == NULL);

        return rte->default_ram;
    }

    BUG();
}
/* Wendelin.bigfile | hugetlbfs ram backend
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*/
// TODO /dev/hugepages & hugetlbfs (see t/shm-punch-hole.c for notes on hugetlbfs)
/* Wendelin.bigfile | shmfs (aka tmpfs) ram backend
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*
* ~~~~~~~~
*
* TODO description
*/
#include <wendelin/bigfile/ram.h>
#include <wendelin/utils.h>
#include <wendelin/bug.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/vfs.h>
#include <sys/mman.h>
#include <sys/types.h>
/* we'll manage RAM in "pages" of 2M
*
* Compared to std 4K pages, this will reduce per-page overhead and also
* coincides with huge page size on x86/x86_64).
*
* Hardware pages will still be of usual 4K size - we'll just manage them in
* 512-pages groups.
*/
#define SHMFS_PAGE_SIZE (2*1024*1024ULL)
/* default prefix & ramh files template */
static const char shmfs_ram_prefix_default[] = "/dev/shm";
static const char shmfs_ramh_template[] = "ramh.XXXXXX";
/* RAM on shmfs
 *
 * NOTE embeds RAM as unnamed first member, so SHMFS_RAM* is viewed as RAM*
 * via upcast() elsewhere in this file. */
struct SHMFS_RAM {
    RAM;

    const char *prefix; /* prefix where to create ramh files */
};
typedef struct SHMFS_RAM SHMFS_RAM;

/* RAM Handle on shmfs - backed by one (normally unlinked) file under prefix */
struct SHMFS_RAMH {
    RAMH;

    int ramh_fd;            /* fd of backing file on shmfs */
    size_t ramh_fpgsize;    /* current file size in pagesize units */
};
typedef struct SHMFS_RAMH SHMFS_RAMH;
/* mmap 1 ramh page into the address space.
 *
 * @addr NULL - map anywhere; !NULL - map exactly there (MAP_FIXED)
 * @return mapped address | NULL on failure
 */
static void *shmfs_mmap_page(RAMH *ramh0, pgoff_t ramh_pgoffset, void *addr, int prot)
{
    SHMFS_RAMH *ramh = upcast(SHMFS_RAMH *, ramh0);
    size_t pgsize = ramh->ram->pagesize;
    int flags = MAP_SHARED;
    void *result;

    if (addr)
        flags |= MAP_FIXED;

    // XXX MAP_POPULATE so that we can access mmaped memory without additional pagefault?
    //     tried -> this mmap becomes slow, and overall the whole run is slower. XXX why?
    result = mmap(addr, pgsize, prot, flags, ramh->ramh_fd,
                  ramh_pgoffset * pgsize);

    return (result == MAP_FAILED) ? NULL : result;
}
pgoff_t shmfs_alloc_page(RAMH *ramh0, pgoff_t pgoffset_hint)
{
// FIXME double calls with same pgoffset_hint ? (or move ->pagemap to ramh ?)
SHMFS_RAMH *ramh = upcast(SHMFS_RAMH *, ramh0);
pgoff_t ramh_pgoffset = pgoffset_hint;
size_t pagesize = ramh->ram->pagesize;
int err;
/*
* - allocate space for page at ramh_pgoffset,
* - hole-grow file to size covering that page, if file was smaller,
*
* all in one go.
*
* We allocate filesystem space so that we know we really allocated that
* memory now and that client code will not get SIGBUS on memory read/write
* or EFAULT on syscalls read/write, when accessing it later.
*
* It is easier to handle ENOMEM synchronously.
*/
err = fallocate(ramh->ramh_fd, 0 /* without KEEP_SIZE */,
ramh_pgoffset * pagesize, pagesize);
if (err)
return RAMH_PGOFF_ALLOCFAIL;
if (ramh_pgoffset >= ramh->ramh_fpgsize)
ramh->ramh_fpgsize = ramh_pgoffset+1;
return ramh_pgoffset;
}
/* Release one ramh page's memory back to the OS via holepunch.
 *
 * Page content is lost; the kernel allocates the page afresh (zero-filled) on
 * next access.  File size is kept (FALLOC_FL_KEEP_SIZE).
 */
static void shmfs_drop_memory(RAMH *ramh0, pgoff_t ramh_pgoffset)
{
    SHMFS_RAMH *ramh = upcast(SHMFS_RAMH *, ramh0);
    size_t pagesize = ramh->ram->pagesize;

    /* offset must be within the file's current page-extent */
    BUG_ON(ramh_pgoffset >= ramh->ramh_fpgsize);

    // XXX state -> empty ?

    /* punch hole and this way release memory to OS.
     * this should not fail - if it is, something is wrong */
    xfallocate(ramh->ramh_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
               ramh_pgoffset * pagesize, pagesize);
}
/* Close ramh: drop all of its memory and close the backing file.
 *
 * NOTE(review): the SHMFS_RAMH struct itself is not freed (see TODO below),
 * and nothing verifies that no mappings from this ramh remain.
 */
static void shmfs_close(RAMH *ramh0)
{
    SHMFS_RAMH *ramh = upcast(SHMFS_RAMH *, ramh0);

    // XXX verify no mapping left?

    /* drop all memory & close */
    xftruncate(ramh->ramh_fd, 0);
    xclose(ramh->ramh_fd);
    ramh->ramh_fd = -1;
    ramh->ramh_fpgsize = 0;

    // TODO free(self) ?
}
/* RAMH operations for the shmfs backend (see struct ramh_ops in ram.h) */
static const struct ramh_ops shmfs_ramh_ops = {
    .alloc_page = shmfs_alloc_page,
    .mmap_page = shmfs_mmap_page,
    .drop_memory = shmfs_drop_memory,
    .close = shmfs_close,
};
/* Report whole size of the filesystem behind ram->prefix, in RAM pages. */
static size_t shmfs_get_current_maxsize(RAM *ram0)
{
    SHMFS_RAM *ram = upcast(SHMFS_RAM *, ram0);
    struct statfs sf;

    // XXX races with fs remount/change under prefix
    if (statfs(ram->prefix, &sf) != 0)
        BUGe();

    return sf.f_blocks * sf.f_bsize / ram->pagesize;
}
/* Open new RAM handle on shmfs RAM: create a temporary backing file under
 * ram->prefix and (normally) unlink it right away.
 *
 * Set WENDELIN_RAMH_HIDE to something not starting with 'y' to keep the file
 * visible for debugging.
 *
 * @return new RAMH | NULL on failure.
 */
static RAMH *shmfs_ramh_open(RAM *ram0)
{
    SHMFS_RAM *ram = upcast(SHMFS_RAM *, ram0);
    SHMFS_RAMH *ramh;
    char *s, *ramh_filename = NULL;
    int err;

    ramh = zalloc(sizeof(*ramh));
    if (!ramh)
        goto out;

    ramh->ramh_ops = &shmfs_ramh_ops;
    ramh->ram      = ram;

    /* ramh_filename = ram->prefix + "/" + shmfs_ramh_template */
    ramh_filename = malloc(strlen(ram->prefix) + 1/*"/"*/ +
                           strlen(shmfs_ramh_template) + 1/*NUL*/);
    if (!ramh_filename)
        goto out;

    s = ramh_filename;
    s = stpcpy(s, ram->prefix);
    s = stpcpy(s, "/");
    s = stpcpy(s, shmfs_ramh_template);

    ramh->ramh_fd = mkstemp(ramh_filename);
    if (ramh->ramh_fd == -1)
        goto out;

    // XXX maybe by default show and unlink atexit / on close
    /* unlink ramh file, unless asked to leave it visible for debugging
     * (fix: the `s && ...` half of the old condition was redundant - !s
     *  already short-circuits) */
    s = getenv("WENDELIN_RAMH_HIDE");
    if (!s || s[0] == 'y') {
        err = unlink(ramh_filename);
        if (err)
            BUGe();
    }
    free(ramh_filename);

    ramh->ramh_fpgsize = 0;
    return ramh;

out:
    free(ramh);
    free(ramh_filename);
    return NULL;
}
/* RAM operations for the shmfs backend (NOTE .close is not implemented yet) */
static const struct ram_ops shmfs_ram_ops = {
    .get_current_maxsize = shmfs_get_current_maxsize,
    .ramh_open = shmfs_ramh_open,
    //.close = shmfs_ram_dtor
};
/* shmfs ram type */
/* Construct new shmfs RAM.
 *
 * @arg path of shmfs mountpoint to use | NULL -> /dev/shm
 *
 * (fix: dropped the stray `;` that followed the function body - an extra
 *  file-scope semicolon is a constraint violation in strict C)
 */
static RAM *shmfs_ram_new(const char *arg)
{
    SHMFS_RAM *ram = xzalloc(sizeof(*ram));

    ram->ram_ops  = &shmfs_ram_ops;
    ram->pagesize = SHMFS_PAGE_SIZE;
    INIT_LIST_HEAD(&ram->lru_list);

    // TODO ensure prefix points to somewhere on shmfs
    ram->prefix = xstrdup(arg ?: shmfs_ram_prefix_default);

    return ram;
}
// TODO shmfs_ram_dtor
/* type descriptor through which shmfs RAM is created by name ("shmfs") */
static const struct ram_type shmfs_ram_type = {
    .name = "shmfs",
    .ram_new = shmfs_ram_new,
};

/* register "shmfs" RAM type at program startup (runs before main) */
__attribute__((constructor))
static void shmfs_init(void)
{
    ram_register_type(&shmfs_ram_type);
}
/* Wendelin.bigfile | ram tests
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*/
// RUNWITH: t/t_with-tmpfs /dev/shm 7m
// XXX better link with it
#include "../ram.c"
#include "../virtmem.c"
#include "../ram_shmfs.c"
#include <ccan/tap/tap.h>
/* RAM subsystem test.
 *
 * NOTE(review): assumes it runs under t/t_with-tmpfs with a fresh 7M tmpfs
 * mounted at /dev/shm (see RUNWITH above) - the page counts asserted below
 * depend on that exact size.
 */
int main()
{
    RAM *ram;
    RAMH *ramh;
    Page *page0, *page1, *page2, *page3;
    uint8_t *p0, *p01, *p02, *p03, *p1, *p2, *_;
    size_t ps, ram_maxsize;

    tap_fail_callback = abort;  // XXX to catch failure immediately

    diag("Testing ram");

    ram = ram_new("shmfs", NULL);
    ok1(ram);
    ps = ram->pagesize;
    ok1(ps == 2*1024*1024); // to be sure it correlates with 7m (=3.5 pages) in setup

    ramh = ramh_open(ram);
    ok1(ramh);

    /* freshly allocated page is EMPTY and has RAM's pagesize */
    page0 = ramh_alloc_page(ramh, 0);
    ok1(page0);
    ok1(page0->state == PAGE_EMPTY);
    ok1(ram->pagesize == page_size(page0));

    /* mmap page0 into 2 places somewhere */
    p01 = page_mmap(page0, NULL, PROT_READ | PROT_WRITE);
    ok1(p01);
    p02 = page_mmap(page0, NULL, PROT_READ | PROT_WRITE);
    ok1(p02);
    ok1(p02 != p01);
    /* fresh page must read as zeros through both mappings */
    ok1(p01[0] == 0);   ok1(p01[ps-1] == 0);
    ok1(p02[0] == 0);   ok1(p02[ps-1] == 0);

    /* mappings should be to the same memory */
    p01[0] = 1;     ok1(p02[0] == 1);
    p02[ps-1] = 2;  ok1(p01[ps-1] == 2);

    /* mmap page0 to fixed addr and check memory is the same */
    p03 = mem_xvalloc(NULL, ps);    /* allocate virt address space somewhere */
    ok1(p03);
    _ = page_mmap(page0, p03, PROT_READ | PROT_WRITE);
    ok1(_);
    ok1(_ == p03);
    ok1(p03[0] == 1);
    ok1(p03[ps-1] == 2);
    /* writes via any mapping are visible through all three */
    p03[0] = 4;     ok1(p01[0] == 4);       ok1(p02[0] == 4);
    p01[ps-1] = 5;  ok1(p02[ps-1] == 5);    ok1(p03[ps-1] == 5);

    /* memory is forgotten after drop */
    ramh_drop_memory(ramh, page0->ramh_pgoffset);
    ok1(p01[0] == 0);       ok1(p02[0] == 0);       ok1(p03[0] == 0);
    ok1(p01[ps-1] == 0);    ok1(p02[ps-1] == 0);    ok1(p03[ps-1] == 0);

    /* let's allocate memory with pgoffset > current ram_maxsize */
    ram_maxsize = ram_get_current_maxsize(ram);
    ok1(ram_maxsize);
    page2 = ramh_alloc_page(ramh, 2*ram_maxsize);
    ok1(page2);

    /* see if we can access & drop it */
    p1 = page_mmap(page2, NULL, PROT_READ | PROT_WRITE);
    ok1(p1);
    p1[0] = 1;
    p1[ps-1] = 1;
    ramh_drop_memory(ramh, page2->ramh_pgoffset);
    ok1(p1[0] == 0);
    ok1(p1[ps-1] == 0);

    xmunmap(p1, ps);
    xmunmap(p01, ps);
    xmunmap(p02, ps);
    xmunmap(p03, ps);

    /* ensure we get "no memory" when overallocating (not doing so would lead
     * to getting SIGBUS on accessing memory and EFAULT on read/write
     * syscalls). */
    ok1(ram_maxsize == 3);  /* NOTE must correlate with size in XRUN setup */

    page0 = ramh_alloc_page(ramh, 0);
    ok1(page0);
    page1 = ramh_alloc_page(ramh, 1);
    ok1(page1);
    page2 = ramh_alloc_page(ramh, 2);
    ok1(page2);
    page3 = ramh_alloc_page(ramh, 3);
    ok1(!page3);    /* must fail - there is no such amount of memory */

    p0 = page_mmap(page0, NULL, PROT_READ | PROT_WRITE);    ok1(p0);
    p1 = page_mmap(page1, NULL, PROT_READ | PROT_WRITE);    ok1(p1);
    p2 = page_mmap(page2, NULL, PROT_READ | PROT_WRITE);    ok1(p2);

    /* touch all memory - so that we know we can use it without getting SIGBUS */
    memset(p0, 0xff, ps);
    memset(p1, 0xff, ps);
    memset(p2, 0xff, ps);

    // TODO allocate memory amount = 2*ram_maxsize and touch it linearly

    ramh_close(ramh);

    return 0;
}
#ifndef _WENDELIN_BIGFILE_RAM_H_
#define _WENDELIN_BIGFILE_RAM_H_
/* Wendelin.bigfile | Interfaces to work with RAM
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*
* ~~~~~~~~
* TODO write why this needed (shmfs, hugetlbfs, ...)
*
*
* - need to track pages state (to support commit / abort)
*
* - need to "unload" non-dirty pages to free place for requested new data (reclaim)
*
* - need to be able to map a page into several places (to support
* overlapping-in-file mappings done not neccessarily adjacent-in-time to
* each other - there is no guarantee mapping them adjacent in address space
* is possible)
*
* XXX
*/
#include <wendelin/list.h>
#include <wendelin/bigfile/types.h>
typedef struct RAMH RAMH;
typedef struct Page Page;
/* RAM - something that provides access to memory (ex. via shmfs, hugetlbfs).
*
* Can create pages from memory allocated from backend and give memory back to
* system on request.
*
* Pages allocation/deallocation is done through handles (see RAMH).
*/
struct RAM {
    const struct ram_ops *ram_ops;  /* operations implemented by the backend */

    size_t pagesize;                /* size of 1 RAM page, in bytes */

    struct list_head lru_list;      /* RAM pages in usage order (_ -> page->lru) */
};
typedef struct RAM RAM;
/* RAM operations - implemented by RAM backend */
struct ram_ops {
size_t (*get_current_maxsize) (RAM *ram);
RAMH * (*ramh_open) (RAM *ram);
void (*close) (RAM *ram);
};
/* get RAM current max size (in pages)
*
* Maximum size is RAM current whole size, which is shared by RAM Handles and
* other independent-from-us users (possibly from another processes).
*
* So amount of ram allocated for all RAM Handles could a) not be bigger than
* this, and b) there is no guarantee that this maximum could be achieved via
* allocating for RAMH only.
*
* Maximum is "current" because it can change dynamically - ex. via RAM
* hotplug.
*/
size_t ram_get_current_maxsize(RAM *ram);
/* open RAM handle
*
* Open new handle for memory inside RAM. Close the handle with ramh_close()
*/
RAMH *ramh_open(RAM *ram);
/* close RAM
*
* TODO text
*/
void ram_close(RAM *ram);
/* RAM Handle - handle to allocate/free pages from/to RAM
*
* Additional level on top of RAM which allows to group pages allocation.
*
* RAM backends are assumed to be organized that for a RAM handle, all pages
* allocated via that handle are provided by a single file in the OS kernel.
*
* With such organization, if 2 pages are allocated with adjacent pgoffset
* and mapped adjacent to each-other in address space - there will be only 1
* in-os-kernel VMA representing those 2 pages mapping.
*
* ( #os-vma should be kept to a minimum, because on every pagefault OS kernel
* needs to lookup faulting_addr -> os_vma )
*/
struct RAMH {
const struct ramh_ops *ramh_ops;
RAM *ram;
};
typedef struct RAMH RAMH;
/* RAMH operations - implemented by RAM backend */
struct ramh_ops {
#define RAMH_PGOFF_ALLOCFAIL ((pgoff_t)-1ULL)
    /* @return: allocated ram_pgoffset | RAMH_PGOFF_ALLOCFAIL */
    pgoff_t (*alloc_page) (RAMH *ramh, pgoff_t pgoffset_hint);

    /* mmap page at ramh_pgoffset into address space (addr NULL - anywhere) */
    void * (*mmap_page) (RAMH *ramh, pgoff_t ramh_pgoffset, void *addr, int prot);

    /* release page memory at ramh_pgoffset back to OS */
    void (*drop_memory) (RAMH *ramh, pgoff_t ramh_pgoffset);

    /* close handle and release its backend resources */
    void (*close) (RAMH *ramh);
};
/* allocate new page for ramh memory
*
* @pgoffset_hint hint at which offset to allocate memory -
*
* - could be used so that f_offsets coincide with ramh_offsets
* and as the result, allocated areas constitute of contiguous
* ramh memory = only 1 kernel VMA for whole area.
*
* @return new page | NULL
*
* XXX write on how to free pages (= drop & free(struct Page) ?)
*
* NOTE after allocation, page->fileh & page->f_pgoffset are unset
*/
Page *ramh_alloc_page(RAMH *ramh, pgoff_t pgoffset_hint);
/* release ramh memory-page at ramh_pgoffset to OS
*
* After this call previous content of the memory-page is lost and the memory
* is released to OS.
*
* The memory is still accessible for mmaping but will read as all zeros - on
* first access OS would again allocate memory for it from scratch.
*/
void ramh_drop_memory(RAMH *ramh, pgoff_t ramh_pgoffset);
/* close RAMH handle
*
* NOTE it is an error to call close() with mappings from ramh left
*/
void ramh_close(RAMH *ramh);
/* get default RAM by type
*
* @ram_type str for ram-type |NULL - get for default type
*/
RAM *ram_get_default(const char *ram_type);
/* create new RAM instance
*
* @ram_type str for ram-type |NULL - create for default type
* @arg str to pass to ram_type RAM constructor (NULL - use defaults)
*/
RAM *ram_new(const char *ram_type, const char *arg);
/* RAM type registration (for RAM implementers) */
struct ram_type {
const char *name;
RAM * (*ram_new) (const char *arg);
};
void ram_register_type(const struct ram_type *ram_type);
#endif
......@@ -29,6 +29,9 @@ _bigfile = Extension('wendelin.bigfile._bigfile',
'bigfile/_bigfile.c',
'bigfile/pagefault.c',
'bigfile/pagemap.c',
'bigfile/ram.c',
'bigfile/ram_shmfs.c',
'bigfile/ram_hugetlbfs.c',
'bigfile/virtmem.c',
'lib/bug.c',
'lib/utils.c',
......
/* XXX MAP_ANONYMOUS | MAP_SHARED is not really anonymous - the kernel
* internally opens a file for such mappings on shmfs (=tmpfs), or hugetlbfs for
* MAP_HUGETLB
*
* -> this is not memory aliasing for anonymous memory - this is equivalent
* to usual file mappings into multiple addresses with the same offset.
*
* -> memory aliasing for MAP_ANONYMOUS | MAP_PRIVATE memory is not
* possible as of 3.17-rc1.
*
* The most close thing so far is out-of-tree patch for remap_anon_pages()
* from Andrea Arcangeli - see http://lwn.net/Articles/550555/ (from 2013,
* but it gives the idea; the patch is being continuously updated at
* git://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git)
*
* updates:
*
* http://lwn.net/Articles/615086/ 2014 Oct
* http://lwn.net/Articles/636226/ 2015 March
*
* and remap_anon_pages() is going away because Linus dislikes it.
*
* Unfortunately, as of 3.17-rc1 transparent huge pages (THP) work only with
* private anonymous mappings...
*
* Resume: in order to use hugepages and aliasing, for now we have to stick to
* allocating pages from files on shmfs and hugetlbfs and do our own management
* with those. Sigh...
*
* Original text follows :
*
* ---- 8< ----
* Demo program, that shows how to mmap-alias two memory areas for anonymous memory.
*
* The way it works is through undocumented-in-man, but well-fixed and
* documented in kernel source
*
* mremap(old_addr, old_size, new_size, MREMAP_FIXED, new_addr)
*
* behaviour - if old_size=0, then old_addr is NOT unmapped, and as the result
* we get two mapping pointing to the same physical memory.
*
* experimentally observed that for the trick to work, original mapping has to
* be obtained with MAP_SHARED - with MAP_PRIVATE old_addr is not unmapped but
* instead of alias, new zero page is returned (maybe need to investigate - THP
* currently only work with MAP_PRIVATE mappings).
*
* References
* ----------
*
* https://git.kernel.org/cgit/linux/kernel/git/history/history.git/commit/?id=4547e81c1f3e35dc47c4bfbfd3444cb0401c2b0b
* commit 4547e81c1f3e35dc47c4bfbfd3444cb0401c2b0b
* Author: Linus Torvalds <torvalds@evo.osdl.org>
* Date: Mon Jan 12 01:13:40 2004 -0800
*
* Dosemu actually wants to do a zero-sized source mremap
* to generate the duplicate area at 0x0000 and 0x100000.
*
* There's no downside to it, so allow it even though it's
* a tad strange.
*
* http://sourceforge.net/p/dosemu/code/ci/master/tree/src/arch/linux/mapping/mapshm.c#l23
* """The trick is to set old_len = 0,
* this won't unmap at the old address, but with
* shared mem the 'nopage' vm_op will map in the right
* pages. We need however to take care not to map
* past the end of the shm area"""
*/
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#undef NDEBUG
#include <assert.h>
/* Report msg with errno description and terminate the program. */
void die(const char *msg)
{
    perror(msg);
    exit(EXIT_FAILURE);
}
/* Verify that mremap(old_size=0) aliasing works for a mapping of given len
 * obtained with MAP_ANONYMOUS | mmap_flags.
 *
 * Aborts (via die/assert) on any failure; prints OK on success.
 */
void verify(const char *title, size_t len, int mmap_flags)
{
    uint8_t *P1, *P2, *P;
    int err;

    printf("Verifying %s (len=%lx\tflags=%x) ...", title, len, mmap_flags);
    fflush(stdout);

    /* first page with R/W */
    P1 = mmap(NULL, len, PROT_READ | PROT_WRITE,
              MAP_SHARED /* <- important; for MAP_PRIVATE the trick does not work */
              | mmap_flags
              | MAP_ANONYMOUS, -1, 0);
    if (P1 == MAP_FAILED)
        die("mmap P1");

    /* seed marker values so the alias can be recognized */
    P1[0] = 0;
    P1[1] = 1;
    P1[2] = 2;
    P1[len-1] = 99;

    /* second page - just address space so far */
    P2 = mmap(NULL, len, PROT_NONE,
              MAP_SHARED | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
    if (P2 == MAP_FAILED)
        die("mmap P2");

    /*
     * mmap-alias P1 to P2. The trick is to pass 0 as old_size - then P1 won't
     * be unmapped.
     */
    P = mremap(P1, 0/*old_size - important!*/, len,
               MREMAP_FIXED | MREMAP_MAYMOVE, P2);
    if (P == MAP_FAILED)
        die("mremap P1->P2");
    assert(P == P2);

    /* P1 should still be mapped (and contain old values) */
    assert (P1[0] == 0);
    assert (P1[1] == 1);
    assert (P1[2] == 2);
    assert (P1[len-1] == 99);

    /* P2 should be mapped too */
    assert (P2[0] == 0);
    assert (P2[1] == 1);
    assert (P2[2] == 2);
    assert (P2[len-1] == 99);

    /* verify changes propagate back and forth */
    P2[0] = 11;         assert (P1[0] == 11);
    P1[1] = 12;         assert (P2[1] == 12);
    P2[len-1] = 100;    assert (P1[len-1] == 100);
    P1[len-2] = 88;     assert (P2[len-2] == 88);

    /* unmapping the original must not tear down the alias */
    err = munmap(P1, len);
    if (err < 0)
        die("munmap P1");

    /* verify P2 is still there */
    assert (P2[0] == 11);
    assert (P2[1] == 12);
    assert (P2[len-2] == 88);
    assert (P2[len-1] == 100);

    err = munmap(P2, len);
    if (err < 0)
        die("munmap P2");

    printf("\tOK\n");
}
/* Run the aliasing check for standard pages, then for huge pages.
 *
 * NOTE(review): page sizes are hardcoded for x86/x86_64; the huge-page case
 * is expected to fail as described below.
 */
int main()
{
    size_t pagesize = 4096;             // XXX hardcoded
    size_t pagehuge = 2*1024*1024;      // XXX hardcoded

    verify("std pages ", 4*pagesize, 0);

    /*
     * NOTE(2admin) By default # of ready hugepages is 0. Adjust
     *
     *      /proc/sys/vm/nr_hugepages
     *
     * or more explicitly
     *
     *      /sys/kernel/mm/hugepages/hugepages-<size>/nr_hugepages
     *
     * before trying to allocate them.
     */
    // FIXME for this mremap fails with EINVAL - explicitly disabled in
    // mremap::vma_to_resize(). Changing this would need thorough understanding
    // of linux mm, which is out of scope for now.
    verify("huge pages", 4*pagehuge, MAP_HUGETLB);

    return 0;
}
/* Demo program that shows how to release memory on shmfs (=tmpfs) via
* FALLOC_FL_PUNCH_HOLE - watch how it runs with
*
* `watch -n1 df -h /dev/shm/`
*
*
* NOTE if WORK > free ram dedicated to /dev/shm we get SIGBUS:
*
* via this way kernel tells userspace that there is no more space in
* backing store attached to mapping.
*
* Strategy -> free some memory on that filesystem (in particular at that
* file) and retry
*
*
* NOTE hugetlbfs (as of 3.19-rc1) supports sparse files but does not support holepunch.
*
* 1) sparse files:
*
* $ cd /dev/hugepages/
* $ truncate -s 128T x
* $ ls -lh x
* ... 128T ... x
* $ du -sh x
* 0 x
*
* # then one can mmap some inner part of it, e.g. 4M-8M and use that
* # and only that memory will be allocated.
*
*
* 2) holepunch: it supports punching holes at file-end though (i.e. ftruncate
* works) and I've digged through sys_truncate() and its support for hugetlbfs
* in hugetlbfs_setattr() and it looks like extending this support to cover
* "truncating range" (i.e. holepunch) should not be that hard.
*
* -> TODO fix hugetlbfs
*
* NOTE care should be taken to correctly maintain huge-pages reservation
* numbers etc (HugePages_Rsvd in /proc/meminfo) as hugetlbfs plays own
* games with reservation on each mmap to be able to promise not to get
* SIGBUS at later page access.
*
* https://lkml.org/lkml/2011/11/16/499
*/
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/user.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <err.h>
#include <errno.h>
#include <stdint.h>
#include <signal.h>
#include <stdlib.h>
#undef NDEBUG
#include <assert.h>
#define KB (1024ULL)
#define MB (1024*KB)
#define GB (1024*MB)
#define TB (1024*GB)
#define PB (1024*TB)
#define EB (1024*PB)
//#define ZB (1024*EB)
//#define YB (1024*ZB)
#define VIRTMAX (64*TB) /* 2^46 - address space limit on linux/x86_64 */
#define FILEMAX (8*EB-1) /* 2^64 - file size limit + off_t is signed -> 2^63 */
#define WORK (16*GB)
#define RECLAIM_BATCH (128*256) /* how many pages to free at once (= 128MB) */
/* string for si_code on SIGBUS */
/* Map a SIGBUS si_code value to a human-readable description.
 *
 * @return static description string | NULL for unknown codes.
 */
const char *buscode(int si_code)
{
    switch (si_code) {
    case BUS_ADRALN:
        return "invalid address alignment";
    case BUS_ADRERR:
        return "non-existent physical address";
    case BUS_OBJERR:
        return "object specific hardware error";
    case BUS_MCEERR_AR:
        return "hardware memory error consumed on a machine check: action required";
    case BUS_MCEERR_AO:
        return "hardware memory error detected in process but not consumed: action optional";
    }
    return NULL; // crash
}
int fd;
/* where allocated area for work starts (may grow up, as we free lru memory) */
size_t work_alloc_start;
uint8_t *p;
size_t pagesize;
/* simple memory reclaim on SIGBUS */
/* simple memory reclaim on SIGBUS
 *
 * When the filesystem backing the mapping runs out of space, the kernel
 * reports it as SIGBUS/BUS_ADRERR on access.  This handler frees a batch of
 * the oldest touched pages via holepunch and returns, letting the faulting
 * instruction retry.  Any other SIGBUS aborts.
 *
 * NOTE(review): relies on globals fd / p / pagesize / work_alloc_start being
 * set up by verify() before WORK memory is touched.
 */
void sigbus_handler(int sig, siginfo_t *si, void *_uc)
{
    int save_errno = errno;     /* preserved across handler - restored below */
    int e;

    //  si_code    BUS_ADRALN BUS_ADDRERR BUS_OBJERR BUS_MCEERR_*
    //  si_trapno  - not supported on x86_64
    //  si_addr_lsb - not set except for BUS_MCERR_* (memory hw failure)

    /* handle only "nonexistent physical address" - this way filesystems report
     * that there is no more space in backing store */
    if (si->si_code != BUS_ADRERR)
        goto dont_handle;

    /* in general we should look for si->si_addr to determine which mapping and
     * in turn fs/file tells us, but here we know it already */
    assert( ((void *)p <= si->si_addr)  &&  (si->si_addr < (void *)(p + WORK)) );

    /* deallocate some batch of touched pages, starting from older */
    fprintf(stderr, "\tfreeing %i pages @ P%lx (%lx)...\n",
            RECLAIM_BATCH, work_alloc_start / pagesize, work_alloc_start);
    e = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  work_alloc_start, RECLAIM_BATCH * pagesize);
    if (e)
        err(errno, "holepunch");
    work_alloc_start += RECLAIM_BATCH * pagesize;

    errno = save_errno;
    return;

dont_handle:
    fprintf(stderr, "SIGBUS si_code: %x (%s)\n", si->si_code, buscode(si->si_code));
    fprintf(stderr, "? unexpected sigbus - abort\n");
    abort();
}
/* Verify holepunch-based memory release on the filesystem mounted at mntpt.
 *
 * Creates a sparse FILEMAX-sized file, maps VIRTMAX of it, touches WORK bytes
 * page-by-page (SIGBUS handler reclaims old pages when space runs out), then
 * punches the whole worked range out and checks the content was forgotten.
 *
 * NOTE(review): communicates with sigbus_handler() through the globals
 * fd / p / pagesize / work_alloc_start, reset here on each call.
 */
void verify(const char *mntpt, size_t pgsize)
{
    char filename[128];
    size_t i;
    int e;

    /* (re)initialize handler-shared state */
    pagesize = pgsize;
    work_alloc_start = 0;
    fd = -1;
    p = NULL;

    fprintf(stderr, "\nVerifying %s pagesize: %lu ...\n", mntpt, pagesize);

    snprintf(filename, sizeof(filename), "%s/t", mntpt);
    fd = open(filename, O_RDWR | O_CREAT | O_EXCL, 0644);
    if (fd==-1)
        err(errno, "open");

    /* grow file to FILEMAX without allocating blocks (sparse) */
    e = ftruncate(fd, FILEMAX);
    if (e)
        err(errno, "ftruncate");

    p = mmap(NULL, VIRTMAX, PROT_READ | PROT_WRITE,
             MAP_SHARED, fd, 0);
    if (p == MAP_FAILED)
        err(errno, "mmap");

    /* touch WORK */
    fprintf(stderr, "allocating...\n");
    for (i=0; i<WORK; i += pagesize) {
        p[i] = 1;
        //if (i%50 == 0)
        //    usleep(1);
    }
    fprintf(stderr, "\t... done\n");

    //assert(p[0] == 1);    /* first pages may be already freed */
    assert(p[work_alloc_start] == 1);

    /* hole punch touched memory */
    fprintf(stderr, "deallocating...\n");
    for (i=0; i<WORK; i += pagesize) {
        e = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      i, pagesize);
        if (e)
            err(errno, "fallocate");
        //if (i%50 == 0)
        //    usleep(1);
    }
    fprintf(stderr, "\t... done\n");

    assert(p[0] == 0);  /* changes must be forgotten */
    assert(p[work_alloc_start] == 0);

    e = munmap(p, VIRTMAX);
    if (e)
        err(errno, "munmap");
    e = close(fd);
    if (e)
        err(errno, "close");

    fprintf(stderr, "OK\n");
}
int main()
{
int e;
/* prepare to catch SIGBUS */
struct sigaction sa;
sa.sa_sigaction = sigbus_handler;
sa.sa_flags = SA_SIGINFO;
e = sigemptyset(&sa.sa_mask);
if (e)
err(errno, "sigemptyset");
e = sigaction(SIGBUS, &sa, NULL);
if (e)
err(errno, "sigaction");
verify("/dev/shm", PAGE_SIZE);
/* does not work as of 3.17-rc3. Problems with hugetlbfs:
*
* 1) does not support sparse files - it is not possible to ftruncate a
* file bigger than nr_hugepages;
*
* 2) does not support fallocate.
*/
verify("/dev/hugepages", 2*MB); // XXX HPAGE_SIZE hardcoded
return 0;
}
#!/bin/sh -e
# t_with-tmpfs mountpoint size cmd args...
# run `cmd args...` with fresh tmpfs mounted at mountpoint (in a user_ns)
# print error message to stderr and exit with failure
die() {
	echo "$@" >&2
	exit 1
}

# need at least: mountpoint, size, and a command to run
test "$#" -gt 2 || die "Usage: t_with-tmpfs mountpoint size cmd args..."

# re-exec ourselves in a user_ns where we are root and can mount filesystems
#
# NOTE on Debian the kernel is built with unofficial patch to disable user
# namespaces by default. To enable:
#	sysctl kernel.unprivileged_userns_clone=1
test -z "$UNSHARED" && UNSHARED=y exec unshare -Umr "$0" "$@"

mntpt="$1"
size="$2"
shift 2

# mount fresh size-limited tmpfs at mntpt, then hand control to cmd
mount -t tmpfs tmpfs.t "$mntpt" -osize="$size"

unset UNSHARED  # in case cmd wants to spawn subsub namespace
exec "$@"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment