Commit 8c935a5f authored by Kirill Smelkov

bigfile: RAM subsystem

The RAM subsystem allows getting aliasable RAM from the OS kernel and managing it.
Currently we get memory from a tmpfs mount; hugetlbfs should also work, but
is left as TODO because hugetlbfs support in the kernel needs to be improved.

We need aliasing because we'll need to be able to memory-map the same page
into several places in the address space, e.g. for taking two overlapping
slices of the same array at different times.

Comes with test programs that show that aliasing does not work for anonymous
memory.
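
A minimal standalone sketch (not part of this commit, assuming a tmpfs is
mounted at /dev/shm) of what those tests demonstrate: two anonymous mappings
never alias each other, while two mappings of the same tmpfs-backed fd do:

    #include <assert.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 4096;

        /* anonymous memory: every mmap gives its own pages - no aliasing */
        char *a1 = mmap(NULL, len, PROT_READ|PROT_WRITE,
                        MAP_SHARED|MAP_ANONYMOUS, -1, 0);
        char *a2 = mmap(NULL, len, PROT_READ|PROT_WRITE,
                        MAP_SHARED|MAP_ANONYMOUS, -1, 0);
        a1[0] = 1;
        assert(a2[0] == 0);    /* write to a1 is not visible through a2 */

        /* fd-backed memory (here a file on tmpfs): the same offset can be
         * mapped several times and all mappings alias the same page */
        char path[] = "/dev/shm/alias.XXXXXX";
        int fd = mkstemp(path);
        unlink(path);
        ftruncate(fd, len);
        char *f1 = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
        char *f2 = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
        f1[0] = 1;
        assert(f2[0] == 1);    /* aliased - the write is visible in both */
        return 0;
    }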
parent 77d61533
/* Wendelin.bigfile | Interfaces to work with RAM
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*
*
* TODO write why this is needed (shmfs, hugetlbfs, ...)
*/
#include <wendelin/bigfile/ram.h>
#include <wendelin/bigfile/virtmem.h>
#include <wendelin/utils.h>
#include <wendelin/bug.h>
/* ramh_ops */
Page *ramh_alloc_page(RAMH *ramh, pgoff_t pgoffset_hint)
{
struct Page *page;
pgoff_t ramh_pgoffset;
// XXX ok to malloc, or better do more structured allocations?
page = zalloc(sizeof(*page));
if (!page)
return NULL;
ramh_pgoffset = ramh->ramh_ops->alloc_page(ramh, pgoffset_hint);
if (ramh_pgoffset == RAMH_PGOFF_ALLOCFAIL) {
free(page);
return NULL;
}
page->state = PAGE_EMPTY;
/* ->file & ->f_pgoffset are left unset */
page->ramh = ramh;
page->ramh_pgoffset = ramh_pgoffset;
INIT_LIST_HEAD(&page->lru); /* NOTE ->lru left unlinked */
page->refcnt = 0;
return page;
}
void ramh_drop_memory(RAMH *ramh, pgoff_t ramh_pgoffset)
{
return ramh->ramh_ops->drop_memory(ramh, ramh_pgoffset);
}
void ramh_close(RAMH *ramh)
{
ramh->ramh_ops->close(ramh);
}
/* ram_ops */
size_t ram_get_current_maxsize(RAM *ram)
{
return ram->ram_ops->get_current_maxsize(ram);
}
RAMH *ramh_open(RAM *ram)
{
return ram->ram_ops->ramh_open(ram);
}
void ram_close(RAM *ram)
{
WARN("TODO ram_close()"); // XXX
//return ram->ram_ops->close(ram);
}
/* RAM types */
static LIST_HEAD(ram_types);
struct ram_type_entry {
const struct ram_type *ram_type;
RAM *default_ram;
struct list_head list;
};
void ram_register_type(const struct ram_type *ram_type)
{
struct ram_type_entry *rte = xzalloc(sizeof(*rte));
rte->ram_type = ram_type;
list_add_tail(&rte->list, &ram_types);
}
static const char ram_type_default[] = "shmfs"; /* default type for !ram_type */
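/* Illustrative use of the registry below (all names are from this commit; the
* arg for "shmfs" is an optional mount prefix, see ram_shmfs.c):
*
*      RAM *ram = ram_new("shmfs", NULL);      // explicitly-chosen type
*      RAM *def = ram_get_default(NULL);       // shared RAM of default type
*/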
RAM *ram_new(const char *ram_type, const char *arg)
{
struct list_head *h;
if (!ram_type)
ram_type = ram_type_default;
list_for_each(h, &ram_types) {
struct ram_type_entry *rte = list_entry(h, typeof(*rte), list);
const struct ram_type *rt = rte->ram_type;
if (!strcmp(rt->name, ram_type))
return rt->ram_new(arg);
}
return NULL;
}
RAM *ram_get_default(const char *ram_type)
{
struct list_head *h;
if (!ram_type)
ram_type = ram_type_default;
list_for_each(h, &ram_types) {
struct ram_type_entry *rte = list_entry(h, typeof(*rte), list);
const struct ram_type *rt = rte->ram_type;
if (strcmp(rt->name, ram_type))
continue;
if (!rte->default_ram)
rte->default_ram = rt->ram_new(NULL);
BUG_ON(!rte->default_ram);
return rte->default_ram;
}
BUG();
}
/* Wendelin.bigfile | hugetlbfs ram backend
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*/
// TODO /dev/hugepages & hugetlbfs (see t/shm-punch-hole.c for notes on hugetlbfs)
/* Wendelin.bigfile | shmfs (aka tmpfs) ram backend
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*
* ~~~~~~~~
*
* TODO description
*/
#include <wendelin/bigfile/ram.h>
#include <wendelin/utils.h>
#include <wendelin/bug.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/vfs.h>
#include <sys/mman.h>
#include <sys/types.h>
/* we'll manage RAM in "pages" of 2M
*
* Compared to std 4K pages, this will reduce per-page overhead and also
* coincides with the huge page size on x86/x86_64.
*
* Hardware pages will still be of the usual 4K size - we'll just manage them
* in 512-page groups.
*/
#define SHMFS_PAGE_SIZE (2*1024*1024ULL)
/* default prefix & ramh files template */
static const char shmfs_ram_prefix_default[] = "/dev/shm";
static const char shmfs_ramh_template[] = "ramh.XXXXXX";
/* RAM on shmfs */
struct SHMFS_RAM {
RAM;
const char *prefix; /* prefix where to create ramh files */
};
typedef struct SHMFS_RAM SHMFS_RAM;
/* RAM Handle on shmfs */
struct SHMFS_RAMH {
RAMH;
int ramh_fd;
size_t ramh_fpgsize; /* current file size in pagesize units */
};
typedef struct SHMFS_RAMH SHMFS_RAMH;
static void *shmfs_mmap_page(RAMH *ramh0, pgoff_t ramh_pgoffset, void *addr, int prot)
{
SHMFS_RAMH *ramh = upcast(SHMFS_RAMH *, ramh0);
size_t pagesize = ramh->ram->pagesize;
// XXX MAP_POPULATE so that we can access mmaped memory without additional pagefault?
// tried -> this mmap becomes slow, and overall the whole run is slower. XXX why?
addr = mmap(addr, pagesize,
prot,
MAP_SHARED
| (addr ? MAP_FIXED : 0),
ramh->ramh_fd,
ramh_pgoffset * pagesize);
if (addr == MAP_FAILED)
addr = NULL;
return addr;
}
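/* allocate a page at pgoffset_hint by reserving filesystem space for it with
* fallocate(), growing the backing file if needed */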
static pgoff_t shmfs_alloc_page(RAMH *ramh0, pgoff_t pgoffset_hint)
{
// FIXME double calls with same pgoffset_hint ? (or move ->pagemap to ramh ?)
SHMFS_RAMH *ramh = upcast(SHMFS_RAMH *, ramh0);
pgoff_t ramh_pgoffset = pgoffset_hint;
size_t pagesize = ramh->ram->pagesize;
int err;
/*
* - allocate space for page at ramh_pgoffset,
* - hole-grow file to size covering that page, if file was smaller,
*
* all in one go.
*
* We allocate filesystem space so that we know we really allocated that
* memory now and that client code will not get SIGBUS on memory read/write
* or EFAULT on read/write syscalls when accessing it later.
*
* It is easier to handle ENOMEM synchronously.
*/
err = fallocate(ramh->ramh_fd, 0 /* without KEEP_SIZE */,
ramh_pgoffset * pagesize, pagesize);
if (err)
return RAMH_PGOFF_ALLOCFAIL;
if (ramh_pgoffset >= ramh->ramh_fpgsize)
ramh->ramh_fpgsize = ramh_pgoffset+1;
return ramh_pgoffset;
}
static void shmfs_drop_memory(RAMH *ramh0, pgoff_t ramh_pgoffset)
{
SHMFS_RAMH *ramh = upcast(SHMFS_RAMH *, ramh0);
size_t pagesize = ramh->ram->pagesize;
BUG_ON(ramh_pgoffset >= ramh->ramh_fpgsize);
// XXX state -> empty ?
/* punch a hole and this way release the memory to the OS.
* this should not fail - if it does, something is wrong */
xfallocate(ramh->ramh_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
ramh_pgoffset * pagesize, pagesize);
}
static void shmfs_close(RAMH *ramh0)
{
SHMFS_RAMH *ramh = upcast(SHMFS_RAMH *, ramh0);
// XXX verify no mapping left?
/* drop all memory & close */
xftruncate(ramh->ramh_fd, 0);
xclose(ramh->ramh_fd);
ramh->ramh_fd = -1;
ramh->ramh_fpgsize = 0;
// TODO free(self) ?
}
static const struct ramh_ops shmfs_ramh_ops = {
.alloc_page = shmfs_alloc_page,
.mmap_page = shmfs_mmap_page,
.drop_memory = shmfs_drop_memory,
.close = shmfs_close,
};
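/* report the current whole size of the filesystem at ram->prefix, in ram pages */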
static size_t shmfs_get_current_maxsize(RAM *ram0)
{
SHMFS_RAM *ram = upcast(SHMFS_RAM *, ram0);
struct statfs sf;
int err;
// XXX races with fs remount/change under prefix
err = statfs(ram->prefix, &sf);
if (err)
BUGe();
return sf.f_blocks * sf.f_bsize / ram->pagesize;
}
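/* open a new RAM handle: create its backing file with mkstemp() under
* ram->prefix; the file is normally unlinked right away (see
* WENDELIN_RAMH_HIDE handling below) */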
static RAMH *shmfs_ramh_open(RAM *ram0)
{
SHMFS_RAM *ram = upcast(SHMFS_RAM *, ram0);
SHMFS_RAMH *ramh;
char *s, *ramh_filename = NULL;
int err;
ramh = zalloc(sizeof(*ramh));
if (!ramh)
goto out;
ramh->ramh_ops = &shmfs_ramh_ops;
ramh->ram = ram;
ramh_filename = malloc(strlen(ram->prefix) + 1/*"/"*/ +
strlen(shmfs_ramh_template) + 1/*NUL*/);
if (!ramh_filename)
goto out;
s = ramh_filename;
s = stpcpy(s, ram->prefix);
s = stpcpy(s, "/");
s = stpcpy(s, shmfs_ramh_template);
ramh->ramh_fd = mkstemp(ramh_filename);
if (ramh->ramh_fd == -1)
goto out;
// XXX maybe by default show and unlink atexit / on close
/* unlink ramh file, unless asked to leave it visible for debugging */
s = getenv("WENDELIN_RAMH_HIDE");
if (!s || s[0] == 'y') {
err = unlink(ramh_filename);
if (err)
BUGe();
}
free(ramh_filename);
ramh->ramh_fpgsize = 0;
return ramh;
out:
free(ramh);
free(ramh_filename);
return NULL;
}
static const struct ram_ops shmfs_ram_ops = {
.get_current_maxsize = shmfs_get_current_maxsize,
.ramh_open = shmfs_ramh_open,
//.close = shmfs_ram_dtor
};
/* shmfs ram type */
static RAM *shmfs_ram_new(const char *arg)
{
SHMFS_RAM *ram = xzalloc(sizeof(*ram));
ram->ram_ops = &shmfs_ram_ops;
ram->pagesize = SHMFS_PAGE_SIZE;
INIT_LIST_HEAD(&ram->lru_list);
// TODO ensure prefix points to somewhere on shmfs
ram->prefix = xstrdup(arg ?: shmfs_ram_prefix_default);
return ram;
}
// TODO shmfs_ram_dtor
static const struct ram_type shmfs_ram_type = {
.name = "shmfs",
.ram_new = shmfs_ram_new,
};
__attribute__((constructor))
static void shmfs_init(void)
{
ram_register_type(&shmfs_ram_type);
}
/* Wendelin.bigfile | ram tests
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*/
// RUNWITH: t/t_with-tmpfs /dev/shm 7m
// XXX better link with it
#include "../ram.c"
#include "../virtmem.c"
#include "../ram_shmfs.c"
#include <ccan/tap/tap.h>
int main()
{
RAM *ram;
RAMH *ramh;
Page *page0, *page1, *page2, *page3;
uint8_t *p0, *p01, *p02, *p03, *p1, *p2, *_;
size_t ps, ram_maxsize;
tap_fail_callback = abort; // XXX to catch failure immediately
diag("Testing ram");
ram = ram_new("shmfs", NULL);
ok1(ram);
ps = ram->pagesize;
ok1(ps == 2*1024*1024); // to be sure it correlates with 7m (=3.5 pages) in setup
ramh = ramh_open(ram);
ok1(ramh);
page0 = ramh_alloc_page(ramh, 0);
ok1(page0);
ok1(page0->state == PAGE_EMPTY);
ok1(ram->pagesize == page_size(page0));
/* mmap page0 into 2 places somewhere */
p01 = page_mmap(page0, NULL, PROT_READ | PROT_WRITE);
ok1(p01);
p02 = page_mmap(page0, NULL, PROT_READ | PROT_WRITE);
ok1(p02);
ok1(p02 != p01);
ok1(p01[0] == 0); ok1(p01[ps-1] == 0);
ok1(p02[0] == 0); ok1(p02[ps-1] == 0);
/* mappings should be to the same memory */
p01[0] = 1; ok1(p02[0] == 1);
p02[ps-1] = 2; ok1(p01[ps-1] == 2);
/* mmap page0 to fixed addr and check memory is the same */
p03 = mem_xvalloc(NULL, ps); /* allocate virt address space somewhere */
ok1(p03);
_ = page_mmap(page0, p03, PROT_READ | PROT_WRITE);
ok1(_);
ok1(_ == p03);
ok1(p03[0] == 1);
ok1(p03[ps-1] == 2);
p03[0] = 4; ok1(p01[0] == 4); ok1(p02[0] == 4);
p01[ps-1] = 5; ok1(p02[ps-1] == 5); ok1(p03[ps-1] == 5);
/* memory is forgotten after drop */
ramh_drop_memory(ramh, page0->ramh_pgoffset);
ok1(p01[0] == 0); ok1(p02[0] == 0); ok1(p03[0] == 0);
ok1(p01[ps-1] == 0); ok1(p02[ps-1] == 0); ok1(p03[ps-1] == 0);
/* let's allocate memory with pgoffset > current ram_maxsize */
ram_maxsize = ram_get_current_maxsize(ram);
ok1(ram_maxsize);
page2 = ramh_alloc_page(ramh, 2*ram_maxsize);
ok1(page2);
/* see if we can access & drop it */
p1 = page_mmap(page2, NULL, PROT_READ | PROT_WRITE);
ok1(p1);
p1[0] = 1;
p1[ps-1] = 1;
ramh_drop_memory(ramh, page2->ramh_pgoffset);
ok1(p1[0] == 0);
ok1(p1[ps-1] == 0);
xmunmap(p1, ps);
xmunmap(p01, ps);
xmunmap(p02, ps);
xmunmap(p03, ps);
/* ensure we get "no memory" when overallocating (not doing so would lead
* to getting SIGBUS on accessing memory and EFAULT on read/write
* syscalls). */
ok1(ram_maxsize == 3); /* NOTE must correlate with size in RUNWITH setup */
page0 = ramh_alloc_page(ramh, 0);
ok1(page0);
page1 = ramh_alloc_page(ramh, 1);
ok1(page1);
page2 = ramh_alloc_page(ramh, 2);
ok1(page2);
page3 = ramh_alloc_page(ramh, 3);
ok1(!page3); /* must fail - there is no such amount of memory */
p0 = page_mmap(page0, NULL, PROT_READ | PROT_WRITE); ok1(p0);
p1 = page_mmap(page1, NULL, PROT_READ | PROT_WRITE); ok1(p1);
p2 = page_mmap(page2, NULL, PROT_READ | PROT_WRITE); ok1(p2);
/* touch all memory - so that we know we can use it without getting SIGBUS */
memset(p0, 0xff, ps);
memset(p1, 0xff, ps);
memset(p2, 0xff, ps);
// TODO allocate memory amount = 2*ram_maxsize and touch it linearly
ramh_close(ramh);
return 0;
}
#ifndef _WENDELIN_BIGFILE_RAM_H_
#define _WENDELIN_BIGFILE_RAM_H_
/* Wendelin.bigfile | Interfaces to work with RAM
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*
* ~~~~~~~~
* TODO write why this is needed (shmfs, hugetlbfs, ...)
*
*
* - need to track pages state (to support commit / abort)
*
* - need to "unload" non-dirty pages to free place for requested new data (reclaim)
*
* - need to be able to map a page into several places (to support
* overlapping-in-file mappings done not necessarily adjacent-in-time to
* each other - there is no guarantee mapping them adjacent in address space
* is possible)
*
* XXX
*/
#include <wendelin/list.h>
#include <wendelin/bigfile/types.h>
typedef struct RAMH RAMH;
typedef struct Page Page;
/* RAM - something that provides access to memory (ex. via shmfs, hugetlbfs).
*
* Can create pages from memory allocated from backend and give memory back to
* system on request.
*
* Pages allocation/deallocation is done through handles (see RAMH).
*/
struct RAM {
const struct ram_ops *ram_ops;
size_t pagesize;
struct list_head lru_list; /* RAM pages in usage order (_ -> page->lru) */
};
typedef struct RAM RAM;
/* RAM operations - implemented by RAM backend */
struct ram_ops {
size_t (*get_current_maxsize) (RAM *ram);
RAMH * (*ramh_open) (RAM *ram);
void (*close) (RAM *ram);
};
/* get RAM current max size (in pages)
*
* The maximum size is the RAM's current whole size, which is shared by RAM
* Handles and other users independent from us (possibly from other processes).
*
* So the amount of RAM allocated for all RAM Handles a) cannot be bigger than
* this, and b) there is no guarantee that this maximum can be reached via
* allocating for RAMH only.
*
* Maximum is "current" because it can change dynamically - ex. via RAM
* hotplug.
*/
size_t ram_get_current_maxsize(RAM *ram);
/* open RAM handle
*
* Open a new handle for memory inside RAM. Close the handle with ramh_close()
*/
RAMH *ramh_open(RAM *ram);
/* close RAM
*
* TODO text
*/
void ram_close(RAM *ram);
/* RAM Handle - handle to allocate/free pages from/to RAM
*
* An additional level on top of RAM which allows grouping page allocations.
*
* RAM backends are assumed to be organized so that, for a RAM handle, all pages
* allocated via that handle are provided by a single file in the OS kernel.
*
* With such an organization, if 2 pages are allocated with adjacent pgoffsets
* and mapped adjacent to each other in address space, there will be only 1
* in-os-kernel VMA representing the mapping of those 2 pages.
*
* ( #os-vma should be kept to a minimum, because on every pagefault the OS
* kernel needs to look up faulting_addr -> os_vma )
*/
struct RAMH {
const struct ramh_ops *ramh_ops;
RAM *ram;
};
typedef struct RAMH RAMH;
struct ramh_ops {
#define RAMH_PGOFF_ALLOCFAIL ((pgoff_t)-1ULL)
/* @return: allocated ram_pgoffset | RAMH_PGOFF_ALLOCFAIL */
pgoff_t (*alloc_page) (RAMH *ramh, pgoff_t pgoffset_hint);
void * (*mmap_page) (RAMH *ramh, pgoff_t ramh_pgoffset, void *addr, int prot);
void (*drop_memory) (RAMH *ramh, pgoff_t ramh_pgoffset);
void (*close) (RAMH *ramh);
};
/* allocate new page for ramh memory
*
* @pgoffset_hint hint at which offset to allocate memory -
*
* - could be used so that f_offsets coincide with ramh_offsets
* and, as a result, allocated areas consist of contiguous
* ramh memory = only 1 kernel VMA for the whole area.
*
* @return new page | NULL
*
* XXX write on how to free pages (= drop & free(struct Page) ?)
*
* NOTE after allocation, page->fileh & page->f_pgoffset are unset
*/
Page *ramh_alloc_page(RAMH *ramh, pgoff_t pgoffset_hint);
/* release ramh memory-page at ramh_pgoffset to OS
*
* After this call the previous content of the memory-page is lost and the
* memory is released to the OS.
*
* The memory is still accessible for mmapping but will read as all zeros - on
* first access the OS will again allocate memory for it from scratch.
*/
void ramh_drop_memory(RAMH *ramh, pgoff_t ramh_pgoffset);
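/* Illustrative lifecycle of a ramh page via this API plus page_mmap(), as
* exercised in t/tram.c:
*
*      Page *page = ramh_alloc_page(ramh, 0);
*      void *addr = page_mmap(page, NULL, PROT_READ | PROT_WRITE);
*      ...                                         // use the memory
*      ramh_drop_memory(ramh, page->ramh_pgoffset);
*/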
/* close RAMH handle
*
* NOTE it is an error to call close() with mappings from ramh left