Commit aec71382 authored by Chao Yu, committed by Jaegeuk Kim

f2fs: refactor flush_nat_entries codes for reducing NAT writes

Although building the NAT journal in cursum reduces the read/write work for NAT
blocks, the previous design leaves us with lower performance when checkpoints
are written frequently, in these cases:
1. if the journal in cursum is already full, it's a bit of a waste that we flush
   all nat entries to pages for persistence without caching any entries.
2. if the journal in cursum is not full, we fill nat entries into the journal
   until it is full, then flush the remaining dirty entries to disk without
   merging the journaled entries, so these journaled entries may be flushed to
   disk at the next checkpoint, having lost the chance to be flushed last time.

In this patch we merge dirty entries located in the same NAT block into a nat
entry set, and link all sets into a list sorted in ascending order by each set's
entry count. Later we flush entries in sparse sets into the journal, as many as
we can, and then flush the merged entries to disk. In this way we not only gain
performance, but also save the lifetime of the flash device.

In my testing environment, this patch clearly helps to reduce NAT block
writes. In the hard disk test case, the time cost of fsstress is consistently
reduced by about 5%.

1. virtual machine + hard disk:
fsstress -p 20 -n 200 -l 5
		node num	cp count	nodes/cp
based		4599.6		1803.0		2.551
patched		2714.6		1829.6		1.483

2. virtual machine + 32g micro SD card:
fsstress -p 20 -n 200 -l 1 -w -f chown=0 -f creat=4 -f dwrite=0
-f fdatasync=4 -f fsync=4 -f link=0 -f mkdir=4 -f mknod=4 -f rename=5
-f rmdir=5 -f symlink=0 -f truncate=4 -f unlink=5 -f write=0 -S

		node num	cp count	nodes/cp
based		84.5		43.7		1.933
patched		49.2		40.0		1.23

The latency of the merge operation is not bad even when handling an extreme
case such as merging a great number of dirty nats:
latency(ns)	dirty nat count
3089219		24922
5129423		27422
4000250		24523

change log from v1:
 o fix wrong logic in add_nat_entry when grabbing a new nat entry set.
 o switch to creating the slab cache in create_node_manager_caches.
 o use GFP_ATOMIC instead of GFP_NOFS to avoid potential long latency.

change log from v2:
 o make the comment position more appropriate, as suggested by Jaegeuk Kim.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
parent a014e037
......@@ -256,6 +256,8 @@ struct f2fs_nm_info {
unsigned int nat_cnt; /* the # of cached nat entries */
struct list_head nat_entries; /* cached nat entry list (clean) */
struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
struct list_head nat_entry_set; /* nat entry set list */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
/* free node ids management */
struct radix_tree_root free_nid_root;/* root of the free_nid cache */
......
......@@ -25,6 +25,7 @@
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
static struct kmem_cache *nat_entry_set_slab;
bool available_free_memory(struct f2fs_sb_info *sbi, int type)
{
......@@ -90,12 +91,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
/* get current nat block page with lock */
src_page = get_meta_page(sbi, src_off);
/* Dirty src_page means that it is already the new target NAT page. */
if (PageDirty(src_page))
return src_page;
dst_page = grab_meta_page(sbi, dst_off);
f2fs_bug_on(PageDirty(src_page));
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
......@@ -1744,7 +1741,90 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
return err;
}
static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
/*
 * Allocate a nat entry set from nat_entry_set_slab and hand it back empty:
 * entry count zeroed and both list heads initialised. start_nid is not set
 * here; the caller is expected to fill it in.
 *
 * NOTE(review): the allocation result is used unchecked - presumably
 * f2fs_kmem_cache_alloc() never returns NULL; confirm against its definition.
 */
static struct nat_entry_set *grab_nat_entry_set(void)
{
	struct nat_entry_set *set;

	set = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);

	INIT_LIST_HEAD(&set->entry_list);
	INIT_LIST_HEAD(&set->set_list);
	set->entry_cnt = 0;

	return set;
}
/*
 * Tear down a nat entry set whose entries have all been taken off its
 * entry_list: unlink the set from the set list, subtract its entry_cnt from
 * nm_i->dirty_nat_cnt and return the memory to nat_entry_set_slab.
 *
 * @nes:  set to release; its entry_list must already be empty
 * @nm_i: node manager info whose dirty_nat_cnt accounts for this set
 */
static void release_nat_entry_set(struct nat_entry_set *nes,
						struct f2fs_nm_info *nm_i)
{
	struct list_head *entries = &nes->entry_list;

	/* releasing a set that still holds entries is a bug */
	f2fs_bug_on(!list_empty(entries));

	list_del(&nes->set_list);
	nm_i->dirty_nat_cnt -= nes->entry_cnt;

	kmem_cache_free(nat_entry_set_slab, nes);
}
static void adjust_nat_entry_set(struct nat_entry_set *nes,
struct list_head *head)
{
struct nat_entry_set *next = nes;
if (list_is_last(&nes->set_list, head))
return;
list_for_each_entry_continue(next, head, set_list)
if (nes->entry_cnt <= next->entry_cnt)
break;
list_move_tail(&nes->set_list, &next->set_list);
}
/*
 * Move nat entry @ne from whatever list it is on into the nat entry set
 * that covers its NAT block, creating that set if it does not exist yet.
 *
 * @ne:   nat entry to file into a set
 * @head: list of nat entry sets, kept in ascending order of entry_cnt
 */
static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
{
	nid_t block_start = START_NID(ne->ni.nid);
	struct nat_entry_set *set = NULL;
	struct nat_entry_set *pos;

	/* look for an existing set keyed by the same start nid */
	list_for_each_entry(pos, head, set_list) {
		if (pos->start_nid == block_start) {
			set = pos;
			break;
		}
	}

	if (set) {
		list_move_tail(&ne->list, &set->entry_list);
		set->entry_cnt++;
		/* the set grew, so it may have to move further down the list */
		adjust_nat_entry_set(set, head);
		return;
	}

	/* first entry of this NAT block: a one-entry set goes to the front */
	set = grab_nat_entry_set();
	set->start_nid = block_start;
	list_add(&set->set_list, head);

	list_move_tail(&ne->list, &set->entry_list);
	set->entry_cnt++;
}
/*
 * Move the entries of nm_i->dirty_nat_entries into per-NAT-block sets on
 * nm_i->nat_entry_set, counting each moved entry in nm_i->dirty_nat_cnt.
 * Entries whose block address is NEW_ADDR are left on the dirty list.
 * The whole walk runs under nat_tree_lock held for writing.
 */
static void merge_nats_in_set(struct f2fs_sb_info *sbi)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct nat_entry *cur, *next;

	write_lock(&nm_i->nat_tree_lock);

	/* _safe variant: add_nat_entry() unlinks cur from the dirty list */
	list_for_each_entry_safe(cur, next, &nm_i->dirty_nat_entries, list) {
		if (nat_get_blkaddr(cur) != NEW_ADDR) {
			add_nat_entry(cur, &nm_i->nat_entry_set);
			nm_i->dirty_nat_cnt++;
		}
	}

	write_unlock(&nm_i->nat_tree_lock);
}
/*
 * Report whether @size more nat entries fit into the journal of summary
 * block @sum, i.e. whether the current journal count plus @size stays
 * within NAT_JOURNAL_ENTRIES.
 */
static bool __has_cursum_space(struct f2fs_summary_block *sum, int size)
{
	return nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES;
}
static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
......@@ -1752,12 +1832,6 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
int i;
mutex_lock(&curseg->curseg_mutex);
if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
mutex_unlock(&curseg->curseg_mutex);
return false;
}
for (i = 0; i < nats_in_cursum(sum); i++) {
struct nat_entry *ne;
struct f2fs_nat_entry raw_ne;
......@@ -1767,23 +1841,21 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
retry:
write_lock(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
if (ne) {
__set_nat_cache_dirty(nm_i, ne);
write_unlock(&nm_i->nat_tree_lock);
continue;
}
if (ne)
goto found;
ne = grab_nat_entry(nm_i, nid);
if (!ne) {
write_unlock(&nm_i->nat_tree_lock);
goto retry;
}
node_info_from_raw_nat(&ne->ni, &raw_ne);
found:
__set_nat_cache_dirty(nm_i, ne);
write_unlock(&nm_i->nat_tree_lock);
}
update_nats_in_cursum(sum, -i);
mutex_unlock(&curseg->curseg_mutex);
return true;
}
/*
......@@ -1794,65 +1866,68 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
struct nat_entry *ne, *cur;
struct page *page = NULL;
struct f2fs_nat_block *nat_blk = NULL;
nid_t start_nid = 0, end_nid = 0;
bool flushed;
flushed = flush_nats_in_journal(sbi);
if (!flushed)
mutex_lock(&curseg->curseg_mutex);
/* 1) flush dirty nat caches */
list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) {
nid_t nid;
struct f2fs_nat_entry raw_ne;
int offset = -1;
struct nat_entry_set *nes, *tmp;
struct list_head *head = &nm_i->nat_entry_set;
bool to_journal = true;
if (nat_get_blkaddr(ne) == NEW_ADDR)
continue;
nid = nat_get_nid(ne);
/* merge nat entries of dirty list to nat entry set temporarily */
merge_nats_in_set(sbi);
if (flushed)
goto to_nat_page;
/*
* if there are no enough space in journal to store dirty nat
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) {
remove_nats_in_journal(sbi);
/* if there is room for nat enries in curseg->sumpage */
offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
if (offset >= 0) {
raw_ne = nat_in_journal(sum, offset);
goto flush_now;
}
to_nat_page:
if (!page || (start_nid > nid || nid > end_nid)) {
if (page) {
f2fs_put_page(page, 1);
page = NULL;
/*
* merge nat entries of dirty list to nat entry set temporarily
*/
merge_nats_in_set(sbi);
}
start_nid = START_NID(nid);
end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
if (!nm_i->dirty_nat_cnt)
return;
/*
* get nat block with dirty flag, increased reference
* count, mapped and lock
* there are two steps to flush nat entries:
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
list_for_each_entry_safe(nes, tmp, head, set_list) {
struct f2fs_nat_block *nat_blk;
struct nat_entry *ne, *cur;
struct page *page;
nid_t start_nid = nes->start_nid;
if (to_journal && !__has_cursum_space(sum, nes->entry_cnt))
to_journal = false;
if (to_journal) {
mutex_lock(&curseg->curseg_mutex);
} else {
page = get_next_nat_page(sbi, start_nid);
nat_blk = page_address(page);
f2fs_bug_on(!nat_blk);
}
f2fs_bug_on(!nat_blk);
raw_ne = nat_blk->entries[nid - start_nid];
flush_now:
raw_nat_from_node_info(&raw_ne, &ne->ni);
/* flush dirty nats in nat entry set */
list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
struct f2fs_nat_entry *raw_ne;
nid_t nid = nat_get_nid(ne);
int offset;
if (offset < 0) {
nat_blk->entries[nid - start_nid] = raw_ne;
} else {
nat_in_journal(sum, offset) = raw_ne;
if (to_journal) {
offset = lookup_journal_in_cursum(sum,
NAT_JOURNAL, nid, 1);
f2fs_bug_on(offset < 0);
raw_ne = &nat_in_journal(sum, offset);
nid_in_journal(sum, offset) = cpu_to_le32(nid);
} else {
raw_ne = &nat_blk->entries[nid - start_nid];
}
raw_nat_from_node_info(raw_ne, &ne->ni);
if (nat_get_blkaddr(ne) == NULL_ADDR &&
add_free_nid(sbi, nid, false) <= 0) {
......@@ -1865,9 +1940,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
write_unlock(&nm_i->nat_tree_lock);
}
}
if (!flushed)
if (to_journal)
mutex_unlock(&curseg->curseg_mutex);
else
f2fs_put_page(page, 1);
release_nat_entry_set(nes, nm_i);
}
f2fs_bug_on(!list_empty(head));
f2fs_bug_on(nm_i->dirty_nat_cnt);
}
static int init_node_manager(struct f2fs_sb_info *sbi)
......@@ -1896,6 +1979,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->nat_entries);
INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
INIT_LIST_HEAD(&nm_i->nat_entry_set);
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
......@@ -1976,19 +2060,30 @@ int __init create_node_manager_caches(void)
nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
sizeof(struct nat_entry));
if (!nat_entry_slab)
return -ENOMEM;
goto fail;
free_nid_slab = f2fs_kmem_cache_create("free_nid",
sizeof(struct free_nid));
if (!free_nid_slab) {
if (!free_nid_slab)
goto destory_nat_entry;
nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
sizeof(struct nat_entry_set));
if (!nat_entry_set_slab)
goto destory_free_nid;
return 0;
destory_free_nid:
kmem_cache_destroy(free_nid_slab);
destory_nat_entry:
kmem_cache_destroy(nat_entry_slab);
fail:
return -ENOMEM;
}
return 0;
}
/*
 * Destroy the three slab caches used by the node manager: nat_entry_set_slab,
 * free_nid_slab and nat_entry_slab, in that order.
 */
void destroy_node_manager_caches(void)
{
kmem_cache_destroy(nat_entry_set_slab);
kmem_cache_destroy(free_nid_slab);
kmem_cache_destroy(nat_entry_slab);
}
......@@ -89,6 +89,13 @@ enum mem_type {
DIRTY_DENTS /* indicates dirty dentry pages */
};
/*
 * A group of dirty nat entries that share the same start nid, i.e. the
 * value START_NID() yields for each member's nid. Sets are chained on a
 * list that is kept in ascending order of entry_cnt.
 */
struct nat_entry_set {
struct list_head set_list; /* node in the list of all nat entry sets */
struct list_head entry_list; /* head of this set's dirty nat entries */
nid_t start_nid; /* start nid shared by the nat entries in this set */
unsigned int entry_cnt; /* number of nat entries added to this set */
};
/*
* For free nid mangement
*/
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment