Commit 786bb024, authored by Pankaj Raghav, committed by Jens Axboe

brd: use XArray instead of radix-tree to index backing pages

XArray was introduced to hold large arrays of pointers with a simple API.
XArray API also provides array semantics which simplifies the way we store
and access the backing pages, and the code becomes significantly easier
to understand.

No performance difference was noticed between the two implementations
using fio with direct=1 [1].

[1] Performance in KIOPS:

          |  radix-tree |    XArray  |   Diff
          |             |            |
write     |    315      |     313    |   -0.6%
randwrite |    286      |     290    |   +1.3%
read      |    330      |     335    |   +1.5%
randread  |    309      |     312    |   +0.9%
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20230511121544.111648-1-p.raghav@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent f1fcbaa1
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/radix-tree.h> #include <linux/xarray.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
/* /*
* Each block ramdisk device has a radix_tree brd_pages of pages that stores * Each block ramdisk device has a xarray brd_pages of pages that stores
* the pages containing the block device's contents. A brd page's ->index is * the pages containing the block device's contents. A brd page's ->index is
* its offset in PAGE_SIZE units. This is similar to, but in no way connected * its offset in PAGE_SIZE units. This is similar to, but in no way connected
* with, the kernel's pagecache or buffer cache (which sit above our block * with, the kernel's pagecache or buffer cache (which sit above our block
...@@ -40,11 +40,9 @@ struct brd_device { ...@@ -40,11 +40,9 @@ struct brd_device {
struct list_head brd_list; struct list_head brd_list;
/* /*
* Backing store of pages and lock to protect it. This is the contents * Backing store of pages. This is the contents of the block device.
* of the block device.
*/ */
spinlock_t brd_lock; struct xarray brd_pages;
struct radix_tree_root brd_pages;
u64 brd_nr_pages; u64 brd_nr_pages;
}; };
...@@ -56,21 +54,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) ...@@ -56,21 +54,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
pgoff_t idx; pgoff_t idx;
struct page *page; struct page *page;
/*
* The page lifetime is protected by the fact that we have opened the
* device node -- brd pages will never be deleted under us, so we
* don't need any further locking or refcounting.
*
* This is strictly true for the radix-tree nodes as well (ie. we
* don't actually need the rcu_read_lock()), however that is not a
* documented feature of the radix-tree API so it is better to be
* safe here (we don't have total exclusion from radix tree updates
* here, only deletes).
*/
rcu_read_lock();
idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
page = radix_tree_lookup(&brd->brd_pages, idx); page = xa_load(&brd->brd_pages, idx);
rcu_read_unlock();
BUG_ON(page && page->index != idx); BUG_ON(page && page->index != idx);
...@@ -83,7 +68,7 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) ...@@ -83,7 +68,7 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
{ {
pgoff_t idx; pgoff_t idx;
struct page *page; struct page *page, *cur;
int ret = 0; int ret = 0;
page = brd_lookup_page(brd, sector); page = brd_lookup_page(brd, sector);
...@@ -94,71 +79,42 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) ...@@ -94,71 +79,42 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
if (!page) if (!page)
return -ENOMEM; return -ENOMEM;
if (radix_tree_maybe_preload(gfp)) { xa_lock(&brd->brd_pages);
__free_page(page);
return -ENOMEM;
}
spin_lock(&brd->brd_lock);
idx = sector >> PAGE_SECTORS_SHIFT; idx = sector >> PAGE_SECTORS_SHIFT;
page->index = idx; page->index = idx;
if (radix_tree_insert(&brd->brd_pages, idx, page)) {
cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp);
if (unlikely(cur)) {
__free_page(page); __free_page(page);
page = radix_tree_lookup(&brd->brd_pages, idx); ret = xa_err(cur);
if (!page) if (!ret && (cur->index != idx))
ret = -ENOMEM;
else if (page->index != idx)
ret = -EIO; ret = -EIO;
} else { } else {
brd->brd_nr_pages++; brd->brd_nr_pages++;
} }
spin_unlock(&brd->brd_lock);
radix_tree_preload_end(); xa_unlock(&brd->brd_pages);
return ret; return ret;
} }
/* /*
* Free all backing store pages and radix tree. This must only be called when * Free all backing store pages and xarray. This must only be called when
* there are no other users of the device. * there are no other users of the device.
*/ */
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd) static void brd_free_pages(struct brd_device *brd)
{ {
unsigned long pos = 0; struct page *page;
struct page *pages[FREE_BATCH]; pgoff_t idx;
int nr_pages;
do {
int i;
nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
(void **)pages, pos, FREE_BATCH);
for (i = 0; i < nr_pages; i++) {
void *ret;
BUG_ON(pages[i]->index < pos); xa_for_each(&brd->brd_pages, idx, page) {
pos = pages[i]->index; __free_page(page);
ret = radix_tree_delete(&brd->brd_pages, pos); cond_resched_rcu();
BUG_ON(!ret || ret != pages[i]);
__free_page(pages[i]);
} }
pos++; xa_destroy(&brd->brd_pages);
/*
* It takes 3.4 seconds to remove 80GiB ramdisk.
* So, we need cond_resched to avoid stalling the CPU.
*/
cond_resched();
/*
* This assumes radix_tree_gang_lookup always returns as
* many pages as possible. If the radix-tree code changes,
* so will this have to.
*/
} while (nr_pages == FREE_BATCH);
} }
/* /*
...@@ -372,8 +328,7 @@ static int brd_alloc(int i) ...@@ -372,8 +328,7 @@ static int brd_alloc(int i)
brd->brd_number = i; brd->brd_number = i;
list_add_tail(&brd->brd_list, &brd_devices); list_add_tail(&brd->brd_list, &brd_devices);
spin_lock_init(&brd->brd_lock); xa_init(&brd->brd_pages);
INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
snprintf(buf, DISK_NAME_LEN, "ram%d", i); snprintf(buf, DISK_NAME_LEN, "ram%d", i);
if (!IS_ERR_OR_NULL(brd_debugfs_dir)) if (!IS_ERR_OR_NULL(brd_debugfs_dir))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment