Commit 67324ea1 authored by Mike Snitzer

dm thin: sort the per thin deferred bios using an rb_tree

A thin-pool will allocate blocks using FIFO order for all thin devices
which share the thin-pool.  Because of this simplistic allocation, the
thin-pool's space can become fragmented quite easily, especially when
multiple threads are requesting blocks in parallel.

Sort each thin device's deferred_bio_list based on logical sector to
help reduce fragmentation of the thin-pool's ondisk layout.

The following tables illustrate the realized gains/potential offered by
sorting each thin device's deferred_bio_list.  An "io size"-sized random
read of the device would result in "seeks/io" fragments being read, with
an average "distance/seek" between each fragment.

Data was written to a single thin device using multiple threads via
iozone (8 threads, 64K for both the block_size and io_size).

unsorted:

     io size   seeks/io distance/seek
  --------------------------------------
          4k    0.000   0b
         16k    0.013   11m
         64k    0.065   11m
        256k    0.274   10m
          1m    1.109   10m
          4m    4.411   10m
         16m    17.097  11m
         64m    60.055  13m
        256m    148.798 25m
          1g    809.929 21m

sorted:

     io size   seeks/io distance/seek
  --------------------------------------
          4k    0.000   0b
         16k    0.000   1g
         64k    0.001   1g
        256k    0.003   1g
          1m    0.011   1g
          4m    0.045   1g
         16m    0.181   1g
         64m    0.747   1011m
        256m    3.299   1g
          1g    14.373  1g
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
parent c140e1c4
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <linux/init.h> #include <linux/init.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/rbtree.h>
#define DM_MSG_PREFIX "thin" #define DM_MSG_PREFIX "thin"
...@@ -230,6 +231,7 @@ struct thin_c { ...@@ -230,6 +231,7 @@ struct thin_c {
spinlock_t lock; spinlock_t lock;
struct bio_list deferred_bio_list; struct bio_list deferred_bio_list;
struct bio_list retry_on_resume_list; struct bio_list retry_on_resume_list;
struct rb_root sort_bio_list; /* sorted list of deferred bios */
}; };
/*----------------------------------------------------------------*/ /*----------------------------------------------------------------*/
...@@ -371,6 +373,7 @@ struct dm_thin_endio_hook { ...@@ -371,6 +373,7 @@ struct dm_thin_endio_hook {
struct dm_deferred_entry *shared_read_entry; struct dm_deferred_entry *shared_read_entry;
struct dm_deferred_entry *all_io_entry; struct dm_deferred_entry *all_io_entry;
struct dm_thin_new_mapping *overwrite_mapping; struct dm_thin_new_mapping *overwrite_mapping;
struct rb_node rb_node;
}; };
static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
...@@ -1367,12 +1370,77 @@ static int need_commit_due_to_time(struct pool *pool) ...@@ -1367,12 +1370,77 @@ static int need_commit_due_to_time(struct pool *pool)
jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
} }
/*
 * thin_pbd(): given a pointer to the rb_node member embedded in a
 * struct dm_thin_endio_hook, recover the enclosing hook via rb_entry().
 */
#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
/*
 * thin_bio(): pass a hook pointer and the hook's size to
 * dm_bio_from_per_bio_data(); callers in this file use the result as the
 * struct bio * the hook belongs to.
 * NOTE(review): presumably the inverse of dm_per_bio_data() called with the
 * same size -- confirm against the device-mapper core.
 */
#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
/*
 * Insert @bio into tc->sort_bio_list, keyed by the bio's starting sector
 * (bio->bi_iter.bi_sector).
 *
 * The rb_node used for linkage is the one embedded in the bio's
 * struct dm_thin_endio_hook per-bio data.  A bio whose sector is not
 * strictly lower than the node being compared descends to the right, so
 * equal keys end up to the right of those already present.
 */
static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
{
	struct rb_node **link = &tc->sort_bio_list.rb_node;
	struct rb_node *above = NULL;
	struct dm_thin_endio_hook *hook;
	sector_t sector = bio->bi_iter.bi_sector;

	/* Walk down to the empty slot where the new node belongs. */
	while (*link) {
		above = *link;
		hook = thin_pbd(above);

		link = (sector < thin_bio(hook)->bi_iter.bi_sector) ?
			&above->rb_left : &above->rb_right;
	}

	/* Link the bio's own hook into that slot and rebalance. */
	hook = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	rb_link_node(&hook->rb_node, above, link);
	rb_insert_color(&hook->rb_node, &tc->sort_bio_list);
}
/*
 * Drain tc->sort_bio_list in ascending rb-tree order: each bio is appended
 * to tc->deferred_bio_list and its node is erased from the tree.
 *
 * The successor is looked up *before* the current node is erased, so the
 * walk never follows the parent/child links of a node that is no longer
 * part of the tree.  Warns if the tree is not empty once the walk is done.
 */
static void __extract_sorted_bios(struct thin_c *tc)
{
	struct rb_node *node, *next;
	struct dm_thin_endio_hook *pbd;
	struct bio *bio;

	for (node = rb_first(&tc->sort_bio_list); node; node = next) {
		/* Must be fetched while @node is still linked into the tree. */
		next = rb_next(node);

		pbd = thin_pbd(node);
		bio = thin_bio(pbd);

		bio_list_add(&tc->deferred_bio_list, bio);
		rb_erase(&pbd->rb_node, &tc->sort_bio_list);
	}

	WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
}
static void __sort_thin_deferred_bios(struct thin_c *tc)
{
struct bio *bio;
struct bio_list bios;
bio_list_init(&bios);
bio_list_merge(&bios, &tc->deferred_bio_list);
bio_list_init(&tc->deferred_bio_list);
/* Sort deferred_bio_list using rb-tree */
while ((bio = bio_list_pop(&bios)))
__thin_bio_rb_add(tc, bio);
/*
* Transfer the sorted bios in sort_bio_list back to
* deferred_bio_list to allow lockless submission of
* all bios.
*/
__extract_sorted_bios(tc);
}
static void process_thin_deferred_bios(struct thin_c *tc) static void process_thin_deferred_bios(struct thin_c *tc)
{ {
struct pool *pool = tc->pool; struct pool *pool = tc->pool;
unsigned long flags; unsigned long flags;
struct bio *bio; struct bio *bio;
struct bio_list bios; struct bio_list bios;
struct blk_plug plug;
if (tc->requeue_mode) { if (tc->requeue_mode) {
requeue_bio_list(tc, &tc->deferred_bio_list); requeue_bio_list(tc, &tc->deferred_bio_list);
...@@ -1382,10 +1450,20 @@ static void process_thin_deferred_bios(struct thin_c *tc) ...@@ -1382,10 +1450,20 @@ static void process_thin_deferred_bios(struct thin_c *tc)
bio_list_init(&bios); bio_list_init(&bios);
spin_lock_irqsave(&tc->lock, flags); spin_lock_irqsave(&tc->lock, flags);
if (bio_list_empty(&tc->deferred_bio_list)) {
spin_unlock_irqrestore(&tc->lock, flags);
return;
}
__sort_thin_deferred_bios(tc);
bio_list_merge(&bios, &tc->deferred_bio_list); bio_list_merge(&bios, &tc->deferred_bio_list);
bio_list_init(&tc->deferred_bio_list); bio_list_init(&tc->deferred_bio_list);
spin_unlock_irqrestore(&tc->lock, flags); spin_unlock_irqrestore(&tc->lock, flags);
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bios))) { while ((bio = bio_list_pop(&bios))) {
/* /*
* If we've got no free new_mapping structs, and processing * If we've got no free new_mapping structs, and processing
...@@ -1405,6 +1483,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) ...@@ -1405,6 +1483,7 @@ static void process_thin_deferred_bios(struct thin_c *tc)
else else
pool->process_bio(tc, bio); pool->process_bio(tc, bio);
} }
blk_finish_plug(&plug);
} }
static void process_deferred_bios(struct pool *pool) static void process_deferred_bios(struct pool *pool)
...@@ -2964,7 +3043,7 @@ static struct target_type pool_target = { ...@@ -2964,7 +3043,7 @@ static struct target_type pool_target = {
.name = "thin-pool", .name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE, DM_TARGET_IMMUTABLE,
.version = {1, 11, 0}, .version = {1, 12, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = pool_ctr, .ctr = pool_ctr,
.dtr = pool_dtr, .dtr = pool_dtr,
...@@ -3040,6 +3119,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ...@@ -3040,6 +3119,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
spin_lock_init(&tc->lock); spin_lock_init(&tc->lock);
bio_list_init(&tc->deferred_bio_list); bio_list_init(&tc->deferred_bio_list);
bio_list_init(&tc->retry_on_resume_list); bio_list_init(&tc->retry_on_resume_list);
tc->sort_bio_list = RB_ROOT;
if (argc == 3) { if (argc == 3) {
r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
...@@ -3287,7 +3367,7 @@ static int thin_iterate_devices(struct dm_target *ti, ...@@ -3287,7 +3367,7 @@ static int thin_iterate_devices(struct dm_target *ti,
static struct target_type thin_target = { static struct target_type thin_target = {
.name = "thin", .name = "thin",
.version = {1, 11, 0}, .version = {1, 12, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = thin_ctr, .ctr = thin_ctr,
.dtr = thin_dtr, .dtr = thin_dtr,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment