Commit 69b00b5b authored by Guoqing Jiang's avatar Guoqing Jiang Committed by Song Liu

md: introduce a new struct for IO serialization

Obviously, IO serialization could cause the degradation of
performance a lot. In order to reduce the degradation, so a
rb interval tree is added in raid1 to speed up the check of
collision.

So, a rb root is needed in md_rdev, then abstract all the
serialize related members to a new struct (serial_in_rdev),
embed it into md_rdev.

Of course, we need to free the struct if it is not needed
anymore, so rdev/rdevs_uninit_serial are added accordingly.
And they should be called when destroty memory pool or can't
alloc memory.

And we need to consider to call mddev_destroy_serial_pool
in case serialize_policy/write-behind is disabled, bitmap
is destroyed or in __md_stop_writes.
Signed-off-by: default avatarGuoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: default avatarSong Liu <songliubraving@fb.com>
parent 4d26d32f
......@@ -1789,10 +1789,8 @@ void md_bitmap_destroy(struct mddev *mddev)
return;
md_bitmap_wait_behind_writes(mddev);
if (!mddev->serialize_policy) {
mempool_destroy(mddev->serial_info_pool);
mddev->serial_info_pool = NULL;
}
if (!mddev->serialize_policy)
mddev_destroy_serial_pool(mddev, NULL, true);
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
......@@ -2478,10 +2476,8 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
mddev->bitmap_info.max_write_behind = backlog;
if (!backlog && mddev->serial_info_pool) {
/* serial_info_pool is not needed if backlog is zero */
if (!mddev->serialize_policy) {
mempool_destroy(mddev->serial_info_pool);
mddev->serial_info_pool = NULL;
}
if (!mddev->serialize_policy)
mddev_destroy_serial_pool(mddev, NULL, false);
} else if (backlog && !mddev->serial_info_pool) {
/* serial_info_pool is needed since backlog is not zero */
struct md_rdev *rdev;
......
......@@ -125,25 +125,59 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}
static void rdev_uninit_serial(struct md_rdev *rdev)
{
if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
return;
kfree(rdev->serial);
rdev->serial = NULL;
}
static void rdevs_uninit_serial(struct mddev *mddev)
{
struct md_rdev *rdev;
rdev_for_each(rdev, mddev)
rdev_uninit_serial(rdev);
}
static int rdev_init_serial(struct md_rdev *rdev)
{
spin_lock_init(&rdev->serial_list_lock);
INIT_LIST_HEAD(&rdev->serial_list);
init_waitqueue_head(&rdev->serial_io_wait);
struct serial_in_rdev *serial = NULL;
if (test_bit(CollisionCheck, &rdev->flags))
return 0;
serial = kmalloc(sizeof(struct serial_in_rdev), GFP_KERNEL);
if (!serial)
return -ENOMEM;
spin_lock_init(&serial->serial_lock);
serial->serial_rb = RB_ROOT_CACHED;
init_waitqueue_head(&serial->serial_io_wait);
rdev->serial = serial;
set_bit(CollisionCheck, &rdev->flags);
return 1;
return 0;
}
static void rdevs_init_serial(struct mddev *mddev)
static int rdevs_init_serial(struct mddev *mddev)
{
struct md_rdev *rdev;
int ret = 0;
rdev_for_each(rdev, mddev) {
if (test_bit(CollisionCheck, &rdev->flags))
continue;
rdev_init_serial(rdev);
ret = rdev_init_serial(rdev);
if (ret)
break;
}
/* Free all resources if pool is not existed */
if (ret && !mddev->serial_info_pool)
rdevs_uninit_serial(mddev);
return ret;
}
/*
......@@ -166,6 +200,8 @@ static int rdev_need_serial(struct md_rdev *rdev)
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend)
{
int ret = 0;
if (rdev && !rdev_need_serial(rdev) &&
!test_bit(CollisionCheck, &rdev->flags))
return;
......@@ -174,9 +210,11 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mddev_suspend(mddev);
if (!rdev)
rdevs_init_serial(mddev);
ret = rdevs_init_serial(mddev);
else
rdev_init_serial(rdev);
ret = rdev_init_serial(rdev);
if (ret)
goto abort;
if (mddev->serial_info_pool == NULL) {
unsigned int noio_flag;
......@@ -186,9 +224,13 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
sizeof(struct serial_info));
memalloc_noio_restore(noio_flag);
if (!mddev->serial_info_pool)
if (!mddev->serial_info_pool) {
rdevs_uninit_serial(mddev);
pr_err("can't alloc memory pool for serialization\n");
}
}
abort:
if (!is_suspend)
mddev_resume(mddev);
}
......@@ -199,7 +241,7 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
* 2. when bitmap is destroyed while policy is not enabled.
* 3. for disable policy, the pool is destroyed only when no rdev needs it.
*/
static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend)
{
if (rdev && !test_bit(CollisionCheck, &rdev->flags))
......@@ -213,8 +255,9 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mddev_suspend(mddev);
rdev_for_each(temp, mddev) {
if (!rdev) {
if (!rdev_need_serial(temp))
clear_bit(CollisionCheck, &temp->flags);
if (!mddev->serialize_policy ||
!rdev_need_serial(temp))
rdev_uninit_serial(temp);
else
num++;
} else if (temp != rdev &&
......@@ -223,7 +266,7 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
}
if (rdev)
clear_bit(CollisionCheck, &rdev->flags);
rdev_uninit_serial(rdev);
if (num)
pr_info("The mempool could be used by other devices\n");
......@@ -6117,8 +6160,9 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
mempool_destroy(mddev->serial_info_pool);
mddev->serial_info_pool = NULL;
/* disable policy to guarantee rdevs free resources for serialization */
mddev->serialize_policy = 0;
mddev_destroy_serial_pool(mddev, NULL, true);
}
void md_stop_writes(struct mddev *mddev)
......
......@@ -32,6 +32,16 @@
* be retried.
*/
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
/*
* The struct embedded in rdev is used to serialize IO.
*/
struct serial_in_rdev {
struct rb_root_cached serial_rb;
spinlock_t serial_lock;
wait_queue_head_t serial_io_wait;
};
/*
* MD's 'extended' device
*/
......@@ -110,12 +120,7 @@ struct md_rdev {
* in superblock.
*/
/*
* The members for check collision of write IOs.
*/
struct list_head serial_list;
spinlock_t serial_list_lock;
wait_queue_head_t serial_io_wait;
struct serial_in_rdev *serial; /* used for raid1 io serialization */
struct work_struct del_work; /* used for delayed sysfs removal */
......@@ -266,9 +271,10 @@ enum mddev_sb_flags {
#define NR_SERIAL_INFOS 8
/* record current range of serialize IOs */
struct serial_info {
sector_t lo;
sector_t hi;
struct list_head list;
struct rb_node node;
sector_t start; /* start sector of rb node */
sector_t last; /* end sector of rb node */
sector_t _subtree_last; /* highest sector in subtree of rb node */
};
struct mddev {
......@@ -740,6 +746,8 @@ extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend);
extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
......
......@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/interval_tree_generic.h>
#include <trace/events/block.h>
......@@ -50,55 +51,58 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#include "raid1-10.c"
#define START(node) ((node)->start)
#define LAST(node) ((node)->last)
INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
START, LAST, static inline, raid1_rb);
static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
struct serial_info *wi, *temp_wi;
struct serial_info *si;
unsigned long flags;
int ret = 0;
struct mddev *mddev = rdev->mddev;
struct serial_in_rdev *serial = rdev->serial;
wi = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
spin_lock_irqsave(&rdev->serial_list_lock, flags);
list_for_each_entry(temp_wi, &rdev->serial_list, list) {
spin_lock_irqsave(&serial->serial_lock, flags);
/* collision happened */
if (hi > temp_wi->lo && lo < temp_wi->hi) {
if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
ret = -EBUSY;
break;
}
}
if (!ret) {
wi->lo = lo;
wi->hi = hi;
list_add(&wi->list, &rdev->serial_list);
si->start = lo;
si->last = hi;
raid1_rb_insert(si, &serial->serial_rb);
} else
mempool_free(wi, mddev->serial_info_pool);
spin_unlock_irqrestore(&rdev->serial_list_lock, flags);
mempool_free(si, mddev->serial_info_pool);
spin_unlock_irqrestore(&serial->serial_lock, flags);
return ret;
}
static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
struct serial_info *wi;
struct serial_info *si;
unsigned long flags;
int found = 0;
struct mddev *mddev = rdev->mddev;
spin_lock_irqsave(&rdev->serial_list_lock, flags);
list_for_each_entry(wi, &rdev->serial_list, list)
if (hi == wi->hi && lo == wi->lo) {
list_del(&wi->list);
mempool_free(wi, mddev->serial_info_pool);
struct serial_in_rdev *serial = rdev->serial;
spin_lock_irqsave(&serial->serial_lock, flags);
for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
si; si = raid1_rb_iter_next(si, lo, hi)) {
if (si->start == lo && si->last == hi) {
raid1_rb_remove(si, &serial->serial_rb);
mempool_free(si, mddev->serial_info_pool);
found = 1;
break;
}
}
if (!found)
WARN(1, "The write IO is not recorded for serialization\n");
spin_unlock_irqrestore(&rdev->serial_list_lock, flags);
wake_up(&rdev->serial_io_wait);
spin_unlock_irqrestore(&serial->serial_lock, flags);
wake_up(&serial->serial_io_wait);
}
/*
......@@ -1482,6 +1486,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
for (i = 0; i < disks; i++) {
struct bio *mbio = NULL;
struct md_rdev *rdev = conf->mirrors[i].rdev;
struct serial_in_rdev *serial = rdev->serial;
if (!r1_bio->bios[i])
continue;
......@@ -1510,13 +1515,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (r1_bio->behind_master_bio) {
if (test_bit(CollisionCheck, &rdev->flags))
wait_event(rdev->serial_io_wait,
wait_event(serial->serial_io_wait,
check_and_add_serial(rdev, lo, hi)
== 0);
if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
} else if (mddev->serialize_policy)
wait_event(rdev->serial_io_wait,
wait_event(serial->serial_io_wait,
check_and_add_serial(rdev, lo, hi) == 0);
r1_bio->bios[i] = mbio;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment