Commit 724376a0, authored by Mikulas Patocka, committed by Mike Snitzer

dm integrity: implement fair range locks

dm-integrity locks a range of sectors to prevent concurrent I/O or journal
writeback.  These locks were not fair - so that many small overlapping I/Os
could starve a large I/O indefinitely.

Fix this by making the range locks fair.  The ranges that are waiting are
added to the list "wait_list".  If a new I/O overlaps some of the waiting
I/Os, it is not dispatched, but it is also added to that wait list.
Entries on the wait list are processed in first-in-first-out order, so
that an I/O can't starve indefinitely.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
parent 518748b1
...@@ -186,6 +186,7 @@ struct dm_integrity_c { ...@@ -186,6 +186,7 @@ struct dm_integrity_c {
/* these variables are locked with endio_wait.lock */ /* these variables are locked with endio_wait.lock */
struct rb_root in_progress; struct rb_root in_progress;
struct list_head wait_list;
wait_queue_head_t endio_wait; wait_queue_head_t endio_wait;
struct workqueue_struct *wait_wq; struct workqueue_struct *wait_wq;
...@@ -233,7 +234,14 @@ struct dm_integrity_c { ...@@ -233,7 +234,14 @@ struct dm_integrity_c {
/*
 * A range of sectors that is locked, or waiting to be locked, against
 * concurrent I/O or journal writeback.
 *
 * logical_sector is the first sector of the range and n_sectors its length;
 * the range covers [logical_sector, logical_sector + n_sectors).
 */
struct dm_integrity_range { struct dm_integrity_range {
sector_t logical_sector; sector_t logical_sector;
unsigned n_sectors; unsigned n_sectors;
/*
 * waiting is set to true by wait_and_add_new_range() when the range is
 * queued on ic->wait_list, and cleared by remove_range_unlocked() once the
 * range has been inserted into ic->in_progress.
 */
struct rb_node node; bool waiting;
/*
 * The two arms share storage: "node" links the range into the
 * ic->in_progress rb-tree, while "task" and "wait_entry" are used when the
 * range is queued on ic->wait_list.  Only one arm is valid at a time, so
 * "task" must be saved before the range is inserted into the tree.
 */
union {
struct rb_node node;
struct {
/* the sleeping task to wake when the range gets its lock */
struct task_struct *task;
/* link in ic->wait_list (first-in-first-out order) */
struct list_head wait_entry;
};
};
}; };
struct dm_integrity_io { struct dm_integrity_io {
...@@ -867,13 +875,27 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig ...@@ -867,13 +875,27 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig
} }
} }
static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2)
{
return range1->logical_sector < range2->logical_sector + range2->n_sectors &&
range2->logical_sector + range2->n_sectors > range2->logical_sector;
}
static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting)
{ {
struct rb_node **n = &ic->in_progress.rb_node; struct rb_node **n = &ic->in_progress.rb_node;
struct rb_node *parent; struct rb_node *parent;
BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1)); BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
if (likely(check_waiting)) {
struct dm_integrity_range *range;
list_for_each_entry(range, &ic->wait_list, wait_entry) {
if (unlikely(ranges_overlap(range, new_range)))
return false;
}
}
parent = NULL; parent = NULL;
while (*n) { while (*n) {
...@@ -898,7 +920,22 @@ static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range * ...@@ -898,7 +920,22 @@ static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *
static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range) static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
{ {
rb_erase(&range->node, &ic->in_progress); rb_erase(&range->node, &ic->in_progress);
wake_up_locked(&ic->endio_wait); while (unlikely(!list_empty(&ic->wait_list))) {
struct dm_integrity_range *last_range =
list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry);
struct task_struct *last_range_task;
if (!ranges_overlap(range, last_range))
break;
last_range_task = last_range->task;
list_del(&last_range->wait_entry);
if (!add_new_range(ic, last_range, false)) {
last_range->task = last_range_task;
list_add(&last_range->wait_entry, &ic->wait_list);
break;
}
last_range->waiting = false;
wake_up_process(last_range_task);
}
} }
static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range) static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
...@@ -910,6 +947,19 @@ static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *r ...@@ -910,6 +947,19 @@ static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *r
spin_unlock_irqrestore(&ic->endio_wait.lock, flags); spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
} }
/*
 * Queue new_range at the tail of ic->wait_list and sleep until it is no
 * longer marked as waiting.
 *
 * The function drops and re-acquires ic->endio_wait.lock with
 * spin_unlock_irq()/spin_lock_irq(), so it is entered and left with that
 * lock held.  remove_range_unlocked() clears new_range->waiting only after
 * it has inserted the range into ic->in_progress, so on return the range
 * is already in that tree.
 */
static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
{
new_range->waiting = true;
/* tail insertion keeps the wait list in first-in-first-out order */
list_add_tail(&new_range->wait_entry, &ic->wait_list);
/* the task that remove_range_unlocked() will wake */
new_range->task = current;
do {
/*
 * The task state is set before the lock is released.
 * NOTE(review): presumably so that a wake-up issued between the
 * unlock and io_schedule() is not lost - confirm.
 */
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&ic->endio_wait.lock);
io_schedule();
spin_lock_irq(&ic->endio_wait.lock);
/* re-check under the lock; sleep again if the range is still marked waiting */
} while (unlikely(new_range->waiting));
}
static void init_journal_node(struct journal_node *node) static void init_journal_node(struct journal_node *node)
{ {
RB_CLEAR_NODE(&node->node); RB_CLEAR_NODE(&node->node);
...@@ -1658,7 +1708,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map ...@@ -1658,7 +1708,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
} }
} }
} }
if (unlikely(!add_new_range(ic, &dio->range))) { if (unlikely(!add_new_range(ic, &dio->range, true))) {
/* /*
* We must not sleep in the request routine because it could * We must not sleep in the request routine because it could
* stall bios on current->bio_list. * stall bios on current->bio_list.
...@@ -1670,10 +1720,8 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map ...@@ -1670,10 +1720,8 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
INIT_WORK(&dio->work, integrity_bio_wait); INIT_WORK(&dio->work, integrity_bio_wait);
queue_work(ic->wait_wq, &dio->work); queue_work(ic->wait_wq, &dio->work);
return; return;
} else {
sleep_on_endio_wait(ic);
goto retry;
} }
wait_and_add_new_range(ic, &dio->range);
} }
spin_unlock_irq(&ic->endio_wait.lock); spin_unlock_irq(&ic->endio_wait.lock);
...@@ -1896,8 +1944,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, ...@@ -1896,8 +1944,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
spin_lock_irq(&ic->endio_wait.lock); spin_lock_irq(&ic->endio_wait.lock);
while (unlikely(!add_new_range(ic, &io->range))) if (unlikely(!add_new_range(ic, &io->range, true)))
sleep_on_endio_wait(ic); wait_and_add_new_range(ic, &io->range);
if (likely(!from_replay)) { if (likely(!from_replay)) {
struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
...@@ -2852,6 +2900,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ...@@ -2852,6 +2900,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->per_io_data_size = sizeof(struct dm_integrity_io); ti->per_io_data_size = sizeof(struct dm_integrity_io);
ic->in_progress = RB_ROOT; ic->in_progress = RB_ROOT;
INIT_LIST_HEAD(&ic->wait_list);
init_waitqueue_head(&ic->endio_wait); init_waitqueue_head(&ic->endio_wait);
bio_list_init(&ic->flush_bio_list); bio_list_init(&ic->flush_bio_list);
init_waitqueue_head(&ic->copy_to_journal_wait); init_waitqueue_head(&ic->copy_to_journal_wait);
...@@ -3196,6 +3245,7 @@ static void dm_integrity_dtr(struct dm_target *ti) ...@@ -3196,6 +3245,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
struct dm_integrity_c *ic = ti->private; struct dm_integrity_c *ic = ti->private;
BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
BUG_ON(!list_empty(&ic->wait_list));
if (ic->metadata_wq) if (ic->metadata_wq)
destroy_workqueue(ic->metadata_wq); destroy_workqueue(ic->metadata_wq);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment