Commit 12c612e1 authored by Jens Axboe's avatar Jens Axboe

Merge tag 'md-6.12-20240829' of...

Merge tag 'md-6.12-20240829' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.12/block

Pull MD updates from Song:

"Major changes in this set are:

 1. md-bitmap refactoring, by Yu Kuai;
 2. raid5 performance optimization, by Artur Paszkiewicz;
 3. Other small fixes, by Yu Kuai and Chen Ni."

* tag 'md-6.12-20240829' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md: (49 commits)
  md/raid5: rename wait_for_overlap to wait_for_reshape
  md/raid5: only add to wq if reshape is in progress
  md/raid5: use wait_on_bit() for R5_Overlap
  md: Remove flush handling
  md/md-bitmap: make in memory structure internal
  md/md-bitmap: merge md_bitmap_enabled() into bitmap_operations
  md/md-bitmap: merge md_bitmap_wait_behind_writes() into bitmap_operations
  md/md-bitmap: merge md_bitmap_free() into bitmap_operations
  md/md-bitmap: merge md_bitmap_set_pages() into struct bitmap_operations
  md/md-bitmap: merge md_bitmap_copy_from_slot() into struct bitmap_operation.
  md/md-bitmap: merge get_bitmap_from_slot() into bitmap_operations
  md/md-bitmap: merge md_bitmap_resize() into bitmap_operations
  md/md-bitmap: pass in mddev directly for md_bitmap_resize()
  md/md-bitmap: merge md_bitmap_daemon_work() into bitmap_operations
  md/md-bitmap: merge bitmap_unplug() into bitmap_operations
  md/md-bitmap: merge md_bitmap_unplug_async() into md_bitmap_unplug()
  md/md-bitmap: merge md_bitmap_sync_with_cluster() into bitmap_operations
  md/md-bitmap: merge md_bitmap_cond_end_sync() into bitmap_operations
  md/md-bitmap: merge md_bitmap_close_sync() into bitmap_operations
  md/md-bitmap: merge md_bitmap_end_sync() into bitmap_operations
  ...
parents 12515809 fb16787b
......@@ -3949,7 +3949,9 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
/* Try loading the bitmap unless "raid0", which does not have one */
if (!rs_is_raid0(rs) &&
!test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
r = md_bitmap_load(&rs->md);
struct mddev *mddev = &rs->md;
r = mddev->bitmap_ops->load(mddev);
if (r)
DMERR("Failed to load bitmap");
}
......@@ -4066,7 +4068,8 @@ static int raid_preresume(struct dm_target *ti)
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0);
r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
chunksize, false);
if (r)
DMERR("Failed to resize bitmap");
}
......
......@@ -32,11 +32,210 @@
#include "md.h"
#include "md-bitmap.h"
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
* Version 5 is currently set only for clustered devices
*/
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3
/*
* in-memory bitmap:
*
* Use 16 bit block counters to track pending writes to each "chunk".
* The 2 high order bits are special-purpose, the first is a flag indicating
* whether a resync is needed. The second is a flag indicating whether a
* resync is active.
* This means that the counter is actually 14 bits:
*
* +--------+--------+------------------------------------------------+
* | resync | resync | counter |
* | needed | active | |
* | (0-1) | (0-1) | (0-16383) |
* +--------+--------+------------------------------------------------+
*
* The "resync needed" bit is set when:
* a '1' bit is read from storage at startup.
* a write request fails on some drives
* a resync is aborted on a chunk with 'resync active' set
* It is cleared (and resync-active set) when a resync starts across all drives
* of the chunk.
*
*
* The "resync active" bit is set when:
* a resync is started on all drives, and resync_needed is set.
* resync_needed will be cleared (as long as resync_active wasn't already set).
* It is cleared when a resync completes.
*
* The counter counts pending write requests, plus the on-disk bit.
* When the counter is '1' and the resync bits are clear, the on-disk
* bit can be cleared as well, thus setting the counter to 0.
* When we set a bit, or in the counter (to start a write), if the field is
* 0, we first set the disk bit and set the counter to 1.
*
* If the counter is 0, the on-disk bit is clear and the stripe is clean
* Anything that dirties the stripe pushes the counter to 2 (at least)
* and sets the on-disk bit (lazily).
* If a periodic sweep finds the counter at 2, it is decremented to 1.
* If the sweep finds the counter at 1, the on-disk bit is cleared and the
* counter goes to zero.
*
* Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
* counters as a fallback when "page" memory cannot be allocated:
*
* Normal case (page memory allocated):
*
* page pointer (32-bit)
*
* [ ] ------+
* |
* +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
* c1 c2 c2048
*
* Hijacked case (page memory allocation failed):
*
* hijacked page pointer (32-bit)
*
* [ ][ ] (no page memory allocated)
* counter #1 (16-bit) counter #2 (16-bit)
*
*/
/* number of bits in one page (PAGE_SIZE bytes * 8) */
#define PAGE_BITS (PAGE_SIZE << 3)
/* same, expressed as a shift count */
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
/*
 * Field extractors for a bitmap counter value: the "resync needed" flag,
 * the "resync active" flag and the pending-write count respectively.
 * The argument is parenthesized before the cast so that an expression
 * argument (e.g. "a >> n") is converted as a whole rather than having the
 * cast bind only to its first operand.
 */
#define NEEDED(x) (((bitmap_counter_t)(x)) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t)(x)) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t)(x)) & COUNTER_MAX)
/* how many counters per page? */
#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
#define BITMAP_BLOCK_SHIFT 9
/*
* bitmap structures:
*/
/* the in-memory bitmap is represented by bitmap_pages */
/*
 * Per-page descriptor for the in-memory counter array: one instance
 * describes one page worth of counters (or the hijacked fallback when
 * that page could not be allocated).
 */
struct bitmap_page {
/*
 * map points to the actual memory page
 */
char *map;
/*
 * in emergencies (when map cannot be allocated), hijack the map
 * pointer and use it as two counters itself
 */
unsigned int hijacked:1;
/*
 * If any counter in this page is '1' or '2' - and so could be
 * cleared - then that page is marked as 'pending'
 */
unsigned int pending:1;
/*
 * count of dirty bits on the page
 */
unsigned int count:30;
};
/* the main bitmap structure - one per mddev */
struct bitmap {
/* in-memory counters and their geometry */
struct bitmap_counts {
/* taken around counter lookups/updates, e.g. in __bitmap_start_sync() */
spinlock_t lock;
/* per-page descriptors, indexed by page number */
struct bitmap_page *bp;
/* total number of pages in the bitmap */
unsigned long pages;
/* number of pages not yet allocated */
unsigned long missing_pages;
/* chunksize = 2^chunkshift (for bitops) */
unsigned long chunkshift;
/* total number of data chunks for the array */
unsigned long chunks;
} counts;
struct mddev *mddev; /* the md device that the bitmap is for */
/* compared against mddev->events in bitmap_load() */
__u64 events_cleared;
/*
 * backs the "can_clear" sysfs attribute: a non-zero value is shown as
 * "false", zero as "true"
 */
int need_sync;
/* on-disk (file or device) representation of the bitmap */
struct bitmap_storage {
/* backing disk file */
struct file *file;
/* cached copy of the bitmap file superblock */
struct page *sb_page;
unsigned long sb_index;
/* list of cache pages for the file */
struct page **filemap;
/* attributes associated with the filemap pages */
unsigned long *filemap_attr;
/* number of pages in the file */
unsigned long file_pages;
/* total bytes in the bitmap */
unsigned long bytes;
} storage;
/* state bits such as BITMAP_STALE and BITMAP_WRITE_ERROR */
unsigned long flags;
/* reset to 0 by bitmap_write_all() after flagging pages for write-out */
int allclean;
/* decremented in bitmap_endwrite(); behind_wait is woken when it hits 0 */
atomic_t behind_writes;
/* highest actual value at runtime */
unsigned long behind_writes_used;
/*
 * the bitmap daemon - periodically wakes up and sweeps the bitmap
 * file, cleaning up bits and flushing out pages to disk as necessary
 */
unsigned long daemon_lastrun; /* jiffies of last run */
/*
 * when we last called end_sync to update bitmap with resync
 * progress.
 */
unsigned long last_end_sync;
/* pending writes to the bitmap file */
atomic_t pending_writes;
wait_queue_head_t write_wait;
wait_queue_head_t overflow_wait;
wait_queue_head_t behind_wait;
struct kernfs_node *sysfs_can_clear;
/* slot offset for clustered env */
int cluster_slot;
};
/*
 * Forward declaration: __bitmap_resize() is defined further down in this
 * file but is already needed by __bitmap_create().
 */
static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, bool init);
/*
 * Return the name of the md device that owns @bitmap, or the placeholder
 * "mdX" when the bitmap has no mddev attached.
 */
static inline char *bmname(struct bitmap *bitmap)
{
	if (!bitmap->mddev)
		return "mdX";

	return mdname(bitmap->mddev);
}
static bool __bitmap_enabled(struct bitmap *bitmap)
{
return bitmap->storage.filemap &&
!test_bit(BITMAP_STALE, &bitmap->flags);
}
/*
 * bitmap_operations ->enabled hook: false when @mddev has no bitmap,
 * otherwise whatever __bitmap_enabled() reports for it.
 */
static bool bitmap_enabled(struct mddev *mddev)
{
	struct bitmap *bitmap = mddev->bitmap;

	return bitmap && __bitmap_enabled(bitmap);
}
/*
* check a page and, if necessary, allocate it (or hijack it if the alloc fails)
*
......@@ -472,9 +671,10 @@ static void md_bitmap_wait_writes(struct bitmap *bitmap)
/* update the event counter and sync the superblock to disk */
void md_bitmap_update_sb(struct bitmap *bitmap)
static void bitmap_update_sb(void *data)
{
bitmap_super_t *sb;
struct bitmap *bitmap = data;
if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
return;
......@@ -510,10 +710,8 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
write_sb_page(bitmap, bitmap->storage.sb_index,
bitmap->storage.sb_page, 1);
}
EXPORT_SYMBOL(md_bitmap_update_sb);
/* print out the bitmap file superblock */
void md_bitmap_print_sb(struct bitmap *bitmap)
static void bitmap_print_sb(struct bitmap *bitmap)
{
bitmap_super_t *sb;
......@@ -760,7 +958,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
bitmap->mddev->bitmap_info.space > sectors_reserved)
bitmap->mddev->bitmap_info.space = sectors_reserved;
} else {
md_bitmap_print_sb(bitmap);
bitmap_print_sb(bitmap);
if (bitmap->cluster_slot < 0)
md_cluster_stop(bitmap->mddev);
}
......@@ -893,7 +1091,7 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store)
static void md_bitmap_file_kick(struct bitmap *bitmap)
{
if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
md_bitmap_update_sb(bitmap);
bitmap_update_sb(bitmap);
if (bitmap->storage.file) {
pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
......@@ -1028,13 +1226,13 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
/* this gets called when the md device is ready to unplug its underlying
* (slave) device queues -- before we let any writes go down, we need to
* sync the dirty pages of the bitmap file to disk */
void md_bitmap_unplug(struct bitmap *bitmap)
static void __bitmap_unplug(struct bitmap *bitmap)
{
unsigned long i;
int dirty, need_write;
int writing = 0;
if (!md_bitmap_enabled(bitmap))
if (!__bitmap_enabled(bitmap))
return;
/* look at each page to see if there are any set bits that need to be
......@@ -1060,7 +1258,6 @@ void md_bitmap_unplug(struct bitmap *bitmap)
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
md_bitmap_file_kick(bitmap);
}
EXPORT_SYMBOL(md_bitmap_unplug);
struct bitmap_unplug_work {
struct work_struct work;
......@@ -1073,11 +1270,11 @@ static void md_bitmap_unplug_fn(struct work_struct *work)
struct bitmap_unplug_work *unplug_work =
container_of(work, struct bitmap_unplug_work, work);
md_bitmap_unplug(unplug_work->bitmap);
__bitmap_unplug(unplug_work->bitmap);
complete(unplug_work->done);
}
void md_bitmap_unplug_async(struct bitmap *bitmap)
static void bitmap_unplug_async(struct bitmap *bitmap)
{
DECLARE_COMPLETION_ONSTACK(done);
struct bitmap_unplug_work unplug_work;
......@@ -1089,7 +1286,19 @@ void md_bitmap_unplug_async(struct bitmap *bitmap)
queue_work(md_bitmap_wq, &unplug_work.work);
wait_for_completion(&done);
}
EXPORT_SYMBOL(md_bitmap_unplug_async);
/*
 * bitmap_operations ->unplug hook. Does nothing when @mddev has no bitmap.
 * With @sync set the unplug runs directly in the caller's context via
 * __bitmap_unplug(); otherwise it is handed to bitmap_unplug_async().
 */
static void bitmap_unplug(struct mddev *mddev, bool sync)
{
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap)
		return;

	if (!sync) {
		bitmap_unplug_async(bitmap);
		return;
	}

	__bitmap_unplug(bitmap);
}
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
......@@ -1226,22 +1435,21 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
return ret;
}
void md_bitmap_write_all(struct bitmap *bitmap)
/* just flag bitmap pages as needing to be written. */
static void bitmap_write_all(struct mddev *mddev)
{
/* We don't actually write all bitmap blocks here,
* just flag them as needing to be written
*/
int i;
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap || !bitmap->storage.filemap)
return;
if (bitmap->storage.file)
/* Only one copy, so nothing needed */
if (bitmap->storage.file)
return;
for (i = 0; i < bitmap->storage.file_pages; i++)
set_page_attr(bitmap, i,
BITMAP_PAGE_NEEDWRITE);
set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
bitmap->allclean = 0;
}
......@@ -1290,7 +1498,7 @@ static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
* bitmap daemon -- periodically wakes up to clean bits and flush pages
* out to disk
*/
void md_bitmap_daemon_work(struct mddev *mddev)
static void bitmap_daemon_work(struct mddev *mddev)
{
struct bitmap *bitmap;
unsigned long j;
......@@ -1461,8 +1669,11 @@ __acquires(bitmap->lock)
&(bitmap->bp[page].map[pageoff]);
}
int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
unsigned long sectors, bool behind)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return 0;
......@@ -1523,13 +1734,15 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s
}
return 0;
}
EXPORT_SYMBOL(md_bitmap_startwrite);
void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int success, int behind)
static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
unsigned long sectors, bool success, bool behind)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
if (behind) {
if (atomic_dec_and_test(&bitmap->behind_writes))
wake_up(&bitmap->behind_wait);
......@@ -1576,26 +1789,27 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
sectors = 0;
}
}
EXPORT_SYMBOL(md_bitmap_endwrite);
static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
int degraded)
static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
sector_t *blocks, bool degraded)
{
bitmap_counter_t *bmc;
int rv;
bool rv;
if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
*blocks = 1024;
return 1; /* always resync if no bitmap */
return true; /* always resync if no bitmap */
}
spin_lock_irq(&bitmap->counts.lock);
rv = false;
bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
rv = 0;
if (bmc) {
/* locked */
if (RESYNC(*bmc))
rv = 1;
else if (NEEDED(*bmc)) {
rv = 1;
if (RESYNC(*bmc)) {
rv = true;
} else if (NEEDED(*bmc)) {
rv = true;
if (!degraded) { /* don't set/clear bits if degraded */
*bmc |= RESYNC_MASK;
*bmc &= ~NEEDED_MASK;
......@@ -1603,11 +1817,12 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
}
}
spin_unlock_irq(&bitmap->counts.lock);
return rv;
}
int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
int degraded)
static bool bitmap_start_sync(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded)
{
/* bitmap_start_sync must always report on multiples of whole
* pages, otherwise resync (which is very PAGE_SIZE based) will
......@@ -1616,21 +1831,22 @@ int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *block
* At least PAGE_SIZE>>9 blocks are covered.
* Return the 'or' of the result.
*/
int rv = 0;
bool rv = false;
sector_t blocks1;
*blocks = 0;
while (*blocks < (PAGE_SIZE>>9)) {
rv |= __bitmap_start_sync(bitmap, offset,
rv |= __bitmap_start_sync(mddev->bitmap, offset,
&blocks1, degraded);
offset += blocks1;
*blocks += blocks1;
}
return rv;
}
EXPORT_SYMBOL(md_bitmap_start_sync);
void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted)
static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
sector_t *blocks, bool aborted)
{
bitmap_counter_t *bmc;
unsigned long flags;
......@@ -1659,9 +1875,14 @@ void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks
unlock:
spin_unlock_irqrestore(&bitmap->counts.lock, flags);
}
EXPORT_SYMBOL(md_bitmap_end_sync);
void md_bitmap_close_sync(struct bitmap *bitmap)
/*
 * bitmap_operations ->end_sync hook: forwards to __bitmap_end_sync() on
 * mddev->bitmap, always with aborted == true. The non-aborted case is
 * reached through the in-file callers that pass false directly.
 */
static void bitmap_end_sync(struct mddev *mddev, sector_t offset,
sector_t *blocks)
{
__bitmap_end_sync(mddev->bitmap, offset, blocks, true);
}
static void bitmap_close_sync(struct mddev *mddev)
{
/* Sync has finished, and any bitmap chunks that weren't synced
* properly have been aborted. It remains to us to clear the
......@@ -1669,19 +1890,23 @@ void md_bitmap_close_sync(struct bitmap *bitmap)
*/
sector_t sector = 0;
sector_t blocks;
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
while (sector < bitmap->mddev->resync_max_sectors) {
md_bitmap_end_sync(bitmap, sector, &blocks, 0);
__bitmap_end_sync(bitmap, sector, &blocks, false);
sector += blocks;
}
}
EXPORT_SYMBOL(md_bitmap_close_sync);
void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
bool force)
{
sector_t s = 0;
sector_t blocks;
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
......@@ -1700,15 +1925,14 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) {
md_bitmap_end_sync(bitmap, s, &blocks, 0);
__bitmap_end_sync(bitmap, s, &blocks, false);
s += blocks;
}
bitmap->last_end_sync = jiffies;
sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
}
EXPORT_SYMBOL(md_bitmap_cond_end_sync);
void md_bitmap_sync_with_cluster(struct mddev *mddev,
static void bitmap_sync_with_cluster(struct mddev *mddev,
sector_t old_lo, sector_t old_hi,
sector_t new_lo, sector_t new_hi)
{
......@@ -1716,18 +1940,17 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev,
sector_t sector, blocks = 0;
for (sector = old_lo; sector < new_lo; ) {
md_bitmap_end_sync(bitmap, sector, &blocks, 0);
__bitmap_end_sync(bitmap, sector, &blocks, false);
sector += blocks;
}
WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n");
for (sector = old_hi; sector < new_hi; ) {
md_bitmap_start_sync(bitmap, sector, &blocks, 0);
bitmap_start_sync(mddev, sector, &blocks, false);
sector += blocks;
}
WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n");
}
EXPORT_SYMBOL(md_bitmap_sync_with_cluster);
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
{
......@@ -1756,12 +1979,18 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in
}
/* dirty the memory and file bits for bitmap chunks "s" to "e" */
void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
unsigned long e)
{
unsigned long chunk;
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
for (chunk = s; chunk <= e; chunk++) {
sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
md_bitmap_set_memory_bits(bitmap, sec, 1);
md_bitmap_file_set_bit(bitmap, sec);
if (sec < bitmap->mddev->recovery_cp)
......@@ -1773,10 +2002,7 @@ void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long
}
}
/*
* flush out any pending updates
*/
void md_bitmap_flush(struct mddev *mddev)
static void bitmap_flush(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
long sleep;
......@@ -1789,23 +2015,21 @@ void md_bitmap_flush(struct mddev *mddev)
*/
sleep = mddev->bitmap_info.daemon_sleep * 2;
bitmap->daemon_lastrun -= sleep;
md_bitmap_daemon_work(mddev);
bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
md_bitmap_daemon_work(mddev);
bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
md_bitmap_daemon_work(mddev);
bitmap_daemon_work(mddev);
if (mddev->bitmap_info.external)
md_super_wait(mddev);
md_bitmap_update_sb(bitmap);
bitmap_update_sb(bitmap);
}
/*
* free memory that was allocated
*/
void md_bitmap_free(struct bitmap *bitmap)
static void md_bitmap_free(void *data)
{
unsigned long k, pages;
struct bitmap_page *bp;
struct bitmap *bitmap = data;
if (!bitmap) /* there was no bitmap */
return;
......@@ -1836,9 +2060,8 @@ void md_bitmap_free(struct bitmap *bitmap)
kfree(bp);
kfree(bitmap);
}
EXPORT_SYMBOL(md_bitmap_free);
void md_bitmap_wait_behind_writes(struct mddev *mddev)
static void bitmap_wait_behind_writes(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
......@@ -1852,14 +2075,14 @@ void md_bitmap_wait_behind_writes(struct mddev *mddev)
}
}
void md_bitmap_destroy(struct mddev *mddev)
static void bitmap_destroy(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap) /* there was no bitmap */
return;
md_bitmap_wait_behind_writes(mddev);
bitmap_wait_behind_writes(mddev);
if (!mddev->serialize_policy)
mddev_destroy_serial_pool(mddev, NULL);
......@@ -1878,7 +2101,7 @@ void md_bitmap_destroy(struct mddev *mddev)
* if this returns an error, bitmap_destroy must be called to do clean up
* once mddev->bitmap is set
*/
struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
static struct bitmap *__bitmap_create(struct mddev *mddev, int slot)
{
struct bitmap *bitmap;
sector_t blocks = mddev->resync_max_sectors;
......@@ -1948,7 +2171,8 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
goto error;
bitmap->daemon_lastrun = jiffies;
err = md_bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1);
err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize,
true);
if (err)
goto error;
......@@ -1965,7 +2189,18 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
return ERR_PTR(err);
}
int md_bitmap_load(struct mddev *mddev)
/*
 * bitmap_operations ->create hook: build a bitmap for @slot with
 * __bitmap_create() and, on success, install it as mddev->bitmap.
 *
 * Returns 0 on success or the negative errno carried by the ERR_PTR that
 * __bitmap_create() returned; mddev->bitmap is left untouched on failure.
 */
static int bitmap_create(struct mddev *mddev, int slot)
{
	struct bitmap *created = __bitmap_create(mddev, slot);

	if (!IS_ERR(created)) {
		mddev->bitmap = created;
		return 0;
	}

	return PTR_ERR(created);
}
static int bitmap_load(struct mddev *mddev)
{
int err = 0;
sector_t start = 0;
......@@ -1989,10 +2224,10 @@ int md_bitmap_load(struct mddev *mddev)
*/
while (sector < mddev->resync_max_sectors) {
sector_t blocks;
md_bitmap_start_sync(bitmap, sector, &blocks, 0);
bitmap_start_sync(mddev, sector, &blocks, false);
sector += blocks;
}
md_bitmap_close_sync(bitmap);
bitmap_close_sync(mddev);
if (mddev->degraded == 0
|| bitmap->events_cleared == mddev->events)
......@@ -2014,22 +2249,21 @@ int md_bitmap_load(struct mddev *mddev)
mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
md_wakeup_thread(mddev->thread);
md_bitmap_update_sb(bitmap);
bitmap_update_sb(bitmap);
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
err = -EIO;
out:
return err;
}
EXPORT_SYMBOL_GPL(md_bitmap_load);
/* caller needs to free the returned bitmap with md_bitmap_free() */
struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
static void *bitmap_get_from_slot(struct mddev *mddev, int slot)
{
int rv = 0;
struct bitmap *bitmap;
bitmap = md_bitmap_create(mddev, slot);
bitmap = __bitmap_create(mddev, slot);
if (IS_ERR(bitmap)) {
rv = PTR_ERR(bitmap);
return ERR_PTR(rv);
......@@ -2043,20 +2277,19 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
return bitmap;
}
EXPORT_SYMBOL(get_bitmap_from_slot);
/* Loads the bitmap associated with slot and copies the resync information
* to our bitmap
*/
int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
sector_t *low, sector_t *high, bool clear_bits)
static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low,
sector_t *high, bool clear_bits)
{
int rv = 0, i, j;
sector_t block, lo = 0, hi = 0;
struct bitmap_counts *counts;
struct bitmap *bitmap;
bitmap = get_bitmap_from_slot(mddev, slot);
bitmap = bitmap_get_from_slot(mddev, slot);
if (IS_ERR(bitmap)) {
pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
return -1;
......@@ -2076,53 +2309,59 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
}
if (clear_bits) {
md_bitmap_update_sb(bitmap);
bitmap_update_sb(bitmap);
/* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs
* BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */
for (i = 0; i < bitmap->storage.file_pages; i++)
if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING))
set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
md_bitmap_unplug(bitmap);
__bitmap_unplug(bitmap);
}
md_bitmap_unplug(mddev->bitmap);
__bitmap_unplug(mddev->bitmap);
*low = lo;
*high = hi;
md_bitmap_free(bitmap);
return rv;
}
EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot);
static void bitmap_set_pages(void *data, unsigned long pages)
{
struct bitmap *bitmap = data;
bitmap->counts.pages = pages;
}
void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
{
unsigned long chunk_kb;
struct bitmap_storage *storage;
struct bitmap_counts *counts;
struct bitmap *bitmap = data;
bitmap_super_t *sb;
if (!bitmap)
return;
return -ENOENT;
sb = kmap_local_page(bitmap->storage.sb_page);
stats->sync_size = le64_to_cpu(sb->sync_size);
kunmap_local(sb);
counts = &bitmap->counts;
stats->missing_pages = counts->missing_pages;
stats->pages = counts->pages;
chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
"%lu%s chunk",
counts->pages - counts->missing_pages,
counts->pages,
(counts->pages - counts->missing_pages)
<< (PAGE_SHIFT - 10),
chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
chunk_kb ? "KB" : "B");
if (bitmap->storage.file) {
seq_printf(seq, ", file: ");
seq_file_path(seq, bitmap->storage.file, " \t\n");
}
storage = &bitmap->storage;
stats->file_pages = storage->file_pages;
stats->file = storage->file;
seq_printf(seq, "\n");
stats->behind_writes = atomic_read(&bitmap->behind_writes);
stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait);
stats->events_cleared = bitmap->events_cleared;
return 0;
}
int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, int init)
static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, bool init)
{
/* If chunk_size is 0, choose an appropriate chunk size.
* Then possibly allocate new storage space.
......@@ -2320,14 +2559,24 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
spin_unlock_irq(&bitmap->counts.lock);
if (!init) {
md_bitmap_unplug(bitmap);
__bitmap_unplug(bitmap);
bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
}
ret = 0;
err:
return ret;
}
EXPORT_SYMBOL_GPL(md_bitmap_resize);
/*
 * bitmap_operations ->resize hook: resize mddev->bitmap through
 * __bitmap_resize(). An array without a bitmap is not an error and
 * yields 0; otherwise the result of __bitmap_resize() is returned.
 */
static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
			 bool init)
{
	struct bitmap *bitmap = mddev->bitmap;

	return bitmap ? __bitmap_resize(bitmap, blocks, chunksize, init) : 0;
}
static ssize_t
location_show(struct mddev *mddev, char *page)
......@@ -2367,7 +2616,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
goto out;
}
md_bitmap_destroy(mddev);
bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
if (mddev->bitmap_info.file) {
struct file *f = mddev->bitmap_info.file;
......@@ -2377,7 +2626,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
} else {
/* No bitmap, OK to set a location */
long long offset;
struct bitmap *bitmap;
if (strncmp(buf, "none", 4) == 0)
/* nothing to be done */;
......@@ -2404,17 +2652,14 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
}
mddev->bitmap_info.offset = offset;
bitmap = md_bitmap_create(mddev, -1);
if (IS_ERR(bitmap)) {
rv = PTR_ERR(bitmap);
rv = bitmap_create(mddev, -1);
if (rv)
goto out;
}
mddev->bitmap = bitmap;
rv = md_bitmap_load(mddev);
rv = bitmap_load(mddev);
if (rv) {
mddev->bitmap_info.offset = 0;
md_bitmap_destroy(mddev);
bitmap_destroy(mddev);
goto out;
}
}
......@@ -2450,6 +2695,7 @@ space_show(struct mddev *mddev, char *page)
static ssize_t
space_store(struct mddev *mddev, const char *buf, size_t len)
{
struct bitmap *bitmap;
unsigned long sectors;
int rv;
......@@ -2460,8 +2706,8 @@ space_store(struct mddev *mddev, const char *buf, size_t len)
if (sectors == 0)
return -EINVAL;
if (mddev->bitmap &&
sectors < (mddev->bitmap->storage.bytes + 511) >> 9)
bitmap = mddev->bitmap;
if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9)
return -EFBIG; /* Bitmap is too big for this small space */
/* could make sure it isn't too big, but that isn't really
......@@ -2569,7 +2815,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
mddev_create_serial_pool(mddev, rdev);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
bitmap_update_sb(mddev->bitmap);
mddev_unlock_and_resume(mddev);
return len;
......@@ -2638,10 +2884,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t can_clear_show(struct mddev *mddev, char *page)
{
int len;
struct bitmap *bitmap;
spin_lock(&mddev->lock);
if (mddev->bitmap)
len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
"false" : "true"));
bitmap = mddev->bitmap;
if (bitmap)
len = sprintf(page, "%s\n", (bitmap->need_sync ? "false" :
"true"));
else
len = sprintf(page, "\n");
spin_unlock(&mddev->lock);
......@@ -2650,17 +2899,24 @@ static ssize_t can_clear_show(struct mddev *mddev, char *page)
static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len)
{
if (mddev->bitmap == NULL)
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return -ENOENT;
if (strncmp(buf, "false", 5) == 0)
mddev->bitmap->need_sync = 1;
else if (strncmp(buf, "true", 4) == 0) {
if (strncmp(buf, "false", 5) == 0) {
bitmap->need_sync = 1;
return len;
}
if (strncmp(buf, "true", 4) == 0) {
if (mddev->degraded)
return -EBUSY;
mddev->bitmap->need_sync = 0;
} else
return -EINVAL;
bitmap->need_sync = 0;
return len;
}
return -EINVAL;
}
static struct md_sysfs_entry bitmap_can_clear =
......@@ -2670,21 +2926,26 @@ static ssize_t
behind_writes_used_show(struct mddev *mddev, char *page)
{
ssize_t ret;
struct bitmap *bitmap;
spin_lock(&mddev->lock);
if (mddev->bitmap == NULL)
bitmap = mddev->bitmap;
if (!bitmap)
ret = sprintf(page, "0\n");
else
ret = sprintf(page, "%lu\n",
mddev->bitmap->behind_writes_used);
ret = sprintf(page, "%lu\n", bitmap->behind_writes_used);
spin_unlock(&mddev->lock);
return ret;
}
static ssize_t
behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len)
{
if (mddev->bitmap)
mddev->bitmap->behind_writes_used = 0;
struct bitmap *bitmap = mddev->bitmap;
if (bitmap)
bitmap->behind_writes_used = 0;
return len;
}
......@@ -2707,3 +2968,38 @@ const struct attribute_group md_bitmap_group = {
.name = "bitmap",
.attrs = md_bitmap_attrs,
};
/*
 * Operations table exposing this file's bitmap implementation. Every hook
 * points at a static function defined in this file; the table is attached
 * to an mddev by mddev_set_bitmap_ops().
 */
static struct bitmap_operations bitmap_ops = {
.enabled = bitmap_enabled,
.create = bitmap_create,
.resize = bitmap_resize,
.load = bitmap_load,
.destroy = bitmap_destroy,
.flush = bitmap_flush,
.write_all = bitmap_write_all,
.dirty_bits = bitmap_dirty_bits,
.unplug = bitmap_unplug,
.daemon_work = bitmap_daemon_work,
.wait_behind_writes = bitmap_wait_behind_writes,
.startwrite = bitmap_startwrite,
.endwrite = bitmap_endwrite,
.start_sync = bitmap_start_sync,
.end_sync = bitmap_end_sync,
.cond_end_sync = bitmap_cond_end_sync,
.close_sync = bitmap_close_sync,
.update_sb = bitmap_update_sb,
.get_stats = bitmap_get_stats,
.sync_with_cluster = bitmap_sync_with_cluster,
.get_from_slot = bitmap_get_from_slot,
.copy_from_slot = bitmap_copy_from_slot,
.set_pages = bitmap_set_pages,
.free = md_bitmap_free,
};
/*
 * Point @mddev at this file's bitmap operations table so that callers can
 * reach the bitmap through mddev->bitmap_ops.
 */
void mddev_set_bitmap_ops(struct mddev *mddev)
{
mddev->bitmap_ops = &bitmap_ops;
}
......@@ -7,81 +7,7 @@
#ifndef BITMAP_H
#define BITMAP_H 1
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
* Version 5 is currently set only for clustered devices
*/
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3
/*
* in-memory bitmap:
*
* Use 16 bit block counters to track pending writes to each "chunk".
* The 2 high order bits are special-purpose, the first is a flag indicating
* whether a resync is needed. The second is a flag indicating whether a
* resync is active.
* This means that the counter is actually 14 bits:
*
* +--------+--------+------------------------------------------------+
* | resync | resync | counter |
* | needed | active | |
* | (0-1) | (0-1) | (0-16383) |
* +--------+--------+------------------------------------------------+
*
* The "resync needed" bit is set when:
* a '1' bit is read from storage at startup.
* a write request fails on some drives
* a resync is aborted on a chunk with 'resync active' set
* It is cleared (and resync-active set) when a resync starts across all drives
* of the chunk.
*
*
* The "resync active" bit is set when:
* a resync is started on all drives, and resync_needed is set.
* resync_needed will be cleared (as long as resync_active wasn't already set).
* It is cleared when a resync completes.
*
* The counter counts pending write requests, plus the on-disk bit.
* When the counter is '1' and the resync bits are clear, the on-disk
* bit can be cleared as well, thus setting the counter to 0.
* When we set a bit, or in the counter (to start a write), if the field is
* 0, we first set the disk bit and set the counter to 1.
*
* If the counter is 0, the on-disk bit is clear and the stripe is clean
* Anything that dirties the stripe pushes the counter to 2 (at least)
* and sets the on-disk bit (lazily).
* If a periodic sweep finds the counter at 2, it is decremented to 1.
* If the sweep finds the counter at 1, the on-disk bit is cleared and the
* counter goes to zero.
*
* Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
* counters as a fallback when "page" memory cannot be allocated:
*
* Normal case (page memory allocated):
*
* page pointer (32-bit)
*
* [ ] ------+
* |
* +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
* c1 c2 c2048
*
* Hijacked case (page memory allocation failed):
*
* hijacked page pointer (32-bit)
*
* [ ][ ] (no page memory allocated)
* counter #1 (16-bit) counter #2 (16-bit)
*
*/
#ifdef __KERNEL__
#define PAGE_BITS (PAGE_SIZE << 3)
#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
#define BITMAP_MAGIC 0x6d746962
typedef __u16 bitmap_counter_t;
#define COUNTER_BITS 16
......@@ -91,26 +17,6 @@ typedef __u16 bitmap_counter_t;
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
/* how many counters per page? */
#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
#define BITMAP_BLOCK_SHIFT 9
#endif
/*
* bitmap structures:
*/
#define BITMAP_MAGIC 0x6d746962
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
......@@ -152,136 +58,58 @@ typedef struct bitmap_super_s {
* devices. For raid10 it is the size of the array.
*/
#ifdef __KERNEL__
/* the in-memory bitmap is represented by bitmap_pages */
struct bitmap_page {
/*
* map points to the actual memory page
*/
char *map;
/*
* in emergencies (when map cannot be alloced), hijack the map
* pointer and use it as two counters itself
*/
unsigned int hijacked:1;
/*
* If any counter in this page is '1' or '2' - and so could be
* cleared then that page is marked as 'pending'
*/
unsigned int pending:1;
/*
* count of dirty bits on the page
*/
unsigned int count:30;
};
/* the main bitmap structure - one per mddev */
struct bitmap {
struct bitmap_counts {
spinlock_t lock;
struct bitmap_page *bp;
unsigned long pages; /* total number of pages
* in the bitmap */
unsigned long missing_pages; /* number of pages
* not yet allocated */
unsigned long chunkshift; /* chunksize = 2^chunkshift
* (for bitops) */
unsigned long chunks; /* Total number of data
* chunks for the array */
} counts;
struct mddev *mddev; /* the md device that the bitmap is for */
__u64 events_cleared;
int need_sync;
struct bitmap_storage {
struct file *file; /* backing disk file */
struct page *sb_page; /* cached copy of the bitmap
* file superblock */
unsigned long sb_index;
struct page **filemap; /* list of cache pages for
* the file */
unsigned long *filemap_attr; /* attributes associated
* w/ filemap pages */
unsigned long file_pages; /* number of pages in the file*/
unsigned long bytes; /* total bytes in the bitmap */
} storage;
unsigned long flags;
int allclean;
atomic_t behind_writes;
unsigned long behind_writes_used; /* highest actual value at runtime */
/*
* the bitmap daemon - periodically wakes up and sweeps the bitmap
* file, cleaning up bits and flushing out pages to disk as necessary
*/
unsigned long daemon_lastrun; /* jiffies of last run */
unsigned long last_end_sync; /* when we lasted called end_sync to
* update bitmap with resync progress */
atomic_t pending_writes; /* pending writes to the bitmap file */
wait_queue_head_t write_wait;
wait_queue_head_t overflow_wait;
wait_queue_head_t behind_wait;
struct md_bitmap_stats {
u64 events_cleared;
int behind_writes;
bool behind_wait;
struct kernfs_node *sysfs_can_clear;
int cluster_slot; /* Slot offset for clustered env */
unsigned long missing_pages;
unsigned long file_pages;
unsigned long sync_size;
unsigned long pages;
struct file *file;
};
/* the bitmap API */
/* these are used only by md/bitmap */
struct bitmap *md_bitmap_create(struct mddev *mddev, int slot);
int md_bitmap_load(struct mddev *mddev);
void md_bitmap_flush(struct mddev *mddev);
void md_bitmap_destroy(struct mddev *mddev);
void md_bitmap_print_sb(struct bitmap *bitmap);
void md_bitmap_update_sb(struct bitmap *bitmap);
void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
int md_bitmap_setallbits(struct bitmap *bitmap);
void md_bitmap_write_all(struct bitmap *bitmap);
void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
/* these are exported */
int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int behind);
void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int success, int behind);
int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
void md_bitmap_close_sync(struct bitmap *bitmap);
void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
void md_bitmap_sync_with_cluster(struct mddev *mddev,
/*
 * Bitmap callback table, reached by callers through mddev->bitmap_ops.
 *
 * Most callbacks take the owning mddev. The ones taking "void *data"
 * (update_sb, get_stats, set_pages, free) act on an opaque bitmap handle:
 * callers pass either mddev->bitmap or a pointer obtained from
 * get_from_slot().
 */
struct bitmap_operations {
bool (*enabled)(struct mddev *mddev);
/* create/resize/load return an int that callers treat as an error when non-zero */
int (*create)(struct mddev *mddev, int slot);
int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize,
bool init);
int (*load)(struct mddev *mddev);
void (*destroy)(struct mddev *mddev);
void (*flush)(struct mddev *mddev);
void (*write_all)(struct mddev *mddev);
void (*dirty_bits)(struct mddev *mddev, unsigned long s,
unsigned long e);
/* raid1 passes sync == (current->bio_list == NULL) */
void (*unplug)(struct mddev *mddev, bool sync);
void (*daemon_work)(struct mddev *mddev);
void (*wait_behind_writes)(struct mddev *mddev);
int (*startwrite)(struct mddev *mddev, sector_t offset,
unsigned long sectors, bool behind);
void (*endwrite)(struct mddev *mddev, sector_t offset,
unsigned long sectors, bool success, bool behind);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force);
void (*close_sync)(struct mddev *mddev);
void (*update_sb)(void *data);
/* fills *stats; callers treat a non-zero return as "no stats available" */
int (*get_stats)(void *data, struct md_bitmap_stats *stats);
void (*sync_with_cluster)(struct mddev *mddev,
sector_t old_lo, sector_t old_hi,
sector_t new_lo, sector_t new_hi);
/* callers test the result with IS_ERR() and release it with ->free() */
void *(*get_from_slot)(struct mddev *mddev, int slot);
int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo,
sector_t *hi, bool clear_bits);
void (*set_pages)(void *data, unsigned long pages);
void (*free)(void *data);
};
void md_bitmap_unplug(struct bitmap *bitmap);
void md_bitmap_unplug_async(struct bitmap *bitmap);
void md_bitmap_daemon_work(struct mddev *mddev);
int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, int init);
struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
sector_t *lo, sector_t *hi, bool clear_bits);
void md_bitmap_free(struct bitmap *bitmap);
void md_bitmap_wait_behind_writes(struct mddev *mddev);
static inline bool md_bitmap_enabled(struct bitmap *bitmap)
{
return bitmap && bitmap->storage.filemap &&
!test_bit(BITMAP_STALE, &bitmap->flags);
}
#endif
/* the bitmap API */
void mddev_set_bitmap_ops(struct mddev *mddev);
#endif
......@@ -317,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread)
str, ret);
goto clear_bit;
}
ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
ret = mddev->bitmap_ops->copy_from_slot(mddev, slot, &lo, &hi, true);
if (ret) {
pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
goto clear_bit;
......@@ -497,7 +497,7 @@ static void process_suspend_info(struct mddev *mddev,
* we don't want to trigger lots of WARN.
*/
if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
md_bitmap_sync_with_cluster(mddev, cinfo->sync_low,
mddev->bitmap_ops->sync_with_cluster(mddev, cinfo->sync_low,
cinfo->sync_hi, lo, hi);
cinfo->sync_low = lo;
cinfo->sync_hi = hi;
......@@ -628,8 +628,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
break;
case BITMAP_RESIZE:
if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
ret = md_bitmap_resize(mddev->bitmap,
le64_to_cpu(msg->high), 0, 0);
ret = mddev->bitmap_ops->resize(mddev,
le64_to_cpu(msg->high),
0, false);
break;
default:
ret = -1;
......@@ -856,7 +857,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
}
/* Read the disk bitmap sb and check if it needs recovery */
ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
ret = mddev->bitmap_ops->copy_from_slot(mddev, i, &lo, &hi, false);
if (ret) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
lockres_free(bm_lockres);
......@@ -1143,13 +1144,16 @@ static int update_bitmap_size(struct mddev *mddev, sector_t size)
static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
{
struct bitmap_counts *counts;
char str[64];
struct dlm_lock_resource *bm_lockres;
struct bitmap *bitmap = mddev->bitmap;
unsigned long my_pages = bitmap->counts.pages;
void *bitmap = mddev->bitmap;
struct md_bitmap_stats stats;
unsigned long my_pages;
int i, rv;
rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
if (rv)
return rv;
my_pages = stats.pages;
/*
* We need to ensure all the nodes can grow to a larger
* bitmap size before make the reshaping.
......@@ -1159,17 +1163,22 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
return rv;
for (i = 0; i < mddev->bitmap_info.nodes; i++) {
struct dlm_lock_resource *bm_lockres;
char str[64];
if (i == md_cluster_ops->slot_number(mddev))
continue;
bitmap = get_bitmap_from_slot(mddev, i);
bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
bitmap = NULL;
goto out;
}
counts = &bitmap->counts;
rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
if (rv)
goto out;
/*
* If we can hold the bitmap lock of one node then
* the slot is not occupied, update the pages.
......@@ -1183,21 +1192,21 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
bm_lockres->flags |= DLM_LKF_NOQUEUE;
rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (!rv)
counts->pages = my_pages;
mddev->bitmap_ops->set_pages(bitmap, my_pages);
lockres_free(bm_lockres);
if (my_pages != counts->pages)
if (my_pages != stats.pages)
/*
* Let's revert the bitmap size if one node
* can't resize bitmap
*/
goto out;
md_bitmap_free(bitmap);
mddev->bitmap_ops->free(bitmap);
}
return 0;
out:
md_bitmap_free(bitmap);
mddev->bitmap_ops->free(bitmap);
update_bitmap_size(mddev, oldsize);
return -1;
}
......@@ -1207,24 +1216,27 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
*/
static int cluster_check_sync_size(struct mddev *mddev)
{
int i, rv;
bitmap_super_t *sb;
unsigned long my_sync_size, sync_size = 0;
int node_num = mddev->bitmap_info.nodes;
int current_slot = md_cluster_ops->slot_number(mddev);
struct bitmap *bitmap = mddev->bitmap;
char str[64];
int node_num = mddev->bitmap_info.nodes;
struct dlm_lock_resource *bm_lockres;
struct md_bitmap_stats stats;
void *bitmap = mddev->bitmap;
unsigned long sync_size = 0;
unsigned long my_sync_size;
char str[64];
int i, rv;
sb = kmap_atomic(bitmap->storage.sb_page);
my_sync_size = sb->sync_size;
kunmap_atomic(sb);
rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
if (rv)
return rv;
my_sync_size = stats.sync_size;
for (i = 0; i < node_num; i++) {
if (i == current_slot)
continue;
bitmap = get_bitmap_from_slot(mddev, i);
bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
return -1;
......@@ -1238,25 +1250,28 @@ static int cluster_check_sync_size(struct mddev *mddev)
bm_lockres = lockres_init(mddev, str, NULL, 1);
if (!bm_lockres) {
pr_err("md-cluster: Cannot initialize %s\n", str);
md_bitmap_free(bitmap);
mddev->bitmap_ops->free(bitmap);
return -1;
}
bm_lockres->flags |= DLM_LKF_NOQUEUE;
rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (!rv)
md_bitmap_update_sb(bitmap);
mddev->bitmap_ops->update_sb(bitmap);
lockres_free(bm_lockres);
sb = kmap_atomic(bitmap->storage.sb_page);
if (sync_size == 0)
sync_size = sb->sync_size;
else if (sync_size != sb->sync_size) {
kunmap_atomic(sb);
md_bitmap_free(bitmap);
rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
if (rv) {
mddev->bitmap_ops->free(bitmap);
return rv;
}
if (sync_size == 0) {
sync_size = stats.sync_size;
} else if (sync_size != stats.sync_size) {
mddev->bitmap_ops->free(bitmap);
return -1;
}
kunmap_atomic(sb);
md_bitmap_free(bitmap);
mddev->bitmap_ops->free(bitmap);
}
return (my_sync_size == sync_size) ? 0 : -1;
......@@ -1585,7 +1600,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
if (sn == (cinfo->slot_number - 1))
continue;
err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
err = mddev->bitmap_ops->copy_from_slot(mddev, sn, &lo, &hi, false);
if (err) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
goto out;
......
......@@ -546,137 +546,30 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n
return 0;
}
/*
* Generic flush handling for md
*/
/*
 * Completion handler for one per-rdev flush bio: submit_flushes() installs
 * it as bi_end_io and stores the rdev in bi_private.
 */
static void md_end_flush(struct bio *bio)
{
/* Pull everything we need out of the bio before the reference is dropped. */
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
bio_put(bio);
rdev_dec_pending(rdev, mddev);
/* Whoever drops flush_pending to zero hands the request on to flush_work. */
if (atomic_dec_and_test(&mddev->flush_pending))
/* The pre-request flush has finished */
queue_work(md_wq, &mddev->flush_work);
}
static void md_submit_flush_data(struct work_struct *ws);
static void submit_flushes(struct work_struct *ws)
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;
mddev->start_flush = ktime_get_boottime();
INIT_WORK(&mddev->flush_work, md_submit_flush_data);
atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags)) {
struct bio *bi;
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
bi = bio_alloc_bioset(rdev->bdev, 0,
REQ_OP_WRITE | REQ_PREFLUSH,
GFP_NOIO, &mddev->bio_set);
bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
atomic_inc(&mddev->flush_pending);
submit_bio(bi);
rcu_read_lock();
}
rcu_read_unlock();
if (atomic_dec_and_test(&mddev->flush_pending))
queue_work(md_wq, &mddev->flush_work);
}
static void md_submit_flush_data(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct bio *bio = mddev->flush_bio;
struct bio *new;
/*
* must reset flush_bio before calling into md_handle_request to avoid a
* deadlock, because other bios passed md_handle_request suspend check
* could wait for this and below md_handle_request could wait for those
* bios because of suspend check
*/
spin_lock_irq(&mddev->lock);
mddev->prev_flush_start = mddev->start_flush;
mddev->flush_bio = NULL;
spin_unlock_irq(&mddev->lock);
wake_up(&mddev->sb_wait);
if (bio->bi_iter.bi_size == 0) {
/* an empty barrier - all done */
bio_endio(bio);
} else {
bio->bi_opf &= ~REQ_PREFLUSH;
/*
* make_requst() will never return error here, it only
* returns error in raid5_make_request() by dm-raid.
* Since dm always splits data and flush operation into
* two separate io, io size of flush submitted by dm
* always is 0, make_request() will not be called here.
* md_flush_reqeust() should be called under md_handle_request() and
* 'active_io' is already grabbed. Hence it's safe to get rdev directly
* without rcu protection.
*/
if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio)))
bio_io_error(bio);
}
WARN_ON(percpu_ref_is_zero(&mddev->active_io));
/* The pair is percpu_ref_get() from md_flush_request() */
percpu_ref_put(&mddev->active_io);
}
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
continue;
/*
* Manages consolidation of flushes and submitting any flushes needed for
* a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
* being finished in another context. Returns false if the flushing is
* complete but still needs the I/O portion of the bio to be processed.
*/
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
ktime_t req_start = ktime_get_boottime();
spin_lock_irq(&mddev->lock);
/* flush requests wait until ongoing flush completes,
* hence coalescing all the pending requests.
*/
wait_event_lock_irq(mddev->sb_wait,
!mddev->flush_bio ||
ktime_before(req_start, mddev->prev_flush_start),
mddev->lock);
/* new request after previous flush is completed */
if (ktime_after(req_start, mddev->prev_flush_start)) {
WARN_ON(mddev->flush_bio);
/*
* Grab a reference to make sure mddev_suspend() will wait for
* this flush to be done.
*
* md_flush_reqeust() is called under md_handle_request() and
* 'active_io' is already grabbed, hence percpu_ref_is_zero()
* won't pass, percpu_ref_tryget_live() can't be used because
* percpu_ref_kill() can be called by mddev_suspend()
* concurrently.
*/
WARN_ON(percpu_ref_is_zero(&mddev->active_io));
percpu_ref_get(&mddev->active_io);
mddev->flush_bio = bio;
spin_unlock_irq(&mddev->lock);
INIT_WORK(&mddev->flush_work, submit_flushes);
queue_work(md_wq, &mddev->flush_work);
return true;
new = bio_alloc_bioset(rdev->bdev, 0,
REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
&mddev->bio_set);
bio_chain(new, bio);
submit_bio(new);
}
/* flush was performed for some other bio while we waited. */
spin_unlock_irq(&mddev->lock);
if (bio->bi_iter.bi_size == 0) {
/* pure flush without data - all done */
if (bio_sectors(bio) == 0) {
bio_endio(bio);
return true;
}
......@@ -763,7 +656,6 @@ int mddev_init(struct mddev *mddev)
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->sync_seq, 0);
spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
......@@ -772,6 +664,7 @@ int mddev_init(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
mddev_set_bitmap_ops(mddev);
INIT_WORK(&mddev->sync_work, md_start_sync);
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
......@@ -1372,6 +1265,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
return ret;
}
static u64 md_bitmap_events_cleared(struct mddev *mddev)
{
struct md_bitmap_stats stats;
int err;
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return 0;
return stats.events_cleared;
}
/*
* validate_super for 0.90.0
* note: we are not using "freshest" for 0.9 superblock
......@@ -1464,7 +1369,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
/* if adding to array with a bitmap, then we can accept an
* older device ... but not too old.
*/
if (ev1 < mddev->bitmap->events_cleared)
if (ev1 < md_bitmap_events_cleared(mddev))
return 0;
if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
......@@ -1991,7 +1896,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
/* If adding to array with a bitmap, then we can accept an
* older device, but not too old.
*/
if (ev1 < mddev->bitmap->events_cleared)
if (ev1 < md_bitmap_events_cleared(mddev))
return 0;
if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
......@@ -2323,7 +2228,6 @@ super_1_allow_new_offset(struct md_rdev *rdev,
unsigned long long new_offset)
{
/* All necessary checks on new >= old have been done */
struct bitmap *bitmap;
if (new_offset >= rdev->data_offset)
return 1;
......@@ -2340,11 +2244,18 @@ super_1_allow_new_offset(struct md_rdev *rdev,
*/
if (rdev->sb_start + (32+4)*2 > new_offset)
return 0;
bitmap = rdev->mddev->bitmap;
if (bitmap && !rdev->mddev->bitmap_info.file &&
rdev->sb_start + rdev->mddev->bitmap_info.offset +
bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
if (!rdev->mddev->bitmap_info.file) {
struct mddev *mddev = rdev->mddev;
struct md_bitmap_stats stats;
int err;
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (!err && rdev->sb_start + mddev->bitmap_info.offset +
stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
return 0;
}
if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
return 0;
......@@ -2820,7 +2731,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
md_bitmap_update_sb(mddev->bitmap);
mddev->bitmap_ops->update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
if (rdev->sb_loaded != 1)
continue; /* no noise on spare devices */
......@@ -4680,17 +4591,23 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
while (*buf) {
chunk = end_chunk = simple_strtoul(buf, &end, 0);
if (buf == end) break;
if (buf == end)
break;
if (*end == '-') { /* range */
buf = end + 1;
end_chunk = simple_strtoul(buf, &end, 0);
if (buf == end) break;
if (buf == end)
break;
}
if (*end && !isspace(*end)) break;
md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
if (*end && !isspace(*end))
break;
mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
buf = skip_spaces(end);
}
md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
out:
mddev_unlock(mddev);
return len;
......@@ -6206,16 +6123,10 @@ int md_run(struct mddev *mddev)
}
if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
struct bitmap *bitmap;
bitmap = md_bitmap_create(mddev, -1);
if (IS_ERR(bitmap)) {
err = PTR_ERR(bitmap);
err = mddev->bitmap_ops->create(mddev, -1);
if (err)
pr_warn("%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
} else
mddev->bitmap = bitmap;
}
if (err)
goto bitmap_abort;
......@@ -6285,7 +6196,7 @@ int md_run(struct mddev *mddev)
pers->free(mddev, mddev->private);
mddev->private = NULL;
module_put(pers->owner);
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
abort:
bioset_exit(&mddev->io_clone_set);
exit_sync_set:
......@@ -6304,9 +6215,10 @@ int do_md_run(struct mddev *mddev)
err = md_run(mddev);
if (err)
goto out;
err = md_bitmap_load(mddev);
err = mddev->bitmap_ops->load(mddev);
if (err) {
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
goto out;
}
......@@ -6450,7 +6362,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
md_bitmap_flush(mddev);
mddev->bitmap_ops->flush(mddev);
if (md_is_rdwr(mddev) &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
......@@ -6477,7 +6390,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
static void mddev_detach(struct mddev *mddev)
{
md_bitmap_wait_behind_writes(mddev);
mddev->bitmap_ops->wait_behind_writes(mddev);
if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
......@@ -6492,7 +6405,8 @@ static void mddev_detach(struct mddev *mddev)
static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
......@@ -7270,22 +7184,19 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = 0;
if (mddev->pers) {
if (fd >= 0) {
struct bitmap *bitmap;
err = mddev->bitmap_ops->create(mddev, -1);
if (!err)
err = mddev->bitmap_ops->load(mddev);
bitmap = md_bitmap_create(mddev, -1);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
err = md_bitmap_load(mddev);
} else
err = PTR_ERR(bitmap);
if (err) {
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
fd = -1;
}
} else if (fd < 0) {
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
}
}
if (fd < 0) {
struct file *f = mddev->bitmap_info.file;
if (f) {
......@@ -7554,7 +7465,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
goto err;
}
if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
struct bitmap *bitmap;
/* add the bitmap */
if (mddev->bitmap) {
rv = -EEXIST;
......@@ -7568,24 +7478,24 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
bitmap = md_bitmap_create(mddev, -1);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
rv = md_bitmap_load(mddev);
} else
rv = PTR_ERR(bitmap);
rv = mddev->bitmap_ops->create(mddev, -1);
if (!rv)
rv = mddev->bitmap_ops->load(mddev);
if (rv)
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
} else {
/* remove the bitmap */
if (!mddev->bitmap) {
rv = -ENOENT;
struct md_bitmap_stats stats;
rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (rv)
goto err;
}
if (mddev->bitmap->storage.file) {
if (stats.file) {
rv = -EINVAL;
goto err;
}
if (mddev->bitmap_info.nodes) {
/* hold PW on all the bitmap lock */
if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
......@@ -7600,7 +7510,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
module_put(md_cluster_mod);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
md_bitmap_destroy(mddev);
mddev->bitmap_ops->destroy(mddev);
mddev->bitmap_info.offset = 0;
}
}
......@@ -8370,6 +8280,33 @@ static void md_seq_stop(struct seq_file *seq, void *v)
spin_unlock(&all_mddevs_lock);
}
static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
{
struct md_bitmap_stats stats;
unsigned long used_pages;
unsigned long chunk_kb;
int err;
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (err)
return;
chunk_kb = mddev->bitmap_info.chunksize >> 10;
used_pages = stats.pages - stats.missing_pages;
seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
chunk_kb ? "KB" : "B");
if (stats.file) {
seq_puts(seq, ", file: ");
seq_file_path(seq, stats.file, " \t\n");
}
seq_putc(seq, '\n');
}
static int md_seq_show(struct seq_file *seq, void *v)
{
struct mddev *mddev;
......@@ -8453,7 +8390,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
} else
seq_printf(seq, "\n ");
md_bitmap_status(seq, mddev->bitmap);
md_bitmap_status(seq, mddev);
seq_printf(seq, "\n");
}
......@@ -8668,7 +8605,6 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */
flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
......@@ -9506,7 +9442,7 @@ static void md_start_sync(struct work_struct *ws)
* stored on all devices. So make sure all bitmap pages get written.
*/
if (spares)
md_bitmap_write_all(mddev->bitmap);
mddev->bitmap_ops->write_all(mddev);
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
"reshape" : "resync";
......@@ -9594,7 +9530,7 @@ static void unregister_sync_thread(struct mddev *mddev)
void md_check_recovery(struct mddev *mddev)
{
if (mddev->bitmap)
md_bitmap_daemon_work(mddev);
mddev->bitmap_ops->daemon_work(mddev);
if (signal_pending(current)) {
if (mddev->pers->sync_request && !mddev->external) {
......@@ -9965,7 +9901,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
if (ret)
pr_info("md-cluster: resize failed\n");
else
md_bitmap_update_sb(mddev->bitmap);
mddev->bitmap_ops->update_sb(mddev->bitmap);
}
/* Check for change of roles in the active devices */
......
......@@ -535,7 +535,8 @@ struct mddev {
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
struct bitmap *bitmap; /* the bitmap for the device */
void *bitmap; /* the bitmap for the device */
struct bitmap_operations *bitmap_ops;
struct {
struct file *file; /* the bitmap file */
loff_t offset; /* offset from superblock of
......@@ -571,16 +572,6 @@ struct mddev {
*/
struct bio_set io_clone_set;
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
* the rest of the request (without the REQ_PREFLUSH flag).
*/
struct bio *flush_bio;
atomic_t flush_pending;
ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed
* flush was started.
*/
struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
mempool_t *serial_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
......
......@@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* If bitmap is not enabled, it's safe to submit the io directly, and
* this can get optimal performance.
*/
if (!md_bitmap_enabled(mddev->bitmap)) {
if (!mddev->bitmap_ops->enabled(mddev)) {
raid1_submit_write(bio);
return true;
}
......@@ -166,12 +166,9 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* while current io submission must wait for bitmap io to be done. In order to
* avoid such deadlock, submit bitmap io asynchronously.
*/
static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
static inline void raid1_prepare_flush_writes(struct mddev *mddev)
{
if (current->bio_list)
md_bitmap_unplug_async(bitmap);
else
md_bitmap_unplug(bitmap);
mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL);
}
/*
......
......@@ -411,18 +411,20 @@ static void raid1_end_read_request(struct bio *bio)
/*
 * Final bookkeeping for a finished write r1bio: release the write-behind
 * master bio if one was used, report the write's outcome to the bitmap, and
 * call md_write_end() once for the request.
 */
static void close_write(struct r1bio *r1_bio)
{
	struct mddev *mddev = r1_bio->mddev;

	/* it really is the end of this request */
	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
		bio_free_pages(r1_bio->behind_master_bio);
		bio_put(r1_bio->behind_master_bio);
		r1_bio->behind_master_bio = NULL;
	}

	/* clear the bitmap if all writes complete successfully */
	mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
				    !test_bit(R1BIO_Degraded, &r1_bio->state),
				    test_bit(R1BIO_BehindIO, &r1_bio->state));
	md_write_end(mddev);
}
static void r1_bio_write_done(struct r1bio *r1_bio)
......@@ -894,7 +896,7 @@ static void wake_up_barrier(struct r1conf *conf)
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
raid1_prepare_flush_writes(conf->mddev->bitmap);
raid1_prepare_flush_writes(conf->mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */
......@@ -1311,13 +1313,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
struct bio *read_bio;
struct bitmap *bitmap = mddev->bitmap;
const enum req_op op = bio_op(bio);
const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
int rdisk;
bool r1bio_existed = !!r1_bio;
char b[BDEVNAME_SIZE];
/*
* If r1_bio is set, we are blocking the raid1d thread
......@@ -1326,16 +1326,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
*/
gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
if (r1bio_existed) {
/* Need to get the block device name carefully */
struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
if (rdev)
snprintf(b, sizeof(b), "%pg", rdev->bdev);
else
strcpy(b, "???");
}
/*
* Still need barrier for READ in case that whole
* array is frozen.
......@@ -1357,15 +1347,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
* used and no empty request is available.
*/
rdisk = read_balance(conf, r1_bio, &max_sectors);
if (rdisk < 0) {
/* couldn't find anywhere to read from */
if (r1bio_existed) {
pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
if (r1bio_existed)
pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
mdname(mddev),
b,
(unsigned long long)r1_bio->sector);
}
conf->mirrors[r1_bio->read_disk].rdev->bdev,
r1_bio->sector);
raid_end_bio_io(r1_bio);
return;
}
......@@ -1377,15 +1365,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
(unsigned long long)r1_bio->sector,
mirror->rdev->bdev);
if (test_bit(WriteMostly, &mirror->rdev->flags) &&
bitmap) {
if (test_bit(WriteMostly, &mirror->rdev->flags)) {
/*
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
*/
mddev_add_trace_msg(mddev, "raid1 wait behind writes");
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
mddev->bitmap_ops->wait_behind_writes(mddev);
}
if (max_sectors < bio_sectors(bio)) {
......@@ -1426,7 +1412,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
int i, disks;
struct bitmap *bitmap = mddev->bitmap;
unsigned long flags;
struct md_rdev *blocked_rdev;
int first_clone;
......@@ -1579,7 +1564,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* at a time and thus needs a new bio that can fit the whole payload
* of this bio in page-sized chunks.
*/
if (write_behind && bitmap)
if (write_behind && mddev->bitmap)
max_sectors = min_t(int, max_sectors,
BIO_MAX_VECS * (PAGE_SIZE >> 9));
if (max_sectors < bio_sectors(bio)) {
......@@ -1606,18 +1591,22 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
continue;
if (first_clone) {
unsigned long max_write_behind =
mddev->bitmap_info.max_write_behind;
struct md_bitmap_stats stats;
int err;
/* do behind I/O ?
* Not if there are too many, or cannot
* allocate memory, or a reader on WriteMostly
* is waiting for behind writes to flush */
if (bitmap && write_behind &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
!waitqueue_active(&bitmap->behind_wait)) {
err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
if (!err && write_behind && !stats.behind_wait &&
stats.behind_writes < max_write_behind)
alloc_behind_master_bio(r1_bio, bio);
}
md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
mddev->bitmap_ops->startwrite(
mddev, r1_bio->sector, r1_bio->sectors,
test_bit(R1BIO_BehindIO, &r1_bio->state));
first_clone = 0;
}
......@@ -2036,7 +2025,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
/* make sure these bits don't get cleared. */
do {
md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks);
s += sync_blocks;
sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0);
......@@ -2765,7 +2754,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
int wonly = -1;
int write_targets = 0, read_targets = 0;
sector_t sync_blocks;
int still_degraded = 0;
bool still_degraded = false;
int good_sectors = RESYNC_SECTORS;
int min_bad = 0; /* number of sectors that are bad in all devices */
int idx = sector_to_idx(sector_nr);
......@@ -2782,12 +2771,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* We can find the current address in mddev->curr_resync
*/
if (mddev->curr_resync < max_sector) /* aborted */
md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
&sync_blocks);
else /* completed sync */
conf->fullsync = 0;
md_bitmap_close_sync(mddev->bitmap);
mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
if (mddev_is_clustered(mddev)) {
......@@ -2807,7 +2796,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* before building a request, check if we can skip these blocks..
* This call to bitmap_start_sync doesn't actually record anything
*/
if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block, and probably several more */
*skipped = 1;
......@@ -2825,9 +2814,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* sector_nr + two times RESYNC_SECTORS
*/
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
if (raise_barrier(conf, sector_nr))
return 0;
......@@ -2858,7 +2847,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
still_degraded = 1;
still_degraded = true;
} else if (!test_bit(In_sync, &rdev->flags)) {
bio->bi_opf = REQ_OP_WRITE;
bio->bi_end_io = end_sync_write;
......@@ -2982,7 +2971,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
if (sync_blocks == 0) {
if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
&sync_blocks, still_degraded) &&
!conf->fullsync &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
......@@ -3307,14 +3296,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize = raid1_size(mddev, sectors, 0);
int ret;
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
if (mddev->bitmap) {
int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
return ret;
}
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
......
......@@ -426,12 +426,13 @@ static void raid10_end_read_request(struct bio *bio)
static void close_write(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
/* clear the bitmap if all writes complete successfully */
md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
r10_bio->sectors,
mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
!test_bit(R10BIO_Degraded, &r10_bio->state),
0);
md_write_end(r10_bio->mddev);
false);
md_write_end(mddev);
}
static void one_write_done(struct r10bio *r10_bio)
......@@ -884,7 +885,7 @@ static void flush_pending_writes(struct r10conf *conf)
__set_current_state(TASK_RUNNING);
blk_start_plug(&plug);
raid1_prepare_flush_writes(conf->mddev->bitmap);
raid1_prepare_flush_writes(conf->mddev);
wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
......@@ -1100,7 +1101,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
raid1_prepare_flush_writes(mddev->bitmap);
raid1_prepare_flush_writes(mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */
......@@ -1492,7 +1493,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors,
false);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
......@@ -2465,7 +2467,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
s = PAGE_SIZE >> 9;
rdev = conf->mirrors[dr].rdev;
addr = r10_bio->devs[0].addr + sect,
addr = r10_bio->devs[0].addr + sect;
ok = sync_page_io(rdev,
addr,
s << 9,
......@@ -3192,13 +3194,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev,
mddev->curr_resync,
&sync_blocks);
else for (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
md_bitmap_end_sync(mddev->bitmap, sect,
&sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev, sect,
&sync_blocks);
}
} else {
/* completed sync */
......@@ -3218,7 +3222,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
conf->fullsync = 0;
}
md_bitmap_close_sync(mddev->bitmap);
mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
......@@ -3287,10 +3291,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio = NULL;
for (i = 0 ; i < conf->geo.raid_disks; i++) {
int still_degraded;
bool still_degraded;
struct r10bio *rb2;
sector_t sect;
int must_sync;
bool must_sync;
int any_working;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
......@@ -3307,7 +3311,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (!mrdev && !mreplace)
continue;
still_degraded = 0;
still_degraded = false;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
......@@ -3320,8 +3324,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* we only need to recover the block if it is set in
* the bitmap
*/
must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, 1);
must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
&sync_blocks,
true);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
......@@ -3359,12 +3364,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
struct md_rdev *rdev = conf->mirrors[j].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
still_degraded = 1;
/*
 * A missing or Faulty member means the array is still degraded.
 * This is the bool form of the old "= 1"; it must be true, since
 * the flag was already initialised to false before this loop and
 * is passed to start_sync() right after it.
 */
still_degraded = true;
break;
}
}
must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
&sync_blocks, still_degraded);
any_working = 0;
......@@ -3538,12 +3543,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* safety reason, which ensures curr_resync_completed is
* updated in bitmap_cond_end_sync.
*/
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
&sync_blocks,
mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) {
/* We can skip this block */
......@@ -4190,6 +4196,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
*/
struct r10conf *conf = mddev->private;
sector_t oldsize, size;
int ret;
if (mddev->reshape_position != MaxSector)
return -EBUSY;
......@@ -4202,11 +4209,11 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
if (mddev->external_size &&
mddev->array_sectors > size)
return -EINVAL;
if (mddev->bitmap) {
int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
ret = mddev->bitmap_ops->resize(mddev, size, 0, false);
if (ret)
return ret;
}
md_set_array_sectors(mddev, size);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > oldsize) {
......@@ -4472,7 +4479,7 @@ static int raid10_start_reshape(struct mddev *mddev)
newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
if (!mddev_is_clustered(mddev)) {
ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
goto abort;
else
......@@ -4487,20 +4494,20 @@ static int raid10_start_reshape(struct mddev *mddev)
/*
* some node is already performing reshape, and no need to
* call md_bitmap_resize again since it should be called when
* call bitmap_ops->resize again since it should be called when
* receiving BITMAP_RESIZE msg
*/
if ((sb && (le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
goto out;
ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
goto abort;
ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret) {
md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
goto abort;
}
}
......
......@@ -313,10 +313,10 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf),
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
false);
}
}
}
......@@ -2798,7 +2798,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
{
struct r5l_log *log = READ_ONCE(conf->log);
int i;
int do_wakeup = 0;
sector_t tree_index;
void __rcu **pslot;
uintptr_t refcount;
......@@ -2815,7 +2814,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
for (i = sh->disks; i--; ) {
clear_bit(R5_InJournal, &sh->dev[i].flags);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
do_wakeup = 1;
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
}
/*
......@@ -2828,9 +2827,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
if (atomic_dec_and_test(&conf->pending_full_writes))
md_wakeup_thread(conf->mddev->thread);
if (do_wakeup)
wake_up(&conf->wait_for_overlap);
spin_lock_irq(&log->stripe_in_journal_lock);
list_del_init(&sh->r5c);
spin_unlock_irq(&log->stripe_in_journal_lock);
......
......@@ -2337,7 +2337,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_Overlap, &dev->flags))
wake_up(&sh->raid_conf->wait_for_overlap);
wake_up_bit(&dev->flags, R5_Overlap);
}
}
local_unlock(&conf->percpu->lock);
......@@ -3473,7 +3473,7 @@ static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
* With PPL only writes to consecutive data chunks within a
* stripe are allowed because for a single stripe_head we can
* only have one PPL entry at a time, which describes one data
* range. Not really an overlap, but wait_for_overlap can be
* range. Not really an overlap, but R5_Overlap can be
* used to handle this.
*/
sector_t sector;
......@@ -3563,8 +3563,8 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
*/
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
RAID5_STRIPE_SECTORS(conf), false);
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
......@@ -3652,7 +3652,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
......@@ -3663,8 +3663,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0, 0);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
false, false);
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
......@@ -3696,7 +3697,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
if (bi)
s->to_read--;
while (bi && bi->bi_iter.bi_sector <
......@@ -3709,8 +3710,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0, 0);
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
false, false);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
......@@ -3734,7 +3736,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
......@@ -4059,10 +4061,10 @@ static void handle_stripe_clean_event(struct r5conf *conf,
bio_endio(wbi);
wbi = wbi2;
}
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
RAID5_STRIPE_SECTORS(conf),
conf->mddev->bitmap_ops->endwrite(conf->mddev,
sh->sector, RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
false);
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
......@@ -4875,7 +4877,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
{
struct stripe_head *sh, *next;
int i;
int do_wakeup = 0;
list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
......@@ -4911,7 +4912,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&sh->stripe_lock);
for (i = 0; i < sh->disks; i++) {
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
do_wakeup = 1;
wake_up_bit(&sh->dev[i].flags, R5_Overlap);
sh->dev[i].flags = head_sh->dev[i].flags &
(~((1 << R5_WriteError) | (1 << R5_Overlap)));
}
......@@ -4925,12 +4926,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&head_sh->stripe_lock);
for (i = 0; i < head_sh->disks; i++)
if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
do_wakeup = 1;
wake_up_bit(&head_sh->dev[i].flags, R5_Overlap);
if (head_sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &head_sh->state);
if (do_wakeup)
wake_up(&head_sh->raid_conf->wait_for_overlap);
}
static void handle_stripe(struct stripe_head *sh)
......@@ -5196,7 +5194,7 @@ static void handle_stripe(struct stripe_head *sh)
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
}
/* If the failed drives are just a ReadError, then we might need
......@@ -5259,7 +5257,7 @@ static void handle_stripe(struct stripe_head *sh)
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
......@@ -5753,12 +5751,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
int d;
again:
sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
raid5_release_stripe(sh);
schedule();
wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap,
TASK_UNINTERRUPTIBLE);
goto again;
}
clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
......@@ -5770,12 +5767,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
raid5_release_stripe(sh);
schedule();
wait_on_bit(&sh->dev[d].flags, R5_Overlap,
TASK_UNINTERRUPTIBLE);
goto again;
}
}
set_bit(STRIPE_DISCARD, &sh->state);
finish_wait(&conf->wait_for_overlap, &w);
sh->overwrite_disks = 0;
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
......@@ -5788,13 +5785,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
for (d = 0;
d < conf->raid_disks - conf->max_degraded;
for (d = 0; d < conf->raid_disks - conf->max_degraded;
d++)
md_bitmap_startwrite(mddev->bitmap,
sh->sector,
RAID5_STRIPE_SECTORS(conf),
0);
mddev->bitmap_ops->startwrite(mddev, sh->sector,
RAID5_STRIPE_SECTORS(conf), false);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
......@@ -5855,7 +5849,6 @@ static int add_all_stripe_bios(struct r5conf *conf,
struct bio *bi, int forwrite, int previous)
{
int dd_idx;
int ret = 1;
spin_lock_irq(&sh->stripe_lock);
......@@ -5871,13 +5864,18 @@ static int add_all_stripe_bios(struct r5conf *conf,
if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
set_bit(R5_Overlap, &dev->flags);
ret = 0;
continue;
spin_unlock_irq(&sh->stripe_lock);
raid5_release_stripe(sh);
/* release batch_last before wait to avoid risk of deadlock */
if (ctx->batch_last) {
raid5_release_stripe(ctx->batch_last);
ctx->batch_last = NULL;
}
md_wakeup_thread(conf->mddev->thread);
wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE);
return 0;
}
}
if (!ret)
goto out;
for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
struct r5dev *dev = &sh->dev[dd_idx];
......@@ -5894,9 +5892,8 @@ static int add_all_stripe_bios(struct r5conf *conf,
RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
}
out:
spin_unlock_irq(&sh->stripe_lock);
return ret;
return 1;
}
enum reshape_loc {
......@@ -5992,17 +5989,17 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
goto out_release;
}
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
/*
* Stripe is busy expanding or add failed due to
* overlap. Flush everything and wait a while.
*/
if (test_bit(STRIPE_EXPANDING, &sh->state)) {
md_wakeup_thread(mddev->thread);
ret = STRIPE_SCHEDULE_AND_RETRY;
goto out_release;
}
if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
ret = STRIPE_RETRY;
goto out;
}
if (stripe_can_batch(sh)) {
stripe_add_to_batch_list(conf, sh, ctx->batch_last);
if (ctx->batch_last)
......@@ -6073,6 +6070,7 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
bool on_wq;
struct r5conf *conf = mddev->private;
sector_t logical_sector;
struct stripe_request_ctx ctx = {};
......@@ -6146,11 +6144,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
* sequential IO pattern. We don't bother with the optimization when
* reshaping as the performance benefit is not worth the complexity.
*/
if (likely(conf->reshape_progress == MaxSector))
if (likely(conf->reshape_progress == MaxSector)) {
logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
on_wq = false;
} else {
add_wait_queue(&conf->wait_for_reshape, &wait);
on_wq = true;
}
s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
add_wait_queue(&conf->wait_for_overlap, &wait);
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
......@@ -6161,6 +6163,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
continue;
if (res == STRIPE_SCHEDULE_AND_RETRY) {
WARN_ON_ONCE(!on_wq);
/*
* Must release the reference to batch_last before
* scheduling and waiting for work to be done,
......@@ -6185,7 +6188,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
logical_sector = ctx.first_sector +
(s << RAID5_STRIPE_SHIFT(conf));
}
remove_wait_queue(&conf->wait_for_overlap, &wait);
if (unlikely(on_wq))
remove_wait_queue(&conf->wait_for_reshape, &wait);
if (ctx.batch_last)
raid5_release_stripe(ctx.batch_last);
......@@ -6338,7 +6342,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
: (safepos < writepos && readpos > writepos)) ||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes)==0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
......@@ -6364,7 +6368,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
......@@ -6447,7 +6451,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
(sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes) == 0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
......@@ -6473,7 +6477,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
......@@ -6486,7 +6490,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
sector_t sync_blocks;
int still_degraded = 0;
bool still_degraded = false;
int i;
if (sector_nr >= max_sector) {
......@@ -6498,17 +6502,17 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (mddev->curr_resync < max_sector) /* aborted */
md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1);
mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
&sync_blocks);
else /* completed sync */
conf->fullsync = 0;
md_bitmap_close_sync(mddev->bitmap);
mddev->bitmap_ops->close_sync(mddev);
return 0;
}
/* Allow raid5_quiesce to complete */
wait_event(conf->wait_for_overlap, conf->quiesce != 2);
wait_event(conf->wait_for_reshape, conf->quiesce != 2);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
......@@ -6531,7 +6535,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
true) &&
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
......@@ -6540,7 +6545,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
sh = raid5_get_active_stripe(conf, NULL, sector_nr,
R5_GAS_NOBLOCK);
......@@ -6559,10 +6564,11 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
struct md_rdev *rdev = conf->disks[i].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
still_degraded = true;
}
md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
......@@ -6767,7 +6773,7 @@ static void raid5d(struct md_thread *thread)
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
md_bitmap_unplug(mddev->bitmap);
mddev->bitmap_ops->unplug(mddev, true);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
activate_bit_delay(conf, conf->temp_inactive_list);
......@@ -7492,7 +7498,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
init_waitqueue_head(&conf->wait_for_reshape);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
......@@ -8312,6 +8318,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
*/
sector_t newsize;
struct r5conf *conf = mddev->private;
int ret;
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return -EINVAL;
......@@ -8320,11 +8327,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
if (mddev->bitmap) {
int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false);
if (ret)
return ret;
}
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
......@@ -8550,7 +8557,7 @@ static void end_reshape(struct r5conf *conf)
!test_bit(In_sync, &rdev->flags))
rdev->recovery_offset = MaxSector;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
mddev_update_io_opt(conf->mddev,
conf->raid_disks - conf->max_degraded);
......@@ -8614,13 +8621,13 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
conf->quiesce = 1;
unlock_all_device_hash_locks_irq(conf);
/* allow reshape to continue */
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
} else {
/* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
unlock_all_device_hash_locks_irq(conf);
}
log_quiesce(conf, quiesce);
......@@ -8939,7 +8946,7 @@ static void raid5_prepare_suspend(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
wake_up(&conf->wait_for_overlap);
wake_up(&conf->wait_for_reshape);
}
static struct md_personality raid6_personality =
......
......@@ -668,7 +668,7 @@ struct r5conf {
struct llist_head released_stripes;
wait_queue_head_t wait_for_quiescent;
wait_queue_head_t wait_for_stripe;
wait_queue_head_t wait_for_overlap;
wait_queue_head_t wait_for_reshape;
unsigned long cache_state;
struct shrinker *shrinker;
int pool_size; /* number of disks in stripeheads in pool */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment