Commit 2a013e37 authored by Linus Torvalds

Merge tag 'md/4.3' of git://neil.brown.name/md

Pull md updates from Neil Brown:

 - an assortment of little fixes, several for minor races only likely to
   be hit during testing

 - further cluster-md-raid1 development, not ready for real use yet.

 - new RAID6 syndrome code for ARM NEON

 - fix a race where a write can return before failure of one device is
   properly recorded in metadata, so an immediate crash might result in
   that write being lost.
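   For context, the pattern these fixes use is visible in the raid1/raid10
   hunks further down this page. As a simplified sketch (hypothetical helper
   names, locking and error paths omitted - not the literal kernel code):

       static void on_device_failure(struct mddev *mddev, struct md_rdev *rdev)
       {
               set_bit(Faulty, &rdev->flags);
               set_bit(MD_CHANGE_DEVS, &mddev->flags);     /* superblock needs an update  */
               set_bit(MD_CHANGE_PENDING, &mddev->flags);  /* ...which is not written yet */
       }

       static void on_write_done(struct r1conf *conf, struct r1bio *r1_bio, bool failed)
       {
               if (failed) {
                       /* park the bio; raid1d completes it only after
                        * md_update_sb() has cleared MD_CHANGE_PENDING */
                       list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
                       md_wakeup_thread(conf->mddev->thread);
               } else
                       raid_end_bio_io(r1_bio);    /* no failure: complete immediately */
       }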

* tag 'md/4.3' of git://neil.brown.name/md: (33 commits)
  md/raid5: ensure device failure recorded before write request returns.
  md/raid5: use bio_list for the list of bios to return.
  md/raid10: ensure device failure recorded before write request returns.
  md/raid1: ensure device failure recorded before write request returns.
  md-cluster: remove inappropriate try_module_get from join()
  md: extend spinlock protection in register_md_cluster_operations
  md-cluster: Read the disk bitmap sb and check if it needs recovery
  md-cluster: only call complete(&cinfo->completion) when node join cluster
  md-cluster: add missed lockres_free
  md-cluster: remove the unused sb_lock
  md-cluster: init suspend_list and suspend_lock early in join
  md-cluster: add the error check if failed to get dlm lock
  md-cluster: init completion within lockres_init
  md-cluster: fix deadlock issue on message lock
  md-cluster: transfer the resync ownership to another node
  md-cluster: split recover_slot for future code reuse
  md-cluster: use %pU to print UUIDs
  md: setup safemode_timer before it's being used
  md/raid5: handle possible race as reshape completes.
  md: sync sync_completed has correct value as recovery finishes.
  ...
parents 17447717 e89c6fdf
@@ -91,7 +91,7 @@ The algorithm is:
this message inappropriate or redundant.
3. sender write LVB.
- sender down-convert MESSAGE from EX to CR
+ sender down-convert MESSAGE from EX to CW
sender try to get EX of ACK
[ wait until all receiver has *processed* the MESSAGE ]
@@ -112,7 +112,7 @@ The algorithm is:
sender down-convert ACK from EX to CR
sender release MESSAGE
sender release TOKEN
- receiver upconvert to EX of MESSAGE
+ receiver upconvert to PR of MESSAGE
receiver get CR of ACK
receiver release MESSAGE
......
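(Aside: the two hunks above only show the lines of the md-cluster message-passing
description that changed. Pulling the sender-side steps together, the sequence is
roughly the following; lock()/unlock()/write_lvb() are hypothetical stand-ins for
the DLM calls, and DLM_LOCK_EX/CW/CR are the standard DLM lock modes.)

    lock(token, DLM_LOCK_EX);      /* serialize senders                     */
    lock(message, DLM_LOCK_EX);
    write_lvb(message, msg);       /* 3. sender write LVB                   */
    lock(message, DLM_LOCK_CW);    /*    down-convert MESSAGE from EX to CW */
    lock(ack, DLM_LOCK_EX);        /*    blocks until every receiver has    */
                                   /*    processed the MESSAGE              */
    lock(ack, DLM_LOCK_CR);        /* 4. down-convert ACK from EX to CR     */
    unlock(message);               /*    release MESSAGE                    */
    unlock(token);                 /*    release TOKEN                      */
    /* receivers meanwhile up-convert MESSAGE to PR, re-take CR of ACK,
     * and release MESSAGE, as in the second hunk above */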
This diff is collapsed.
@@ -483,6 +483,8 @@ static void mddev_put(struct mddev *mddev)
bioset_free(bs);
}
static void md_safemode_timeout(unsigned long data);
void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
@@ -490,7 +492,8 @@ void mddev_init(struct mddev *mddev)
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
- init_timer(&mddev->safemode_timer);
+ setup_timer(&mddev->safemode_timer, md_safemode_timeout,
+             (unsigned long) mddev);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
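(Note: the setup_timer() conversion above is mechanical. In the pre-4.15 timer
API, setup_timer(t, fn, data) is shorthand for init_timer(t) followed by
assigning t->function and t->data by hand, which is why the two explicit
assignments disappear from md_run() in a later hunk; the point of the patch is
that the timer is fully initialised before anything can arm it.)

    /* the two forms are equivalent */
    init_timer(&mddev->safemode_timer);
    mddev->safemode_timer.function = md_safemode_timeout;
    mddev->safemode_timer.data = (unsigned long) mddev;

    setup_timer(&mddev->safemode_timer, md_safemode_timeout,
                (unsigned long) mddev);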
@@ -3255,8 +3258,6 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
return 0;
}
static void md_safemode_timeout(unsigned long data);
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
@@ -4189,6 +4190,8 @@ action_show(struct mddev *mddev, char *page)
type = "repair";
} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
type = "recover";
else if (mddev->reshape_position != MaxSector)
type = "reshape";
}
return sprintf(page, "%s\n", type);
}
@@ -5180,8 +5183,6 @@ int md_run(struct mddev *mddev)
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
mddev->safemode_timer.function = md_safemode_timeout;
mddev->safemode_timer.data = (unsigned long) mddev;
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
@@ -5194,6 +5195,11 @@ int md_run(struct mddev *mddev)
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
if (mddev->degraded && !mddev->ro)
/* This ensures that recovering status is reported immediately
* via sysfs - until a lack of spares is confirmed.
*/
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
if (mddev->flags & MD_UPDATE_SB_FLAGS)
@@ -5741,16 +5747,16 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
err = 0;
spin_lock(&mddev->lock);
- /* bitmap disabled, zero the first byte and copy out */
- if (!mddev->bitmap_info.file)
- file->pathname[0] = '\0';
- else if ((ptr = file_path(mddev->bitmap_info.file,
- file->pathname, sizeof(file->pathname))),
- IS_ERR(ptr))
- err = PTR_ERR(ptr);
- else
- memmove(file->pathname, ptr,
- sizeof(file->pathname)-(ptr-file->pathname));
+ /* bitmap enabled */
+ if (mddev->bitmap_info.file) {
+ ptr = file_path(mddev->bitmap_info.file, file->pathname,
+ sizeof(file->pathname));
+ if (IS_ERR(ptr))
+ err = PTR_ERR(ptr);
+ else
+ memmove(file->pathname, ptr,
+ sizeof(file->pathname)-(ptr-file->pathname));
+ }
spin_unlock(&mddev->lock);
if (err == 0 &&
@@ -7069,7 +7075,7 @@ static void status_unused(struct seq_file *seq)
seq_printf(seq, "\n");
}
- static void status_resync(struct seq_file *seq, struct mddev *mddev)
+ static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
unsigned long dt, db;
@@ -7077,18 +7083,32 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
int scale;
unsigned int per_milli;
if (mddev->curr_resync <= 3)
resync = 0;
else
resync = mddev->curr_resync
- atomic_read(&mddev->recovery_active);
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync;
if (resync <= 3) {
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
} else
resync -= atomic_read(&mddev->recovery_active);
if (resync == 0) {
if (mddev->recovery_cp < MaxSector) {
seq_printf(seq, "\tresync=PENDING");
return 1;
}
return 0;
}
if (resync < 3) {
seq_printf(seq, "\tresync=DELAYED");
return 1;
}
WARN_ON(max_sectors == 0);
/* Pick 'scale' such that (resync>>scale)*1000 will fit
* in a sector_t, and (max_sectors>>scale) will fit in a
@@ -7153,6 +7173,7 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
((unsigned long)rt % 60)/6);
seq_printf(seq, " speed=%ldK/sec", db/2/dt);
return 1;
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
@@ -7298,13 +7319,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
mddev->pers->status(seq, mddev);
seq_printf(seq, "\n ");
if (mddev->pers->sync_request) {
- if (mddev->curr_resync > 2) {
- status_resync(seq, mddev);
+ if (status_resync(seq, mddev))
seq_printf(seq, "\n ");
- } else if (mddev->curr_resync >= 1)
- seq_printf(seq, "\tresync=DELAYED\n ");
- else if (mddev->recovery_cp < MaxSector)
- seq_printf(seq, "\tresync=PENDING\n ");
}
} else
seq_printf(seq, "\n ");
@@ -7387,15 +7403,19 @@ int unregister_md_personality(struct md_personality *p)
}
EXPORT_SYMBOL(unregister_md_personality);
- int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+ int register_md_cluster_operations(struct md_cluster_operations *ops,
+ struct module *module)
{
- if (md_cluster_ops != NULL)
- return -EALREADY;
+ int ret = 0;
spin_lock(&pers_lock);
- md_cluster_ops = ops;
- md_cluster_mod = module;
+ if (md_cluster_ops != NULL)
+ ret = -EALREADY;
+ else {
+ md_cluster_ops = ops;
+ md_cluster_mod = module;
+ }
spin_unlock(&pers_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(register_md_cluster_operations);
@@ -7793,7 +7813,8 @@ void md_do_sync(struct md_thread *thread)
> (max_sectors >> 4)) ||
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
(j - mddev->curr_resync_completed)*2
- >= mddev->resync_max - mddev->curr_resync_completed
+ >= mddev->resync_max - mddev->curr_resync_completed ||
+ mddev->curr_resync_completed > mddev->resync_max
)) {
/* time to update curr_resync_completed */
wait_event(mddev->recovery_wait,
@@ -7838,6 +7859,9 @@ void md_do_sync(struct md_thread *thread)
break;
j += sectors;
if (j > max_sectors)
/* when skipping, extra large numbers can be returned. */
j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
if (mddev_is_clustered(mddev))
@@ -7906,12 +7930,15 @@ void md_do_sync(struct md_thread *thread)
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->curr_resync > 2) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
/* tell personality that we are finished */
mddev->pers->sync_request(mddev, max_sectors, &skipped);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_finish(mddev);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -7945,6 +7972,9 @@ void md_do_sync(struct md_thread *thread)
}
}
skip:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_finish(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
spin_lock(&mddev->lock);
@@ -7955,11 +7985,11 @@ void md_do_sync(struct md_thread *thread)
mddev->resync_max = MaxSector;
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
mddev->curr_resync = 0;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
- set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return;
}
@@ -8128,6 +8158,7 @@ void md_check_recovery(struct mddev *mddev)
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
@@ -8574,6 +8605,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
}
return rv;
......
@@ -83,7 +83,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- bool discard_supported = false;
+ unsigned short blksize = 512;
if (!conf)
return -ENOMEM;
@@ -98,6 +98,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
blksize = max(blksize, queue_logical_block_size(
rdev1->bdev->bd_disk->queue));
rdev_for_each(rdev2, mddev) {
pr_debug("md/raid0:%s: comparing %s(%llu)"
" with %s(%llu)\n",
@@ -134,6 +137,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
*/
if ((mddev->chunk_sectors << 9) % blksize) {
printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
mdname(mddev),
mddev->chunk_sectors << 9, blksize);
err = -EINVAL;
goto abort;
}
err = -ENOMEM;
conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
conf->nr_strip_zones, GFP_KERNEL);
@@ -188,16 +203,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
dev[j] = rdev1;
if (mddev->queue)
disk_stack_limits(mddev->gendisk, rdev1->bdev,
rdev1->data_offset << 9);
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
discard_supported = true;
}
if (cnt != mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -258,28 +266,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
(unsigned long long)smallest->sectors);
}
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
*/
if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
mdname(mddev),
mddev->chunk_sectors << 9);
goto abort;
}
if (mddev->queue) {
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
if (!discard_supported)
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
else
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
}
pr_debug("md/raid0:%s: done.\n", mdname(mddev)); pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf; *private_conf = conf;
...@@ -378,12 +364,6 @@ static int raid0_run(struct mddev *mddev) ...@@ -378,12 +364,6 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev)) if (md_check_no_bitmap(mddev))
return -EINVAL; return -EINVAL;
if (mddev->queue) {
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
}
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
@@ -392,6 +372,29 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf;
}
conf = mddev->private;
if (mddev->queue) {
struct md_rdev *rdev;
bool discard_supported = false;
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
discard_supported = true;
}
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
if (!discard_supported)
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
else
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
}
/* calculate array device size */
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
......
@@ -1474,6 +1474,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
"md/raid1:%s: Operation continuing on %d devices.\n",
@@ -2235,6 +2236,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
bool fail = false;
for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2247,6 +2249,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
* narrow down and record precise write
* errors.
*/
fail = true;
if (!narrow_write_error(r1_bio, m)) {
md_error(conf->mddev,
conf->mirrors[m].rdev);
@@ -2258,7 +2261,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
}
if (test_bit(R1BIO_WriteError, &r1_bio->state))
close_write(r1_bio);
- raid_end_bio_io(r1_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ raid_end_bio_io(r1_bio);
}
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
@@ -2364,6 +2373,23 @@ static void raid1d(struct md_thread *thread)
md_check_recovery(mddev);
if (!list_empty_careful(&conf->bio_end_io_list) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
LIST_HEAD(tmp);
spin_lock_irqsave(&conf->device_lock, flags);
if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
list_add(&tmp, &conf->bio_end_io_list);
list_del_init(&conf->bio_end_io_list);
}
spin_unlock_irqrestore(&conf->device_lock, flags);
while (!list_empty(&tmp)) {
r1_bio = list_first_entry(&conf->bio_end_io_list,
struct r1bio, retry_list);
list_del(&r1_bio->retry_list);
raid_end_bio_io(r1_bio);
}
}
blk_start_plug(&plug);
for (;;) {
@@ -2763,6 +2789,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
@@ -3057,6 +3084,7 @@ static int raid1_reshape(struct mddev *mddev)
unfreeze_array(conf);
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
......
@@ -61,6 +61,11 @@ struct r1conf {
* block, or anything else.
*/
struct list_head retry_list;
/* A separate list of r1bio which just need raid_end_bio_io called.
* This mustn't happen for writes which had any errors if the superblock
* needs to be written.
*/
struct list_head bio_end_io_list;
/* queue pending writes to be submitted on unplug */
struct bio_list pending_bio_list;
......
@@ -1589,6 +1589,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
printk(KERN_ALERT
"md/raid10:%s: Disk failure on %s, disabling device.\n"
@@ -2623,6 +2624,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
}
put_buf(r10_bio);
} else {
bool fail = false;
for (m = 0; m < conf->copies; m++) {
int dev = r10_bio->devs[m].devnum;
struct bio *bio = r10_bio->devs[m].bio;
@@ -2634,6 +2636,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
} else if (bio != NULL && bio->bi_error) {
fail = true;
if (!narrow_write_error(r10_bio, m)) {
md_error(conf->mddev, rdev);
set_bit(R10BIO_Degraded,
@@ -2654,7 +2657,13 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
if (test_bit(R10BIO_WriteError,
&r10_bio->state))
close_write(r10_bio);
- raid_end_bio_io(r10_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ raid_end_bio_io(r10_bio);
}
}
@@ -2669,6 +2678,23 @@ static void raid10d(struct md_thread *thread)
md_check_recovery(mddev);
if (!list_empty_careful(&conf->bio_end_io_list) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
LIST_HEAD(tmp);
spin_lock_irqsave(&conf->device_lock, flags);
if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
list_add(&tmp, &conf->bio_end_io_list);
list_del_init(&conf->bio_end_io_list);
}
spin_unlock_irqrestore(&conf->device_lock, flags);
while (!list_empty(&tmp)) {
r10_bio = list_first_entry(&conf->bio_end_io_list,
struct r10bio, retry_list);
list_del(&r10_bio->retry_list);
raid_end_bio_io(r10_bio);
}
}
blk_start_plug(&plug);
for (;;) {
@@ -3443,6 +3469,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
conf->reshape_safe = conf->reshape_progress;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
@@ -4097,7 +4124,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
* at a time, possibly less if that exceeds RESYNC_PAGES,
* or we hit a bad block or something.
* This might mean we pause for normal IO in the middle of
- * a chunk, but that is not a problem was mddev->reshape_position
+ * a chunk, but that is not a problem as mddev->reshape_position
* can record any location.
*
* If we will want to write to a location that isn't
@@ -4121,7 +4148,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
*
* In all this the minimum difference in data offsets
* (conf->offset_diff - always positive) allows a bit of slack,
- * so next can be after 'safe', but not by more than offset_disk
+ * so next can be after 'safe', but not by more than offset_diff
*
* We need to prepare all the bios here before we start any IO
* to ensure the size we choose is acceptable to all devices.
......
@@ -53,6 +53,12 @@ struct r10conf {
sector_t offset_diff;
struct list_head retry_list;
/* A separate list of r1bio which just need raid_end_bio_io called.
* This mustn't happen for writes which had any errors if the superblock
* needs to be written.
*/
struct list_head bio_end_io_list;
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
int pending_count;
......
This diff is collapsed.
@@ -265,7 +265,7 @@ struct stripe_head_state {
int dec_preread_active;
unsigned long ops_request;
- struct bio *return_bi;
+ struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
};
@@ -476,6 +476,9 @@ struct r5conf {
int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */
/* bios to have bi_end_io called after metadata is synced */
struct bio_list return_bi;
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have
* two caches.
......
@@ -40,9 +40,20 @@
(unsigned long)bytes, ptrs); \
kernel_neon_end(); \
} \
static void raid6_neon ## _n ## _xor_syndrome(int disks, \
int start, int stop, \
size_t bytes, void **ptrs) \
{ \
void raid6_neon ## _n ## _xor_syndrome_real(int, \
int, int, unsigned long, void**); \
kernel_neon_begin(); \
raid6_neon ## _n ## _xor_syndrome_real(disks, \
start, stop, (unsigned long)bytes, ptrs); \
kernel_neon_end(); \
} \
struct raid6_calls const raid6_neonx ## _n = { \
raid6_neon ## _n ## _gen_syndrome, \
- NULL, /* XOR not yet implemented */ \
+ raid6_neon ## _n ## _xor_syndrome, \
raid6_have_neon, \
"neonx" #_n, \
0 \
......
@@ -3,6 +3,7 @@
* neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
*
* Copyright (C) 2012 Rob Herring
* Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* Based on altivec.uc:
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
@@ -78,3 +79,48 @@ void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
vst1q_u8(&q[d+NSIZE*$$], wq$$);
}
}
void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
unsigned long bytes, void **ptrs)
{
uint8_t **dptr = (uint8_t **)ptrs;
uint8_t *p, *q;
int d, z, z0;
register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
const unative_t x1d = NBYTES(0x1d);
z0 = stop; /* P/Q right side optimization */
p = dptr[disks-2]; /* XOR parity */
q = dptr[disks-1]; /* RS syndrome */
for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);
/* P/Q data pages */
for ( z = z0-1 ; z >= start ; z-- ) {
wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
wp$$ = veorq_u8(wp$$, wd$$);
w2$$ = MASK(wq$$);
w1$$ = SHLBYTE(wq$$);
w2$$ = vandq_u8(w2$$, x1d);
w1$$ = veorq_u8(w1$$, w2$$);
wq$$ = veorq_u8(w1$$, wd$$);
}
/* P/Q left side optimization */
for ( z = start-1 ; z >= 0 ; z-- ) {
w2$$ = MASK(wq$$);
w1$$ = SHLBYTE(wq$$);
w2$$ = vandq_u8(w2$$, x1d);
wq$$ = veorq_u8(w1$$, w2$$);
}
w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
wq$$ = veorq_u8(wq$$, w1$$);
vst1q_u8(&p[d+NSIZE*$$], wp$$);
vst1q_u8(&q[d+NSIZE*$$], wq$$);
}
}
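(For reference: the xor_syndrome routines above implement the standard RAID-6
partial update used on the read-modify-write path. With data blocks d_z,
generator g = {02} and all arithmetic in GF(2^8), folding in the range
start..stop gives

    P' = P xor d_start xor ... xor d_stop
    Q' = Q xor g^start*d_start xor ... xor g^stop*d_stop

The first inner loop accumulates both terms for z = stop..start, with
SHLBYTE/MASK/veorq_u8 implementing the multiply-by-g step; the second loop
applies the remaining g^start factor to the Q term, which is then XORed into
q[], while the P term has already folded in the old p[] value and is stored
back directly.)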