Commit 2a013e37 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'md/4.3' of git://neil.brown.name/md

Pull md updates from Neil Brown:

 - an assortment of little fixes, several for minor races only likely to
   be hit during testing

 - further cluster-md-raid1 development, not ready for real use yet.

 - new RAID6 syndrome code for ARM NEON

 - fix a race where a write can return before failure of one device is
   properly recorded in metadata, so an immediate crash might result in
   that write being lost.

* tag 'md/4.3' of git://neil.brown.name/md: (33 commits)
  md/raid5: ensure device failure recorded before write request returns.
  md/raid5: use bio_list for the list of bios to return.
  md/raid10: ensure device failure recorded before write request returns.
  md/raid1: ensure device failure recorded before write request returns.
  md-cluster: remove inappropriate try_module_get from join()
  md: extend spinlock protection in register_md_cluster_operations
  md-cluster: Read the disk bitmap sb and check if it needs recovery
  md-cluster: only call complete(&cinfo->completion) when node join cluster
  md-cluster: add missed lockres_free
  md-cluster: remove the unused sb_lock
  md-cluster: init suspend_list and suspend_lock early in join
  md-cluster: add the error check if failed to get dlm lock
  md-cluster: init completion within lockres_init
  md-cluster: fix deadlock issue on message lock
  md-cluster: transfer the resync ownership to another node
  md-cluster: split recover_slot for future code reuse
  md-cluster: use %pU to print UUIDs
  md: setup safemode_timer before it's being used
  md/raid5: handle possible race as reshape completes.
  md: sync sync_completed has correct value as recovery finishes.
  ...
parents 17447717 e89c6fdf
......@@ -91,7 +91,7 @@ The algorithm is:
this message inappropriate or redundant.
3. sender write LVB.
sender down-convert MESSAGE from EX to CR
sender down-convert MESSAGE from EX to CW
sender try to get EX of ACK
[ wait until all receiver has *processed* the MESSAGE ]
......@@ -112,7 +112,7 @@ The algorithm is:
sender down-convert ACK from EX to CR
sender release MESSAGE
sender release TOKEN
receiver upconvert to EX of MESSAGE
receiver upconvert to PR of MESSAGE
receiver get CR of ACK
receiver release MESSAGE
......
......@@ -45,6 +45,7 @@ struct resync_info {
/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
#define MD_CLUSTER_SUSPEND_READ_BALANCING 2
#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3
struct md_cluster_info {
......@@ -52,7 +53,6 @@ struct md_cluster_info {
dlm_lockspace_t *lockspace;
int slot_number;
struct completion completion;
struct dlm_lock_resource *sb_lock;
struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres;
struct list_head suspend_list;
......@@ -75,6 +75,7 @@ enum msg_type {
NEWDISK,
REMOVE,
RE_ADD,
BITMAP_NEEDS_SYNC,
};
struct cluster_msg {
......@@ -99,7 +100,6 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
int ret = 0;
init_completion(&res->completion);
ret = dlm_lock(res->ls, mode, &res->lksb,
res->flags, res->name, strlen(res->name),
0, sync_ast, res, res->bast);
......@@ -124,6 +124,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
if (!res)
return NULL;
init_completion(&res->completion);
res->ls = cinfo->lockspace;
res->mddev = mddev;
namelen = strlen(name);
......@@ -165,11 +166,24 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
static void lockres_free(struct dlm_lock_resource *res)
{
int ret;
if (!res)
return;
init_completion(&res->completion);
dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
/* cancel a lock request or a conversion request that is blocked */
res->flags |= DLM_LKF_CANCEL;
retry:
ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
if (unlikely(ret != 0)) {
pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
/* if a lock conversion is cancelled, then the lock is put
* back to grant queue, need to ensure it is unlocked */
if (ret == -DLM_ECANCEL)
goto retry;
}
res->flags &= ~DLM_LKF_CANCEL;
wait_for_completion(&res->completion);
kfree(res->name);
......@@ -177,18 +191,6 @@ static void lockres_free(struct dlm_lock_resource *res)
kfree(res);
}
static char *pretty_uuid(char *dest, char *src)
{
int i, len = 0;
for (i = 0; i < 16; i++) {
if (i == 4 || i == 6 || i == 8 || i == 10)
len += sprintf(dest + len, "-");
len += sprintf(dest + len, "%02x", (__u8)src[i]);
}
return dest;
}
static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
sector_t lo, sector_t hi)
{
......@@ -281,16 +283,11 @@ static void recover_prep(void *arg)
set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
static void recover_slot(void *arg, struct dlm_slot *slot)
static void __recover_slot(struct mddev *mddev, int slot)
{
struct mddev *mddev = arg;
struct md_cluster_info *cinfo = mddev->cluster_info;
pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
mddev->bitmap_info.cluster_name,
slot->nodeid, slot->slot,
cinfo->slot_number);
set_bit(slot->slot - 1, &cinfo->recovery_map);
set_bit(slot, &cinfo->recovery_map);
if (!cinfo->recovery_thread) {
cinfo->recovery_thread = md_register_thread(recover_bitmaps,
mddev, "recover");
......@@ -302,6 +299,20 @@ static void recover_slot(void *arg, struct dlm_slot *slot)
md_wakeup_thread(cinfo->recovery_thread);
}
static void recover_slot(void *arg, struct dlm_slot *slot)
{
struct mddev *mddev = arg;
struct md_cluster_info *cinfo = mddev->cluster_info;
pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
mddev->bitmap_info.cluster_name,
slot->nodeid, slot->slot,
cinfo->slot_number);
/* deduct one since dlm slot starts from one while the num of
* cluster-md begins with 0 */
__recover_slot(mddev, slot->slot - 1);
}
static void recover_done(void *arg, struct dlm_slot *slots,
int num_slots, int our_slot,
uint32_t generation)
......@@ -310,10 +321,17 @@ static void recover_done(void *arg, struct dlm_slot *slots,
struct md_cluster_info *cinfo = mddev->cluster_info;
cinfo->slot_number = our_slot;
/* completion is only need to be complete when node join cluster,
* it doesn't need to run during another node's failure */
if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
complete(&cinfo->completion);
clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
}
clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
/* the ops is called when node join the cluster, and do lock recovery
* if node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
.recover_prep = recover_prep,
.recover_slot = recover_slot,
......@@ -388,7 +406,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
int len;
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
pretty_uuid(disk_uuid + len, cmsg->uuid);
sprintf(disk_uuid + len, "%pU", cmsg->uuid);
snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion);
......@@ -457,6 +475,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
__func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg);
break;
case BITMAP_NEEDS_SYNC:
pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
__func__, __LINE__, msg->slot);
__recover_slot(mddev, msg->slot);
break;
default:
pr_warn("%s:%d Received unknown message from %d\n",
__func__, __LINE__, msg->slot);
......@@ -472,6 +495,7 @@ static void recv_daemon(struct md_thread *thread)
struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
struct cluster_msg msg;
int ret;
/*get CR on Message*/
if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
......@@ -484,13 +508,21 @@ static void recv_daemon(struct md_thread *thread)
process_recvd_msg(thread->mddev, &msg);
/*release CR on ack_lockres*/
dlm_unlock_sync(ack_lockres);
/*up-convert to EX on message_lockres*/
dlm_lock_sync(message_lockres, DLM_LOCK_EX);
ret = dlm_unlock_sync(ack_lockres);
if (unlikely(ret != 0))
pr_info("unlock ack failed return %d\n", ret);
/*up-convert to PR on message_lockres*/
ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
if (unlikely(ret != 0))
pr_info("lock PR on msg failed return %d\n", ret);
/*get CR on ack_lockres again*/
dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
if (unlikely(ret != 0))
pr_info("lock CR on ack failed return %d\n", ret);
/*release CR on message_lockres*/
dlm_unlock_sync(message_lockres);
ret = dlm_unlock_sync(message_lockres);
if (unlikely(ret != 0))
pr_info("unlock msg failed return %d\n", ret);
}
/* lock_comm()
......@@ -519,7 +551,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
* The function:
* 1. Grabs the message lockresource in EX mode
* 2. Copies the message to the message LVB
* 3. Downconverts message lockresource to CR
* 3. Downconverts message lockresource to CW
* 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
* and the other nodes read the message. The thread will wait here until all other
* nodes have released ack lock resource.
......@@ -540,12 +572,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
sizeof(struct cluster_msg));
/*down-convert EX to CR on Message*/
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
/*down-convert EX to CW on Message*/
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
if (error) {
pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
error);
goto failed_message;
goto failed_ack;
}
/*up-convert CR to EX on Ack*/
......@@ -565,7 +597,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
}
failed_ack:
dlm_unlock_sync(cinfo->message_lockres);
error = dlm_unlock_sync(cinfo->message_lockres);
if (unlikely(error != 0)) {
pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
error);
/* in case the message can't be released due to some reason */
goto failed_ack;
}
failed_message:
return error;
}
......@@ -587,6 +625,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
struct dlm_lock_resource *bm_lockres;
struct suspend_info *s;
char str[64];
sector_t lo, hi;
for (i = 0; i < total_slots; i++) {
......@@ -617,9 +656,24 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
lockres_free(bm_lockres);
continue;
}
if (ret)
if (ret) {
lockres_free(bm_lockres);
goto out;
/* TODO: Read the disk bitmap sb and check if it needs recovery */
}
/* Read the disk bitmap sb and check if it needs recovery */
ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
if (ret) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
lockres_free(bm_lockres);
continue;
}
if ((hi > 0) && (lo < mddev->recovery_cp)) {
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
mddev->recovery_cp = lo;
md_check_recovery(mddev);
}
dlm_unlock_sync(bm_lockres);
lockres_free(bm_lockres);
}
......@@ -633,20 +687,20 @@ static int join(struct mddev *mddev, int nodes)
int ret, ops_rv;
char str[64];
if (!try_module_get(THIS_MODULE))
return -ENOENT;
cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
if (!cinfo)
return -ENOMEM;
INIT_LIST_HEAD(&cinfo->suspend_list);
spin_lock_init(&cinfo->suspend_lock);
init_completion(&cinfo->completion);
set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo;
memset(str, 0, 64);
pretty_uuid(str, mddev->uuid);
sprintf(str, "%pU", mddev->uuid);
ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
DLM_LSFL_FS, LVB_SIZE,
&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
......@@ -659,12 +713,6 @@ static int join(struct mddev *mddev, int nodes)
ret = -ERANGE;
goto err;
}
cinfo->sb_lock = lockres_init(mddev, "cmd-super",
NULL, 0);
if (!cinfo->sb_lock) {
ret = -ENOMEM;
goto err;
}
/* Initiate the communication resources */
ret = -ENOMEM;
cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
......@@ -705,9 +753,6 @@ static int join(struct mddev *mddev, int nodes)
goto err;
}
INIT_LIST_HEAD(&cinfo->suspend_list);
spin_lock_init(&cinfo->suspend_lock);
ret = gather_all_resync_info(mddev, nodes);
if (ret)
goto err;
......@@ -719,12 +764,10 @@ static int join(struct mddev *mddev, int nodes)
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
lockres_free(cinfo->bitmap_lockres);
lockres_free(cinfo->sb_lock);
if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2);
mddev->cluster_info = NULL;
kfree(cinfo);
module_put(THIS_MODULE);
return ret;
}
......@@ -740,7 +783,6 @@ static int leave(struct mddev *mddev)
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
lockres_free(cinfo->sb_lock);
lockres_free(cinfo->bitmap_lockres);
dlm_release_lockspace(cinfo->lockspace, 2);
return 0;
......@@ -817,8 +859,17 @@ static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
static void resync_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
int slot = cinfo->slot_number - 1;
pr_info("%s:%d\n", __func__, __LINE__);
resync_send(mddev, RESYNCING, 0, 0);
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
cmsg.slot = cpu_to_le32(slot);
sendmsg(cinfo, &cmsg);
}
}
static int area_resyncing(struct mddev *mddev, int direction,
......
......@@ -483,6 +483,8 @@ static void mddev_put(struct mddev *mddev)
bioset_free(bs);
}
static void md_safemode_timeout(unsigned long data);
void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
......@@ -490,7 +492,8 @@ void mddev_init(struct mddev *mddev)
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
init_timer(&mddev->safemode_timer);
setup_timer(&mddev->safemode_timer, md_safemode_timeout,
(unsigned long) mddev);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
......@@ -3255,8 +3258,6 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
return 0;
}
static void md_safemode_timeout(unsigned long data);
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
......@@ -4189,6 +4190,8 @@ action_show(struct mddev *mddev, char *page)
type = "repair";
} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
type = "recover";
else if (mddev->reshape_position != MaxSector)
type = "reshape";
}
return sprintf(page, "%s\n", type);
}
......@@ -5180,8 +5183,6 @@ int md_run(struct mddev *mddev)
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
mddev->safemode_timer.function = md_safemode_timeout;
mddev->safemode_timer.data = (unsigned long) mddev;
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
......@@ -5194,6 +5195,11 @@ int md_run(struct mddev *mddev)
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
if (mddev->degraded && !mddev->ro)
/* This ensures that recovering status is reported immediately
* via sysfs - until a lack of spares is confirmed.
*/
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
if (mddev->flags & MD_UPDATE_SB_FLAGS)
......@@ -5741,16 +5747,16 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
err = 0;
spin_lock(&mddev->lock);
/* bitmap disabled, zero the first byte and copy out */
if (!mddev->bitmap_info.file)
file->pathname[0] = '\0';
else if ((ptr = file_path(mddev->bitmap_info.file,
file->pathname, sizeof(file->pathname))),
IS_ERR(ptr))
/* bitmap enabled */
if (mddev->bitmap_info.file) {
ptr = file_path(mddev->bitmap_info.file, file->pathname,
sizeof(file->pathname));
if (IS_ERR(ptr))
err = PTR_ERR(ptr);
else
memmove(file->pathname, ptr,
sizeof(file->pathname)-(ptr-file->pathname));
}
spin_unlock(&mddev->lock);
if (err == 0 &&
......@@ -7069,7 +7075,7 @@ static void status_unused(struct seq_file *seq)
seq_printf(seq, "\n");
}
static void status_resync(struct seq_file *seq, struct mddev *mddev)
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
unsigned long dt, db;
......@@ -7077,18 +7083,32 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
int scale;
unsigned int per_milli;
if (mddev->curr_resync <= 3)
resync = 0;
else
resync = mddev->curr_resync
- atomic_read(&mddev->recovery_active);
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync;
if (resync <= 3) {
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
} else
resync -= atomic_read(&mddev->recovery_active);
if (resync == 0) {
if (mddev->recovery_cp < MaxSector) {
seq_printf(seq, "\tresync=PENDING");
return 1;
}
return 0;
}
if (resync < 3) {
seq_printf(seq, "\tresync=DELAYED");
return 1;
}
WARN_ON(max_sectors == 0);
/* Pick 'scale' such that (resync>>scale)*1000 will fit
* in a sector_t, and (max_sectors>>scale) will fit in a
......@@ -7153,6 +7173,7 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
((unsigned long)rt % 60)/6);
seq_printf(seq, " speed=%ldK/sec", db/2/dt);
return 1;
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
......@@ -7298,13 +7319,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
mddev->pers->status(seq, mddev);
seq_printf(seq, "\n ");
if (mddev->pers->sync_request) {
if (mddev->curr_resync > 2) {
status_resync(seq, mddev);
if (status_resync(seq, mddev))
seq_printf(seq, "\n ");
} else if (mddev->curr_resync >= 1)
seq_printf(seq, "\tresync=DELAYED\n ");
else if (mddev->recovery_cp < MaxSector)
seq_printf(seq, "\tresync=PENDING\n ");
}
} else
seq_printf(seq, "\n ");
......@@ -7387,15 +7403,19 @@ int unregister_md_personality(struct md_personality *p)
}
EXPORT_SYMBOL(unregister_md_personality);
int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
int register_md_cluster_operations(struct md_cluster_operations *ops,
struct module *module)
{
if (md_cluster_ops != NULL)
return -EALREADY;
int ret = 0;
spin_lock(&pers_lock);
if (md_cluster_ops != NULL)
ret = -EALREADY;
else {
md_cluster_ops = ops;
md_cluster_mod = module;
}
spin_unlock(&pers_lock);
return 0;
return ret;
}
EXPORT_SYMBOL(register_md_cluster_operations);
......@@ -7793,7 +7813,8 @@ void md_do_sync(struct md_thread *thread)
> (max_sectors >> 4)) ||
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
(j - mddev->curr_resync_completed)*2
>= mddev->resync_max - mddev->curr_resync_completed
>= mddev->resync_max - mddev->curr_resync_completed ||
mddev->curr_resync_completed > mddev->resync_max
)) {
/* time to update curr_resync_completed */
wait_event(mddev->recovery_wait,
......@@ -7838,6 +7859,9 @@ void md_do_sync(struct md_thread *thread)
break;
j += sectors;
if (j > max_sectors)
/* when skipping, extra large numbers can be returned. */
j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
if (mddev_is_clustered(mddev))
......@@ -7906,12 +7930,15 @@ void md_do_sync(struct md_thread *thread)
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->curr_resync > 2) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
/* tell personality that we are finished */
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_finish(mddev);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
......@@ -7945,6 +7972,9 @@ void md_do_sync(struct md_thread *thread)
}
}
skip:
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_finish(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
spin_lock(&mddev->lock);
......@@ -7955,11 +7985,11 @@ void md_do_sync(struct md_thread *thread)
mddev->resync_max = MaxSector;
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
mddev->curr_resync = 0;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return;
}
......@@ -8128,6 +8158,7 @@ void md_check_recovery(struct mddev *mddev)
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
......@@ -8574,6 +8605,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
}
return rv;
......
......@@ -83,7 +83,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
bool discard_supported = false;
unsigned short blksize = 512;
if (!conf)
return -ENOMEM;
......@@ -98,6 +98,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
blksize = max(blksize, queue_logical_block_size(
rdev1->bdev->bd_disk->queue));
rdev_for_each(rdev2, mddev) {
pr_debug("md/raid0:%s: comparing %s(%llu)"
" with %s(%llu)\n",
......@@ -134,6 +137,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
*/
if ((mddev->chunk_sectors << 9) % blksize) {
printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
mdname(mddev),
mddev->chunk_sectors << 9, blksize);
err = -EINVAL;
goto abort;
}
err = -ENOMEM;
conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
conf->nr_strip_zones, GFP_KERNEL);
......@@ -188,16 +203,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
dev[j] = rdev1;
if (mddev->queue)
disk_stack_limits(mddev->gendisk, rdev1->bdev,
rdev1->data_offset << 9);
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
discard_supported = true;
}
if (cnt != mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
......@@ -258,28 +266,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
(unsigned long long)smallest->sectors);
}
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
*/
if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
mdname(mddev),
mddev->chunk_sectors << 9);
goto abort;
}
if (mddev->queue) {
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
if (!discard_supported)
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
else
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
}
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
......@@ -378,12 +364,6 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
if (mddev->queue) {
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
}
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
......@@ -392,6 +372,29 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf;
}
conf = mddev->private;
if (mddev->queue) {
struct md_rdev *rdev;
bool discard_supported = false;
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
discard_supported = true;
}
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
if (!discard_supported)
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
else
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
}
/* calculate array device size */
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
......
......@@ -1474,6 +1474,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
"md/raid1:%s: Operation continuing on %d devices.\n",
......@@ -2235,6 +2236,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
bool fail = false;
for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
......@@ -2247,6 +2249,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
* narrow down and record precise write
* errors.
*/
fail = true;
if (!narrow_write_error(r1_bio, m)) {
md_error(conf->mddev,
conf->mirrors[m].rdev);
......@@ -2258,6 +2261,12 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
}
if (test_bit(R1BIO_WriteError, &r1_bio->state))
close_write(r1_bio);
if (fail) {
spin_lock_irq(&conf->device_lock);
list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
spin_unlock_irq(&conf->device_lock);
md_wakeup_thread(conf->mddev->thread);
} else
raid_end_bio_io(r1_bio);
}
......@@ -2364,6 +2373,23 @@ static void raid1d(struct md_thread *thread)
md_check_recovery(mddev);
if (!list_empty_careful(&conf->bio_end_io_list) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
LIST_HEAD(tmp);
spin_lock_irqsave(&conf->device_lock, flags);
if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
list_add(&tmp, &conf->bio_end_io_list);
list_del_init(&conf->bio_end_io_list);
}
spin_unlock_irqrestore(&conf->device_lock, flags);
while (!list_empty(&tmp)) {
r1_bio = list_first_entry(&conf->bio_end_io_list,
struct r1bio, retry_list);
list_del(&r1_bio->retry_list);
raid_end_bio_io(r1_bio);
}
}
blk_start_plug(&plug);
for (;;) {
......@@ -2763,6 +2789,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
......@@ -3057,6 +3084,7 @@ static int raid1_reshape(struct mddev *mddev)
unfreeze_array(conf);
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
......
......@@ -61,6 +61,11 @@ struct r1conf {
* block, or anything else.
*/
struct list_head retry_list;
/* A separate list of r1bio which just need raid_end_bio_io called.
* This mustn't happen for writes which had any errors if the superblock
* needs to be written.
*/
struct list_head bio_end_io_list;
/* queue pending writes to be submitted on unplug */
struct bio_list pending_bio_list;
......
......@@ -1589,6 +1589,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
printk(KERN_ALERT
"md/raid10:%s: Disk failure on %s, disabling device.\n"
......@@ -2623,6 +2624,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
}
put_buf(r10_bio);
} else {
bool fail = false;
for (m = 0; m < conf->copies; m++) {
int dev = r10_bio->devs[m].devnum;
struct bio *bio = r10_bio->devs[m].bio;
......@@ -2634,6 +2636,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
} else if (bio != NULL && bio->bi_error) {
fail = true;
if (!narrow_write_error(r10_bio, m)) {
md_error(conf->mddev, rdev);
set_bit(R10BIO_Degraded,
......@@ -2654,6 +2657,12 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
if (test_bit(R10BIO_WriteError,
&r10_bio->state))
close_write(r10_bio);
if (fail) {
spin_lock_irq(&conf->device_lock);
list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
spin_unlock_irq(&conf->device_lock);
md_wakeup_thread(conf->mddev->thread);
} else
raid_end_bio_io(r10_bio);
}
}
......@@ -2669,6 +2678,23 @@ static void raid10d(struct md_thread *thread)
md_check_recovery(mddev);
if (!list_empty_careful(&conf->bio_end_io_list) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
LIST_HEAD(tmp);
spin_lock_irqsave(&conf->device_lock, flags);
if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
list_add(&tmp, &conf->bio_end_io_list);
list_del_init(&conf->bio_end_io_list);
}
spin_unlock_irqrestore(&conf->device_lock, flags);
while (!list_empty(&tmp)) {
r10_bio = list_first_entry(&conf->bio_end_io_list,
struct r10bio, retry_list);
list_del(&r10_bio->retry_list);
raid_end_bio_io(r10_bio);
}
}
blk_start_plug(&plug);
for (;;) {
......@@ -3443,6 +3469,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
conf->reshape_safe = conf->reshape_progress;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
......@@ -4097,7 +4124,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
* at a time, possibly less if that exceeds RESYNC_PAGES,
* or we hit a bad block or something.
* This might mean we pause for normal IO in the middle of
* a chunk, but that is not a problem was mddev->reshape_position
* a chunk, but that is not a problem as mddev->reshape_position
* can record any location.
*
* If we will want to write to a location that isn't
......@@ -4121,7 +4148,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
*
* In all this the minimum difference in data offsets
* (conf->offset_diff - always positive) allows a bit of slack,
* so next can be after 'safe', but not by more than offset_disk
* so next can be after 'safe', but not by more than offset_diff
*
* We need to prepare all the bios here before we start any IO
* to ensure the size we choose is acceptable to all devices.
......
......@@ -53,6 +53,12 @@ struct r10conf {
sector_t offset_diff;
struct list_head retry_list;
/* A separate list of r1bio which just need raid_end_bio_io called.
* This mustn't happen for writes which had any errors if the superblock
* needs to be written.
*/
struct list_head bio_end_io_list;
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
int pending_count;
......
......@@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
static void return_io(struct bio *return_bi)
static void return_io(struct bio_list *return_bi)
{
struct bio *bi = return_bi;
while (bi) {
return_bi = bi->bi_next;
bi->bi_next = NULL;
struct bio *bi;
while ((bi = bio_list_pop(return_bi)) != NULL) {
bi->bi_iter.bi_size = 0;
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
bio_endio(bi);
bi = return_bi;
}
}
......@@ -1177,7 +1173,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
struct bio *return_bi = NULL;
struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
......@@ -1201,17 +1197,15 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
if (!raid5_dec_bi_active_stripes(rbi)) {
rbi->bi_next = return_bi;
return_bi = rbi;
}
if (!raid5_dec_bi_active_stripes(rbi))
bio_list_add(&return_bi, rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
return_io(return_bi);
return_io(&return_bi);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
......@@ -2517,6 +2511,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid:%s: Disk failure on %s, disabling device.\n"
"md/raid:%s: Operation continuing on %d devices.\n",
......@@ -3069,7 +3064,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks,
struct bio **return_bi)
struct bio_list *return_bi)
{
int i;
BUG_ON(sh->batch_head);
......@@ -3114,8 +3109,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
bi->bi_next = *return_bi;
*return_bi = bi;
bio_list_add(return_bi, bi);
}
bi = nextbi;
}
......@@ -3139,8 +3133,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
bi->bi_next = *return_bi;
*return_bi = bi;
bio_list_add(return_bi, bi);
}
bi = bi2;
}
......@@ -3163,10 +3156,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
r5_next_bio(bi, sh->dev[i].sector);
bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
bi->bi_next = *return_bi;
*return_bi = bi;
}
if (!raid5_dec_bi_active_stripes(bi))
bio_list_add(return_bi, bi);
bi = nextbi;
}
}
......@@ -3445,7 +3436,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
struct stripe_head *sh, int disks, struct bio **return_bi)
struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
int i;
struct r5dev *dev;
......@@ -3479,8 +3470,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
wbi2 = r5_next_bio(wbi, dev->sector);
if (!raid5_dec_bi_active_stripes(wbi)) {
md_write_end(conf->mddev);
wbi->bi_next = *return_bi;
*return_bi = wbi;
bio_list_add(return_bi, wbi);
}
wbi = wbi2;
}
......@@ -4613,7 +4603,15 @@ static void handle_stripe(struct stripe_head *sh)
md_wakeup_thread(conf->mddev->thread);
}
return_io(s.return_bi);
if (!bio_list_empty(&s.return_bi)) {
if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->return_bi, &s.return_bi);
spin_unlock_irq(&conf->device_lock);
md_wakeup_thread(conf->mddev->thread);
} else
return_io(&s.return_bi);
}
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
......@@ -4672,12 +4670,12 @@ static int raid5_congested(struct mddev *mddev, int bits)
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
struct r5conf *conf = mddev->private;
sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
unsigned int chunk_sectors = mddev->chunk_sectors;
unsigned int chunk_sectors;
unsigned int bio_sectors = bio_sectors(bio);
if (mddev->new_chunk_sectors < mddev->chunk_sectors)
chunk_sectors = mddev->new_chunk_sectors;
chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
return chunk_sectors >=
((sector & (chunk_sectors - 1)) + bio_sectors);
}
......@@ -5325,6 +5323,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sector_t stripe_addr;
int reshape_sectors;
struct list_head stripes;
sector_t retn;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
......@@ -5332,6 +5331,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
} else if (mddev->reshape_backwards &&
conf->reshape_progress == MaxSector) {
/* shouldn't happen, but just in case, finish up.*/
sector_nr = MaxSector;
} else if (!mddev->reshape_backwards &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
......@@ -5340,7 +5343,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
mddev->curr_resync_completed = sector_nr;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
*skipped = 1;
return sector_nr;
retn = sector_nr;
goto finish;
}
}
......@@ -5348,10 +5352,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* If old and new chunk sizes differ, we need to process the
* largest of these
*/
if (mddev->new_chunk_sectors > mddev->chunk_sectors)
reshape_sectors = mddev->new_chunk_sectors;
else
reshape_sectors = mddev->chunk_sectors;
reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
/* We update the metadata at least every 10 seconds, or when
* the data about to be copied would over-write the source of
......@@ -5366,11 +5368,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->reshape_backwards) {
writepos -= min_t(sector_t, reshape_sectors, writepos);
BUG_ON(writepos < reshape_sectors);
writepos -= reshape_sectors;
readpos += reshape_sectors;
safepos += reshape_sectors;
} else {
writepos += reshape_sectors;
/* readpos and safepos are worst-case calculations.
* A negative number is overly pessimistic, and causes
* obvious problems for unsigned storage. So clip to 0.
*/
readpos -= min_t(sector_t, reshape_sectors, readpos);
safepos -= min_t(sector_t, reshape_sectors, safepos);
}
......@@ -5513,7 +5520,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* then we need to write out the superblock.
*/
sector_nr += reshape_sectors;
if ((sector_nr - mddev->curr_resync_completed) * 2
retn = reshape_sectors;
finish:
if (mddev->curr_resync_completed > mddev->resync_max ||
(sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
......@@ -5538,7 +5548,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
ret:
return reshape_sectors;
return retn;
}
static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
......@@ -5794,6 +5804,18 @@ static void raid5d(struct md_thread *thread)
md_check_recovery(mddev);
if (!bio_list_empty(&conf->return_bi) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
struct bio_list tmp = BIO_EMPTY_LIST;
spin_lock_irq(&conf->device_lock);
if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
bio_list_merge(&tmp, &conf->return_bi);
bio_list_init(&conf->return_bi);
}
spin_unlock_irq(&conf->device_lock);
return_io(&tmp);
}
blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
......@@ -6234,8 +6256,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
/* size is defined by the smallest of previous and new size */
raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
sectors &= ~((sector_t)mddev->chunk_sectors - 1);
sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
sectors &= ~((sector_t)conf->chunk_sectors - 1);
sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
return sectors * (raid_disks - conf->max_degraded);
}
......@@ -6453,6 +6475,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
......@@ -6542,6 +6565,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (conf->reshape_progress != MaxSector) {
conf->prev_chunk_sectors = mddev->chunk_sectors;
conf->prev_algo = mddev->layout;
} else {
conf->prev_chunk_sectors = conf->chunk_sectors;
conf->prev_algo = conf->algorithm;
}
conf->min_nr_stripes = NR_STRIPES;
......@@ -6661,6 +6687,8 @@ static int run(struct mddev *mddev)
sector_t here_new, here_old;
int old_disks;
int max_degraded = (mddev->level == 6 ? 2 : 1);
int chunk_sectors;
int new_data_disks;
if (mddev->new_level != mddev->level) {
printk(KERN_ERR "md/raid:%s: unsupported reshape "
......@@ -6672,28 +6700,25 @@ static int run(struct mddev *mddev)
/* reshape_position must be on a new-stripe boundary, and one
* further up in new geometry must map after here in old
* geometry.
* If the chunk sizes are different, then as we perform reshape
* in units of the largest of the two, reshape_position needs
* be a multiple of the largest chunk size times new data disks.
*/
here_new = mddev->reshape_position;
if (sector_div(here_new, mddev->new_chunk_sectors *
(mddev->raid_disks - max_degraded))) {
chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
new_data_disks = mddev->raid_disks - max_degraded;
if (sector_div(here_new, chunk_sectors * new_data_disks)) {
printk(KERN_ERR "md/raid:%s: reshape_position not "
"on a stripe boundary\n", mdname(mddev));
return -EINVAL;
}
reshape_offset = here_new * mddev->new_chunk_sectors;
reshape_offset = here_new * chunk_sectors;
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
sector_div(here_old, mddev->chunk_sectors *
(old_disks-max_degraded));
sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
if (mddev->delta_disks == 0) {
if ((here_new * mddev->new_chunk_sectors !=
here_old * mddev->chunk_sectors)) {
printk(KERN_ERR "md/raid:%s: reshape position is"
" confused - aborting\n", mdname(mddev));
return -EINVAL;
}
/* We cannot be sure it is safe to start an in-place
* reshape. It is only safe if user-space is monitoring
* and taking constant backups.
......@@ -6712,10 +6737,10 @@ static int run(struct mddev *mddev)
return -EINVAL;
}
} else if (mddev->reshape_backwards
? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
here_old * mddev->chunk_sectors)
: (here_new * mddev->new_chunk_sectors >=
here_old * mddev->chunk_sectors + (-min_offset_diff))) {
? (here_new * chunk_sectors + min_offset_diff <=
here_old * chunk_sectors)
: (here_new * chunk_sectors >=
here_old * chunk_sectors + (-min_offset_diff))) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "md/raid:%s: reshape_position too early for "
"auto-recovery - aborting.\n",
......@@ -6967,7 +6992,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
int i;
seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
mddev->chunk_sectors / 2, mddev->layout);
conf->chunk_sectors / 2, mddev->layout);
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
......@@ -7173,7 +7198,9 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize;
sectors &= ~((sector_t)mddev->chunk_sectors - 1);
struct r5conf *conf = mddev->private;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
if (mddev->external_size &&
mddev->array_sectors > newsize)
......@@ -7412,6 +7439,7 @@ static void end_reshape(struct r5conf *conf)
rdev->data_offset = rdev->new_data_offset;
smp_wmb();
conf->reshape_progress = MaxSector;
conf->mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
......
......@@ -265,7 +265,7 @@ struct stripe_head_state {
int dec_preread_active;
unsigned long ops_request;
struct bio *return_bi;
struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
};
......@@ -476,6 +476,9 @@ struct r5conf {
int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */
/* bios to have bi_end_io called after metadata is synced */
struct bio_list return_bi;
atomic_t reshape_stripes; /* stripes with pending writes for reshape */
/* unfortunately we need two cache names as we temporarily have
* two caches.
......
......@@ -40,9 +40,20 @@
(unsigned long)bytes, ptrs); \
kernel_neon_end(); \
} \
static void raid6_neon ## _n ## _xor_syndrome(int disks, \
int start, int stop, \
size_t bytes, void **ptrs) \
{ \
void raid6_neon ## _n ## _xor_syndrome_real(int, \
int, int, unsigned long, void**); \
kernel_neon_begin(); \
raid6_neon ## _n ## _xor_syndrome_real(disks, \
start, stop, (unsigned long)bytes, ptrs); \
kernel_neon_end(); \
} \
struct raid6_calls const raid6_neonx ## _n = { \
raid6_neon ## _n ## _gen_syndrome, \
NULL, /* XOR not yet implemented */ \
raid6_neon ## _n ## _xor_syndrome, \
raid6_have_neon, \
"neonx" #_n, \
0 \
......
......@@ -3,6 +3,7 @@
* neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
*
* Copyright (C) 2012 Rob Herring
* Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* Based on altivec.uc:
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
......@@ -78,3 +79,48 @@ void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
vst1q_u8(&q[d+NSIZE*$$], wq$$);
}
}
void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
unsigned long bytes, void **ptrs)
{
uint8_t **dptr = (uint8_t **)ptrs;
uint8_t *p, *q;
int d, z, z0;
register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
const unative_t x1d = NBYTES(0x1d);
z0 = stop; /* P/Q right side optimization */
p = dptr[disks-2]; /* XOR parity */
q = dptr[disks-1]; /* RS syndrome */
for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);
/* P/Q data pages */
for ( z = z0-1 ; z >= start ; z-- ) {
wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
wp$$ = veorq_u8(wp$$, wd$$);
w2$$ = MASK(wq$$);
w1$$ = SHLBYTE(wq$$);
w2$$ = vandq_u8(w2$$, x1d);
w1$$ = veorq_u8(w1$$, w2$$);
wq$$ = veorq_u8(w1$$, wd$$);
}
/* P/Q left side optimization */
for ( z = start-1 ; z >= 0 ; z-- ) {
w2$$ = MASK(wq$$);
w1$$ = SHLBYTE(wq$$);
w2$$ = vandq_u8(w2$$, x1d);
wq$$ = veorq_u8(w1$$, w2$$);
}
w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
wq$$ = veorq_u8(wq$$, w1$$);
vst1q_u8(&p[d+NSIZE*$$], wp$$);
vst1q_u8(&q[d+NSIZE*$$], wq$$);
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment