Commit 0bd7c5d8 authored by Jens Axboe

Merge tag 'md-next-20231219' of...

Merge tag 'md-next-20231219' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.8/block

Pull MD updates from Song:

"1. Remove deprecated flavors, by Song Liu;
 2. raid1 read error check support, by Li Nan;
 3. Better handle events off-by-1 case, by Alex Lyakas."

* tag 'md-next-20231219' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md:
  md: Remove deprecated CONFIG_MD_FAULTY
  md: Remove deprecated CONFIG_MD_MULTIPATH
  md: Remove deprecated CONFIG_MD_LINEAR
  md/raid1: support read error check
  md: factor out a helper exceed_read_errors() to check read_errors
  md: Whenassemble the array, consult the superblock of the freshest device
  md/raid1: remove unnecessary null checking
parents 4c434392 415c7451
...@@ -61,19 +61,6 @@ config MD_BITMAP_FILE
various kernel APIs and can only work with files on a file system not
actually sitting on the MD device.
config MD_LINEAR
tristate "Linear (append) mode (deprecated)"
depends on BLK_DEV_MD
help
If you say Y here, then your multiple devices driver will be able to
use the so-called linear mode, i.e. it will combine the hard disk
partitions by simply appending one to the other.
To compile this as a module, choose M here: the module
will be called linear.
If unsure, say Y.
config MD_RAID0
tristate "RAID-0 (striping) mode"
depends on BLK_DEV_MD
...@@ -172,27 +159,6 @@ config MD_RAID456
If unsure, say Y.
config MD_MULTIPATH
tristate "Multipath I/O support (deprecated)"
depends on BLK_DEV_MD
help
MD_MULTIPATH provides a simple multi-path personality for use
with the MD framework. It is not under active development. New
projects should consider using DM_MULTIPATH which has more
features and more testing.
If unsure, say N.
config MD_FAULTY
tristate "Faulty test module for MD (deprecated)"
depends on BLK_DEV_MD
help
The "faulty" module allows for a block device that occasionally returns
read or write errors. It is useful for testing.
If unsure, say N.
config MD_CLUSTER
tristate "Cluster Support for MD"
depends on BLK_DEV_MD
......
...@@ -29,22 +29,16 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
md-mod-y += md.o md-bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
linear-y += md-linear.o
multipath-y += md-multipath.o
faulty-y += md-faulty.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
# themselves, and md.o may use the personalities when it
# auto-initialised.
obj-$(CONFIG_MD_LINEAR) += linear.o
obj-$(CONFIG_MD_RAID0) += raid0.o
obj-$(CONFIG_MD_RAID1) += raid1.o
obj-$(CONFIG_MD_RAID10) += raid10.o
obj-$(CONFIG_MD_RAID456) += raid456.o
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
obj-$(CONFIG_MD_FAULTY) += faulty.o
obj-$(CONFIG_MD_CLUSTER) += md-cluster.o
obj-$(CONFIG_BCACHE) += bcache/
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
......
...@@ -49,7 +49,6 @@ static int md_setup_ents __initdata;
* instead of just one. -- KTK
* 18May2000: Added support for persistent-superblock arrays:
* md=n,0,factor,fault,device-list uses RAID0 for device n
* md=n,-1,factor,fault,device-list uses LINEAR for device n
* md=n,device-list reads a RAID superblock from the devices
* elements in device-list are read by name_to_kdev_t so can be
* a hex number or something like /dev/hda1 /dev/sdb
...@@ -88,7 +87,7 @@ static int __init md_setup(char *str)
md_setup_ents++;
switch (get_option(&str, &level)) { /* RAID level */
case 2: /* could be 0 or -1.. */
if (level == 0 || level == LEVEL_LINEAR) {
if (level == 0) {
if (get_option(&str, &factor) != 2 || /* Chunk Size */
get_option(&str, &fault) != 2) {
printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
...@@ -96,10 +95,7 @@ static int __init md_setup(char *str)
}
md_setup_args[ent].level = level;
md_setup_args[ent].chunk = 1 << (factor+12);
if (level == LEVEL_LINEAR)
pername = "linear";
else
pername = "raid0";
pername = "raid0";
break;
}
fallthrough;
......
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* faulty.c : Multiple Devices driver for Linux
*
* Copyright (C) 2004 Neil Brown
*
* faulty-device-simulator personality for md
*/
/*
* The "faulty" personality causes some requests to fail.
*
* Possible failure modes are:
* reads fail "randomly" but succeed on retry
* writes fail "randomly" but succeed on retry
* reads for some address fail and then persist until a write
* reads for some address fail and then persist irrespective of write
* writes for some address fail and persist
* all writes fail
*
* Different modes can be active at a time, but only
* one can be set at array creation. Others can be added later.
* A mode can be one-shot or recurrent with the recurrence being
* once in every N requests.
* The bottom 5 bits of the "layout" indicate the mode. The
* remainder indicate a period, or 0 for one-shot.
*
* There is an implementation limit on the number of concurrently
* persisting-faulty blocks. When a new fault is requested that would
* exceed the limit, it is ignored.
* All current faults can be cleared using a layout of "0".
*
* Requests are always sent to the device. If they are to fail,
* we clone the bio and insert a new b_end_io into the chain.
*/
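Editor's note (not part of the original file): to make the layout encoding described above concrete, the mode occupies the low ModeShift bits of the layout value and the period occupies the remaining bits, with a period of 0 meaning one-shot. A minimal standalone sketch of that arithmetic, using the macro values defined just below:

	#include <stdio.h>

	#define WritePersistent 2   /* mode value, as defined below */
	#define ModeShift       5   /* mode lives in the low 5 bits of "layout" */

	int main(void)
	{
		int period = 10;    /* recur once every 10 requests; 0 would mean one-shot */
		int layout = (period << ModeShift) | WritePersistent;

		printf("layout = %d (0x%x)\n", layout, layout);   /* prints 322 (0x142) */
		return 0;
	}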
#define WriteTransient 0
#define ReadTransient 1
#define WritePersistent 2
#define ReadPersistent 3
#define WriteAll 4 /* doesn't go to device */
#define ReadFixable 5
#define Modes 6
#define ClearErrors 31
#define ClearFaults 30
#define AllPersist 100 /* internal use only */
#define NoPersist 101
#define ModeMask 0x1f
#define ModeShift 5
#define MaxFault 50
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include "md.h"
#include <linux/seq_file.h>
static void faulty_fail(struct bio *bio)
{
struct bio *b = bio->bi_private;
b->bi_iter.bi_size = bio->bi_iter.bi_size;
b->bi_iter.bi_sector = bio->bi_iter.bi_sector;
bio_put(bio);
bio_io_error(b);
}
struct faulty_conf {
int period[Modes];
atomic_t counters[Modes];
sector_t faults[MaxFault];
int modes[MaxFault];
int nfaults;
struct md_rdev *rdev;
};
static int check_mode(struct faulty_conf *conf, int mode)
{
if (conf->period[mode] == 0 &&
atomic_read(&conf->counters[mode]) <= 0)
return 0; /* no failure, no decrement */
if (atomic_dec_and_test(&conf->counters[mode])) {
if (conf->period[mode])
atomic_set(&conf->counters[mode], conf->period[mode]);
return 1;
}
return 0;
}
static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir)
{
/* If we find a ReadFixable sector, we fix it ... */
int i;
for (i=0; i<conf->nfaults; i++)
if (conf->faults[i] >= start &&
conf->faults[i] < end) {
/* found it ... */
switch (conf->modes[i] * 2 + dir) {
case WritePersistent*2+WRITE: return 1;
case ReadPersistent*2+READ: return 1;
case ReadFixable*2+READ: return 1;
case ReadFixable*2+WRITE:
conf->modes[i] = NoPersist;
return 0;
case AllPersist*2+READ:
case AllPersist*2+WRITE: return 1;
default:
return 0;
}
}
return 0;
}
static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
{
int i;
int n = conf->nfaults;
for (i=0; i<conf->nfaults; i++)
if (conf->faults[i] == start) {
switch(mode) {
case NoPersist: conf->modes[i] = mode; return;
case WritePersistent:
if (conf->modes[i] == ReadPersistent ||
conf->modes[i] == ReadFixable)
conf->modes[i] = AllPersist;
else
conf->modes[i] = WritePersistent;
return;
case ReadPersistent:
if (conf->modes[i] == WritePersistent)
conf->modes[i] = AllPersist;
else
conf->modes[i] = ReadPersistent;
return;
case ReadFixable:
if (conf->modes[i] == WritePersistent ||
conf->modes[i] == ReadPersistent)
conf->modes[i] = AllPersist;
else
conf->modes[i] = ReadFixable;
return;
}
} else if (conf->modes[i] == NoPersist)
n = i;
if (n >= MaxFault)
return;
conf->faults[n] = start;
conf->modes[n] = mode;
if (conf->nfaults == n)
conf->nfaults = n+1;
}
static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
{
struct faulty_conf *conf = mddev->private;
int failit = 0;
if (bio_data_dir(bio) == WRITE) {
/* write request */
if (atomic_read(&conf->counters[WriteAll])) {
/* special case - don't decrement, don't submit_bio_noacct,
* just fail immediately
*/
bio_io_error(bio);
return true;
}
if (check_sector(conf, bio->bi_iter.bi_sector,
bio_end_sector(bio), WRITE))
failit = 1;
if (check_mode(conf, WritePersistent)) {
add_sector(conf, bio->bi_iter.bi_sector,
WritePersistent);
failit = 1;
}
if (check_mode(conf, WriteTransient))
failit = 1;
} else {
/* read request */
if (check_sector(conf, bio->bi_iter.bi_sector,
bio_end_sector(bio), READ))
failit = 1;
if (check_mode(conf, ReadTransient))
failit = 1;
if (check_mode(conf, ReadPersistent)) {
add_sector(conf, bio->bi_iter.bi_sector,
ReadPersistent);
failit = 1;
}
if (check_mode(conf, ReadFixable)) {
add_sector(conf, bio->bi_iter.bi_sector,
ReadFixable);
failit = 1;
}
}
md_account_bio(mddev, &bio);
if (failit) {
struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
&mddev->bio_set);
b->bi_private = bio;
b->bi_end_io = faulty_fail;
bio = b;
} else
bio_set_dev(bio, conf->rdev->bdev);
submit_bio_noacct(bio);
return true;
}
static void faulty_status(struct seq_file *seq, struct mddev *mddev)
{
struct faulty_conf *conf = mddev->private;
int n;
if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
seq_printf(seq, " WriteTransient=%d(%d)",
n, conf->period[WriteTransient]);
if ((n=atomic_read(&conf->counters[ReadTransient])) != 0)
seq_printf(seq, " ReadTransient=%d(%d)",
n, conf->period[ReadTransient]);
if ((n=atomic_read(&conf->counters[WritePersistent])) != 0)
seq_printf(seq, " WritePersistent=%d(%d)",
n, conf->period[WritePersistent]);
if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0)
seq_printf(seq, " ReadPersistent=%d(%d)",
n, conf->period[ReadPersistent]);
if ((n=atomic_read(&conf->counters[ReadFixable])) != 0)
seq_printf(seq, " ReadFixable=%d(%d)",
n, conf->period[ReadFixable]);
if ((n=atomic_read(&conf->counters[WriteAll])) != 0)
seq_printf(seq, " WriteAll");
seq_printf(seq, " nfaults=%d", conf->nfaults);
}
static int faulty_reshape(struct mddev *mddev)
{
int mode = mddev->new_layout & ModeMask;
int count = mddev->new_layout >> ModeShift;
struct faulty_conf *conf = mddev->private;
if (mddev->new_layout < 0)
return 0;
/* new layout */
if (mode == ClearFaults)
conf->nfaults = 0;
else if (mode == ClearErrors) {
int i;
for (i=0 ; i < Modes ; i++) {
conf->period[i] = 0;
atomic_set(&conf->counters[i], 0);
}
} else if (mode < Modes) {
conf->period[mode] = count;
if (!count) count++;
atomic_set(&conf->counters[mode], count);
} else
return -EINVAL;
mddev->new_layout = -1;
mddev->layout = -1; /* makes sure further changes come through */
return 0;
}
static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
WARN_ONCE(raid_disks,
"%s does not support generic reshape\n", __func__);
if (sectors == 0)
return mddev->dev_sectors;
return sectors;
}
static int faulty_run(struct mddev *mddev)
{
struct md_rdev *rdev;
int i;
struct faulty_conf *conf;
if (md_check_no_bitmap(mddev))
return -EINVAL;
conf = kmalloc(sizeof(*conf), GFP_KERNEL);
if (!conf)
return -ENOMEM;
for (i=0; i<Modes; i++) {
atomic_set(&conf->counters[i], 0);
conf->period[i] = 0;
}
conf->nfaults = 0;
rdev_for_each(rdev, mddev) {
conf->rdev = rdev;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
}
md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
mddev->private = conf;
faulty_reshape(mddev);
return 0;
}
static void faulty_free(struct mddev *mddev, void *priv)
{
struct faulty_conf *conf = priv;
kfree(conf);
}
static struct md_personality faulty_personality =
{
.name = "faulty",
.level = LEVEL_FAULTY,
.owner = THIS_MODULE,
.make_request = faulty_make_request,
.run = faulty_run,
.free = faulty_free,
.status = faulty_status,
.check_reshape = faulty_reshape,
.size = faulty_size,
};
static int __init raid_init(void)
{
return register_md_personality(&faulty_personality);
}
static void raid_exit(void)
{
unregister_md_personality(&faulty_personality);
}
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Fault injection personality for MD (deprecated)");
MODULE_ALIAS("md-personality-10"); /* faulty */
MODULE_ALIAS("md-faulty");
MODULE_ALIAS("md-level--5");
// SPDX-License-Identifier: GPL-2.0-or-later
/*
linear.c : Multiple Devices driver for Linux
Copyright (C) 1994-96 Marc ZYNGIER
<zyngier@ufr-info-p7.ibp.fr> or
<maz@gloups.fdn.fr>
Linear mode management functions.
*/
#include <linux/blkdev.h>
#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <trace/events/block.h>
#include "md.h"
#include "md-linear.h"
/*
* find which device holds a particular offset
*/
static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
{
int lo, mid, hi;
struct linear_conf *conf;
lo = 0;
hi = mddev->raid_disks - 1;
conf = mddev->private;
/*
* Binary Search
*/
while (hi > lo) {
mid = (hi + lo) / 2;
if (sector < conf->disks[mid].end_sector)
hi = mid;
else
lo = mid + 1;
}
return conf->disks + lo;
}
static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
struct linear_conf *conf;
sector_t array_sectors;
conf = mddev->private;
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
array_sectors = conf->array_sectors;
return array_sectors;
}
static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
{
struct linear_conf *conf;
struct md_rdev *rdev;
int i, cnt;
conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
if (!conf)
return NULL;
/*
* conf->raid_disks is copy of mddev->raid_disks. The reason to
* keep a copy of mddev->raid_disks in struct linear_conf is,
* mddev->raid_disks may not be consistent with pointers number of
* conf->disks[] when it is updated in linear_add() and used to
* iterate old conf->disks[] earray in linear_congested().
* Here conf->raid_disks is always consitent with number of
* pointers in conf->disks[] array, and mddev->private is updated
* with rcu_assign_pointer() in linear_addr(), such race can be
* avoided.
*/
conf->raid_disks = raid_disks;
cnt = 0;
conf->array_sectors = 0;
rdev_for_each(rdev, mddev) {
int j = rdev->raid_disk;
struct dev_info *disk = conf->disks + j;
sector_t sectors;
if (j < 0 || j >= raid_disks || disk->rdev) {
pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
mdname(mddev));
goto out;
}
disk->rdev = rdev;
if (mddev->chunk_sectors) {
sectors = rdev->sectors;
sector_div(sectors, mddev->chunk_sectors);
rdev->sectors = sectors * mddev->chunk_sectors;
}
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
conf->array_sectors += rdev->sectors;
cnt++;
}
if (cnt != raid_disks) {
pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
mdname(mddev));
goto out;
}
/*
* Here we calculate the device offsets.
*/
conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
for (i = 1; i < raid_disks; i++)
conf->disks[i].end_sector =
conf->disks[i-1].end_sector +
conf->disks[i].rdev->sectors;
return conf;
out:
kfree(conf);
return NULL;
}
static int linear_run (struct mddev *mddev)
{
struct linear_conf *conf;
int ret;
if (md_check_no_bitmap(mddev))
return -EINVAL;
conf = linear_conf(mddev, mddev->raid_disks);
if (!conf)
return 1;
mddev->private = conf;
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
ret = md_integrity_register(mddev);
if (ret) {
kfree(conf);
mddev->private = NULL;
}
return ret;
}
static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
{
/* Adding a drive to a linear array allows the array to grow.
* It is permitted if the new drive has a matching superblock
* already on it, with raid_disk equal to raid_disks.
* It is achieved by creating a new linear_private_data structure
* and swapping it in in-place of the current one.
* The current one is never freed until the array is stopped.
* This avoids races.
*/
struct linear_conf *newconf, *oldconf;
if (rdev->saved_raid_disk != mddev->raid_disks)
return -EINVAL;
rdev->raid_disk = rdev->saved_raid_disk;
rdev->saved_raid_disk = -1;
newconf = linear_conf(mddev,mddev->raid_disks+1);
if (!newconf)
return -ENOMEM;
/* newconf->raid_disks already keeps a copy of * the increased
* value of mddev->raid_disks, WARN_ONCE() is just used to make
* sure of this. It is possible that oldconf is still referenced
* in linear_congested(), therefore kfree_rcu() is used to free
* oldconf until no one uses it anymore.
*/
oldconf = rcu_dereference_protected(mddev->private,
lockdep_is_held(&mddev->reconfig_mutex));
mddev->raid_disks++;
WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
"copied raid_disks doesn't match mddev->raid_disks");
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
kfree_rcu(oldconf, rcu);
return 0;
}
static void linear_free(struct mddev *mddev, void *priv)
{
struct linear_conf *conf = priv;
kfree(conf);
}
static bool linear_make_request(struct mddev *mddev, struct bio *bio)
{
struct dev_info *tmp_dev;
sector_t start_sector, end_sector, data_offset;
sector_t bio_sector = bio->bi_iter.bi_sector;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)
&& md_flush_request(mddev, bio))
return true;
tmp_dev = which_dev(mddev, bio_sector);
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
end_sector = tmp_dev->end_sector;
data_offset = tmp_dev->rdev->data_offset;
if (unlikely(bio_sector >= end_sector ||
bio_sector < start_sector))
goto out_of_bounds;
if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
md_error(mddev, tmp_dev->rdev);
bio_io_error(bio);
return true;
}
if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to split it */
struct bio *split = bio_split(bio, end_sector - bio_sector,
GFP_NOIO, &mddev->bio_set);
bio_chain(split, bio);
submit_bio_noacct(bio);
bio = split;
}
md_account_bio(mddev, &bio);
bio_set_dev(bio, tmp_dev->rdev->bdev);
bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
start_sector + data_offset;
if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
!bdev_max_discard_sectors(bio->bi_bdev))) {
/* Just ignore it */
bio_endio(bio);
} else {
if (mddev->gendisk)
trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
bio_sector);
mddev_check_write_zeroes(mddev, bio);
submit_bio_noacct(bio);
}
return true;
out_of_bounds:
pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n",
mdname(mddev),
(unsigned long long)bio->bi_iter.bi_sector,
tmp_dev->rdev->bdev,
(unsigned long long)tmp_dev->rdev->sectors,
(unsigned long long)start_sector);
bio_io_error(bio);
return true;
}
static void linear_status (struct seq_file *seq, struct mddev *mddev)
{
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
}
static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
{
if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
char *md_name = mdname(mddev);
pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
md_name, rdev->bdev);
}
}
static void linear_quiesce(struct mddev *mddev, int state)
{
}
static struct md_personality linear_personality =
{
.name = "linear",
.level = LEVEL_LINEAR,
.owner = THIS_MODULE,
.make_request = linear_make_request,
.run = linear_run,
.free = linear_free,
.status = linear_status,
.hot_add_disk = linear_add,
.size = linear_size,
.quiesce = linear_quiesce,
.error_handler = linear_error,
};
static int __init linear_init (void)
{
return register_md_personality (&linear_personality);
}
static void linear_exit (void)
{
unregister_md_personality (&linear_personality);
}
module_init(linear_init);
module_exit(linear_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
MODULE_ALIAS("md-linear");
MODULE_ALIAS("md-level--1");
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* multipath.c : Multiple Devices driver for Linux
*
* Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
*
* Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
*
* MULTIPATH management functions.
*
* derived from raid1.c.
*/
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include "md.h"
#include "md-multipath.h"
#define MAX_WORK_PER_DISK 128
#define NR_RESERVED_BUFS 32
static int multipath_map (struct mpconf *conf)
{
int i, disks = conf->raid_disks;
/*
* Later we do read balancing on the read side
* now we use the first available disk.
*/
for (i = 0; i < disks; i++) {
struct md_rdev *rdev = conf->multipaths[i].rdev;
if (rdev && test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
return i;
}
}
pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
return (-1);
}
static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{
unsigned long flags;
struct mddev *mddev = mp_bh->mddev;
struct mpconf *conf = mddev->private;
spin_lock_irqsave(&conf->device_lock, flags);
list_add(&mp_bh->retry_list, &conf->retry_list);
spin_unlock_irqrestore(&conf->device_lock, flags);
md_wakeup_thread(mddev->thread);
}
/*
* multipath_end_bh_io() is called when we have finished servicing a multipathed
* operation and are ready to return a success/failure code to the buffer
* cache layer.
*/
static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
{
struct bio *bio = mp_bh->master_bio;
struct mpconf *conf = mp_bh->mddev->private;
bio->bi_status = status;
bio_endio(bio);
mempool_free(mp_bh, &conf->pool);
}
static void multipath_end_request(struct bio *bio)
{
struct multipath_bh *mp_bh = bio->bi_private;
struct mpconf *conf = mp_bh->mddev->private;
struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
if (!bio->bi_status)
multipath_end_bh_io(mp_bh, 0);
else if (!(bio->bi_opf & REQ_RAHEAD)) {
/*
* oops, IO error:
*/
md_error (mp_bh->mddev, rdev);
pr_info("multipath: %pg: rescheduling sector %llu\n",
rdev->bdev,
(unsigned long long)bio->bi_iter.bi_sector);
multipath_reschedule_retry(mp_bh);
} else
multipath_end_bh_io(mp_bh, bio->bi_status);
rdev_dec_pending(rdev, conf->mddev);
}
static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
{
struct mpconf *conf = mddev->private;
struct multipath_bh * mp_bh;
struct multipath_info *multipath;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)
&& md_flush_request(mddev, bio))
return true;
md_account_bio(mddev, &bio);
mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
mp_bh->master_bio = bio;
mp_bh->mddev = mddev;
mp_bh->path = multipath_map(conf);
if (mp_bh->path < 0) {
bio_io_error(bio);
mempool_free(mp_bh, &conf->pool);
return true;
}
multipath = conf->multipaths + mp_bh->path;
bio_init_clone(multipath->rdev->bdev, &mp_bh->bio, bio, GFP_NOIO);
mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
mp_bh->bio.bi_end_io = multipath_end_request;
mp_bh->bio.bi_private = mp_bh;
mddev_check_write_zeroes(mddev, &mp_bh->bio);
submit_bio_noacct(&mp_bh->bio);
return true;
}
static void multipath_status(struct seq_file *seq, struct mddev *mddev)
{
struct mpconf *conf = mddev->private;
int i;
lockdep_assert_held(&mddev->lock);
seq_printf (seq, " [%d/%d] [", conf->raid_disks,
conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = READ_ONCE(conf->multipaths[i].rdev);
seq_printf(seq, "%s",
rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
seq_putc(seq, ']');
}
/*
* Careful, this can execute in IRQ contexts as well!
*/
static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
{
struct mpconf *conf = mddev->private;
if (conf->raid_disks - mddev->degraded <= 1) {
/*
* Uh oh, we can do nothing if this is our last path, but
* first check if this is a queued request for a device
* which has just failed.
*/
pr_warn("multipath: only one IO path left and IO error.\n");
/* leave it active... it's all we have */
return;
}
/*
* Mark disk as unusable
*/
if (test_and_clear_bit(In_sync, &rdev->flags)) {
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
spin_unlock_irqrestore(&conf->device_lock, flags);
}
set_bit(Faulty, &rdev->flags);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
pr_err("multipath: IO failure on %pg, disabling IO path.\n"
"multipath: Operation continuing on %d IO paths.\n",
rdev->bdev,
conf->raid_disks - mddev->degraded);
}
static void print_multipath_conf(struct mpconf *conf)
{
int i;
struct multipath_info *tmp;
pr_debug("MULTIPATH conf printout:\n");
if (!conf) {
pr_debug("(conf==NULL)\n");
return;
}
pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);
lockdep_assert_held(&conf->mddev->reconfig_mutex);
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->multipaths + i;
if (tmp->rdev)
pr_debug(" disk%d, o:%d, dev:%pg\n",
i,!test_bit(Faulty, &tmp->rdev->flags),
tmp->rdev->bdev);
}
}
static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct mpconf *conf = mddev->private;
int err = -EEXIST;
int path;
struct multipath_info *p;
int first = 0;
int last = mddev->raid_disks - 1;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
print_multipath_conf(conf);
for (path = first; path <= last; path++)
if ((p=conf->multipaths+path)->rdev == NULL) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
err = md_integrity_add_rdev(rdev, mddev);
if (err)
break;
spin_lock_irq(&conf->device_lock);
mddev->degraded--;
rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags);
spin_unlock_irq(&conf->device_lock);
WRITE_ONCE(p->rdev, rdev);
err = 0;
break;
}
print_multipath_conf(conf);
return err;
}
static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct mpconf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
struct multipath_info *p = conf->multipaths + number;
print_multipath_conf(conf);
if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
err = -EBUSY;
goto abort;
}
WRITE_ONCE(p->rdev, NULL);
err = md_integrity_register(mddev);
}
abort:
print_multipath_conf(conf);
return err;
}
/*
* This is a kernel thread which:
*
* 1. Retries failed read operations on working multipaths.
* 2. Updates the raid superblock when problems encounter.
* 3. Performs writes following reads for array syncronising.
*/
static void multipathd(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct multipath_bh *mp_bh;
struct bio *bio;
unsigned long flags;
struct mpconf *conf = mddev->private;
struct list_head *head = &conf->retry_list;
md_check_recovery(mddev);
for (;;) {
spin_lock_irqsave(&conf->device_lock, flags);
if (list_empty(head))
break;
mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
list_del(head->prev);
spin_unlock_irqrestore(&conf->device_lock, flags);
bio = &mp_bh->bio;
bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
if ((mp_bh->path = multipath_map (conf))<0) {
pr_err("multipath: %pg: unrecoverable IO read error for block %llu\n",
bio->bi_bdev,
(unsigned long long)bio->bi_iter.bi_sector);
multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
} else {
pr_err("multipath: %pg: redirecting sector %llu to another IO path\n",
bio->bi_bdev,
(unsigned long long)bio->bi_iter.bi_sector);
*bio = *(mp_bh->master_bio);
bio->bi_iter.bi_sector +=
conf->multipaths[mp_bh->path].rdev->data_offset;
bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev);
bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
bio->bi_end_io = multipath_end_request;
bio->bi_private = mp_bh;
submit_bio_noacct(bio);
}
}
spin_unlock_irqrestore(&conf->device_lock, flags);
}
static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
return mddev->dev_sectors;
}
static int multipath_run (struct mddev *mddev)
{
struct mpconf *conf;
int disk_idx;
struct multipath_info *disk;
struct md_rdev *rdev;
int working_disks;
int ret;
if (md_check_no_bitmap(mddev))
return -EINVAL;
if (mddev->level != LEVEL_MULTIPATH) {
pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
mdname(mddev), mddev->level);
goto out;
}
/*
* copy the already verified devices into our private MULTIPATH
* bookkeeping area. [whatever we allocate in multipath_run(),
* should be freed in multipath_free()]
*/
conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
mddev->private = conf;
if (!conf)
goto out;
conf->multipaths = kcalloc(mddev->raid_disks,
sizeof(struct multipath_info),
GFP_KERNEL);
if (!conf->multipaths)
goto out_free_conf;
working_disks = 0;
rdev_for_each(rdev, mddev) {
disk_idx = rdev->raid_disk;
if (disk_idx < 0 ||
disk_idx >= mddev->raid_disks)
continue;
disk = conf->multipaths + disk_idx;
disk->rdev = rdev;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
if (!test_bit(Faulty, &rdev->flags))
working_disks++;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
if (!working_disks) {
pr_warn("multipath: no operational IO paths for %s\n",
mdname(mddev));
goto out_free_conf;
}
mddev->degraded = conf->raid_disks - working_disks;
ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS,
sizeof(struct multipath_bh));
if (ret)
goto out_free_conf;
rcu_assign_pointer(mddev->thread,
md_register_thread(multipathd, mddev, "multipath"));
if (!mddev->thread)
goto out_free_conf;
pr_info("multipath: array %s active with %d out of %d IO paths\n",
mdname(mddev), conf->raid_disks - mddev->degraded,
mddev->raid_disks);
/*
* Ok, everything is just fine now
*/
md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
if (md_integrity_register(mddev))
goto out_free_conf;
return 0;
out_free_conf:
mempool_exit(&conf->pool);
kfree(conf->multipaths);
kfree(conf);
mddev->private = NULL;
out:
return -EIO;
}
static void multipath_free(struct mddev *mddev, void *priv)
{
struct mpconf *conf = priv;
mempool_exit(&conf->pool);
kfree(conf->multipaths);
kfree(conf);
}
static struct md_personality multipath_personality =
{
.name = "multipath",
.level = LEVEL_MULTIPATH,
.owner = THIS_MODULE,
.make_request = multipath_make_request,
.run = multipath_run,
.free = multipath_free,
.status = multipath_status,
.error_handler = multipath_error,
.hot_add_disk = multipath_add_disk,
.hot_remove_disk= multipath_remove_disk,
.size = multipath_size,
};
static int __init multipath_init (void)
{
return register_md_personality (&multipath_personality);
}
static void __exit multipath_exit (void)
{
unregister_md_personality (&multipath_personality);
}
module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)");
MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
MODULE_ALIAS("md-multipath");
MODULE_ALIAS("md-level--4");
...@@ -1206,6 +1206,7 @@ struct super_type {
struct md_rdev *refdev,
int minor_version);
int (*validate_super)(struct mddev *mddev,
struct md_rdev *freshest,
struct md_rdev *rdev);
void (*sync_super)(struct mddev *mddev,
struct md_rdev *rdev);
...@@ -1286,17 +1287,11 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
rdev->sb_size = MD_SB_BYTES;
rdev->badblocks.shift = -1;
if (sb->level == LEVEL_MULTIPATH)
rdev->desc_nr = -1;
else
rdev->desc_nr = sb->this_disk.number;
/* not spare disk, or LEVEL_MULTIPATH */
if (sb->level == LEVEL_MULTIPATH ||
(rdev->desc_nr >= 0 &&
rdev->desc_nr < MD_SB_DISKS &&
sb->disks[rdev->desc_nr].state &
((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
rdev->desc_nr = sb->this_disk.number;
/* not spare disk */
if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
spare_disk = false;
if (!refdev) {
...@@ -1343,8 +1338,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
/*
* validate_super for 0.90.0
* note: we are not using "freshest" for 0.9 superblock
*/
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
mdp_disk_t *desc;
mdp_super_t *sb = page_address(rdev->sb_page);
...@@ -1442,31 +1438,28 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
return 0;
}
if (mddev->level != LEVEL_MULTIPATH) {
desc = sb->disks + rdev->desc_nr;
if (desc->state & (1<<MD_DISK_FAULTY))
set_bit(Faulty, &rdev->flags);
else if (desc->state & (1<<MD_DISK_SYNC) /* &&
desc->raid_disk < mddev->raid_disks */) {
set_bit(In_sync, &rdev->flags);
rdev->raid_disk = desc->raid_disk;
rdev->saved_raid_disk = desc->raid_disk;
} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
/* active but not in sync implies recovery up to
* reshape position. We don't know exactly where
* that is, so set to zero for now */
if (mddev->minor_version >= 91) {
rdev->recovery_offset = 0;
rdev->raid_disk = desc->raid_disk;
}
}
if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags);
if (desc->state & (1<<MD_DISK_FAILFAST))
set_bit(FailFast, &rdev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
desc = sb->disks + rdev->desc_nr;
if (desc->state & (1<<MD_DISK_FAULTY))
set_bit(Faulty, &rdev->flags);
else if (desc->state & (1<<MD_DISK_SYNC)) {
set_bit(In_sync, &rdev->flags);
rdev->raid_disk = desc->raid_disk;
rdev->saved_raid_disk = desc->raid_disk;
} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
/* active but not in sync implies recovery up to
* reshape position. We don't know exactly where
* that is, so set to zero for now
*/
if (mddev->minor_version >= 91) {
rdev->recovery_offset = 0;
rdev->raid_disk = desc->raid_disk;
}
}
if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags);
if (desc->state & (1<<MD_DISK_FAILFAST))
set_bit(FailFast, &rdev->flags);
return 0;
}
...@@ -1756,10 +1749,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
&& rdev->new_data_offset < sb_start + (rdev->sb_size/512))
return -EINVAL;
if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
rdev->desc_nr = -1;
else
rdev->desc_nr = le32_to_cpu(sb->dev_number);
rdev->desc_nr = le32_to_cpu(sb->dev_number);
if (!rdev->bb_page) {
rdev->bb_page = alloc_page(GFP_KERNEL);
...@@ -1812,12 +1802,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
sb->level != 0)
return -EINVAL;
/* not spare disk, or LEVEL_MULTIPATH */
if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
(rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
/* not spare disk */
if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
spare_disk = false;
if (!refdev) {
...@@ -1856,10 +1844,11 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
return ret;
}
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
__u64 ev1 = le64_to_cpu(sb->events);
int role;
rdev->raid_disk = -1;
clear_bit(Faulty, &rdev->flags);
...@@ -1952,13 +1941,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
}
} else if (mddev->pers == NULL) {
/* Insist of good event counter while assembling, except for
* spares (which don't need an event count) */
++ev1;
/* Insist of good event counter while assembling, except for
* spares (which don't need an event count).
* Similar to mdadm, we allow event counter difference of 1
* from the freshest device.
*/
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
if (ev1 < mddev->events)
if (ev1 + 1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
/* If adding to array with a bitmap, then we can accept an
...@@ -1973,58 +1964,85 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
/* just a hot-add of a new device, leave raid_disk at -1 */
return 0;
}
if (mddev->level != LEVEL_MULTIPATH) {
int role;
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} else
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
case MD_DISK_ROLE_SPARE: /* spare */
break;
case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags);
break;
case MD_DISK_ROLE_JOURNAL: /* journal device */
if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
/* journal device without journal feature */
pr_warn("md: journal device provided without journal feature, ignoring the device\n");
return -EINVAL;
}
set_bit(Journal, &rdev->flags);
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
rdev->raid_disk = 0;
break;
default:
rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_OFFSET)) {
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
} else {
/*
* If the array is FROZEN, then the device can't
* be in_sync with rest of array.
*/
if (!test_bit(MD_RECOVERY_FROZEN,
&mddev->recovery))
set_bit(In_sync, &rdev->flags);
}
rdev->raid_disk = role;
break;
}
if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags);
if (sb->devflags & FailFast1)
set_bit(FailFast, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
/*
* If we are assembling, and our event counter is smaller than the
* highest event counter, we cannot trust our superblock about the role.
* It could happen that our rdev was marked as Faulty, and all other
* superblocks were updated with +1 event counter.
* Then, before the next superblock update, which typically happens when
* remove_and_add_spares() removes the device from the array, there was
* a crash or reboot.
* If we allow current rdev without consulting the freshest superblock,
* we could cause data corruption.
* Note that in this case our event counter is smaller by 1 than the
* highest, otherwise, this rdev would not be allowed into array;
* both kernel and mdadm allow event counter difference of 1.
*/
struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
if (rdev->desc_nr >= freshest_max_dev) {
/* this is unexpected, better not proceed */
pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
mdname(mddev), rdev->bdev, rdev->desc_nr,
freshest->bdev, freshest_max_dev);
return -EUCLEAN;
}
role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
mdname(mddev), rdev->bdev, role, role, freshest->bdev);
} else {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
}
switch (role) {
case MD_DISK_ROLE_SPARE: /* spare */
break;
case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags);
break;
case MD_DISK_ROLE_JOURNAL: /* journal device */
if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
/* journal device without journal feature */
pr_warn("md: journal device provided without journal feature, ignoring the device\n");
return -EINVAL;
}
set_bit(Journal, &rdev->flags);
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
rdev->raid_disk = 0;
break;
default:
rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_OFFSET)) {
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
} else {
/*
* If the array is FROZEN, then the device can't
* be in_sync with rest of array.
*/
if (!test_bit(MD_RECOVERY_FROZEN,
&mddev->recovery))
set_bit(In_sync, &rdev->flags);
}
rdev->raid_disk = role;
break;
}
if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags);
if (sb->devflags & FailFast1)
set_bit(FailFast, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
return 0;
}
...@@ -2842,10 +2860,6 @@ void md_update_sb(struct mddev *mddev, int force_change)
} else
pr_debug("md: %pg (skipping faulty)\n",
rdev->bdev);
if (mddev->level == LEVEL_MULTIPATH)
/* only need to write one superblock... */
break;
}
if (md_super_wait(mddev) < 0)
goto rewrite;
...@@ -2887,7 +2901,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
* and should be added immediately.
*/
super_types[mddev->major_version].
validate_super(mddev, rdev);
validate_super(mddev, NULL/*freshest*/, rdev);
err = mddev->pers->hot_add_disk(mddev, rdev);
if (err) {
md_kick_rdev_from_array(rdev);
...@@ -3824,7 +3838,7 @@ static int analyze_sbs(struct mddev *mddev)
}
super_types[mddev->major_version].
validate_super(mddev, freshest);
validate_super(mddev, NULL/*freshest*/, freshest);
i = 0;
rdev_for_each_safe(rdev, tmp, mddev) {
...@@ -3839,20 +3853,15 @@ static int analyze_sbs(struct mddev *mddev)
}
if (rdev != freshest) {
if (super_types[mddev->major_version].
validate_super(mddev, rdev)) {
validate_super(mddev, freshest, rdev)) {
pr_warn("md: kicking non-fresh %pg from array!\n",
rdev->bdev);
md_kick_rdev_from_array(rdev);
continue;
}
}
if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++;
rdev->raid_disk = rdev->desc_nr;
set_bit(In_sync, &rdev->flags);
} else if (rdev->raid_disk >=
(mddev->raid_disks - min(0, mddev->delta_disks)) &&
!test_bit(Journal, &rdev->flags)) {
if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
!test_bit(Journal, &rdev->flags)) {
rdev->raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
}
...@@ -6847,7 +6856,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
rdev->saved_raid_disk = rdev->raid_disk;
} else
super_types[mddev->major_version].
validate_super(mddev, rdev);
validate_super(mddev, NULL/*freshest*/, rdev);
if ((info->state & (1<<MD_DISK_SYNC)) &&
rdev->raid_disk != info->raid_disk) {
/* This was a hot-add request, but events doesn't
...@@ -8090,7 +8099,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
return;
mddev->pers->error_handler(mddev, rdev);
if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
if (mddev->pers->level == 0)
return;
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
......
...@@ -173,3 +173,57 @@ static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
else
md_bitmap_unplug(bitmap);
}
/*
* Used by fix_read_error() to decay the per rdev read_errors.
* We halve the read error count for every hour that has elapsed
* since the last recorded read error.
*/
static inline void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
long cur_time_mon;
unsigned long hours_since_last;
unsigned int read_errors = atomic_read(&rdev->read_errors);
cur_time_mon = ktime_get_seconds();
if (rdev->last_read_error == 0) {
/* first time we've seen a read error */
rdev->last_read_error = cur_time_mon;
return;
}
hours_since_last = (long)(cur_time_mon -
rdev->last_read_error) / 3600;
rdev->last_read_error = cur_time_mon;
/*
* if hours_since_last is > the number of bits in read_errors
* just set read errors to 0. We do this to avoid
* overflowing the shift of read_errors by hours_since_last.
*/
if (hours_since_last >= 8 * sizeof(read_errors))
atomic_set(&rdev->read_errors, 0);
else
atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
}
static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int read_errors;
check_decay_read_errors(mddev, rdev);
read_errors = atomic_inc_return(&rdev->read_errors);
if (read_errors > max_read_errors) {
pr_notice("md/"RAID_1_10_NAME":%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
mdname(mddev), rdev->bdev, read_errors, max_read_errors);
pr_notice("md/"RAID_1_10_NAME":%s: %pg: Failing raid device\n",
mdname(mddev), rdev->bdev);
md_error(mddev, rdev);
return true;
}
return false;
}
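Editor's note (not part of the patch): the decay arithmetic in check_decay_read_errors() above simply shifts the stored count right by the number of whole hours since the last recorded read error, so three idle hours turn 40 accumulated errors into 5, and anything older than the width of the counter resets it to zero. A minimal standalone sketch:

	#include <stdio.h>

	int main(void)
	{
		unsigned int read_errors = 40;      /* accumulated per-rdev count */
		unsigned long hours_since_last = 3; /* whole hours since the last read error */

		if (hours_since_last >= 8 * sizeof(read_errors))
			read_errors = 0;            /* avoid an over-wide shift */
		else
			read_errors >>= hours_since_last;

		printf("decayed read_errors = %u\n", read_errors);  /* prints 5 */
		return 0;
	}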
...@@ -49,6 +49,7 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
#define RAID_1_10_NAME "raid1"
#include "raid1-10.c" #include "raid1-10.c"
#define START(node) ((node)->start) #define START(node) ((node)->start)
...@@ -1124,8 +1125,6 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, ...@@ -1124,8 +1125,6 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO, behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO,
&r1_bio->mddev->bio_set); &r1_bio->mddev->bio_set);
if (!behind_bio)
return;
/* discard op, we don't support writezero/writesame yet */ /* discard op, we don't support writezero/writesame yet */
if (!bio_has_data(bio)) { if (!bio_has_data(bio)) {
...@@ -2257,16 +2256,24 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) ...@@ -2257,16 +2256,24 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
* 3. Performs writes following reads for array synchronising. * 3. Performs writes following reads for array synchronising.
*/ */
static void fix_read_error(struct r1conf *conf, int read_disk, static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
sector_t sect, int sectors)
{ {
sector_t sect = r1_bio->sector;
int sectors = r1_bio->sectors;
int read_disk = r1_bio->read_disk;
struct mddev *mddev = conf->mddev;
struct md_rdev *rdev = rcu_dereference(conf->mirrors[read_disk].rdev);
if (exceed_read_errors(mddev, rdev)) {
r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
return;
}
while(sectors) {
int s = sectors;
int d = read_disk;
int success = 0;
int start;
struct md_rdev *rdev;
if (s > (PAGE_SIZE>>9))
s = PAGE_SIZE >> 9;
...@@ -2507,8 +2514,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
if (mddev->ro == 0
&& !test_bit(FailFast, &rdev->flags)) {
freeze_array(conf, 1);
fix_read_error(conf, r1_bio->read_disk,
r1_bio->sector, r1_bio->sectors);
fix_read_error(conf, r1_bio);
unfreeze_array(conf);
} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
md_error(mddev, rdev);
......
...@@ -19,6 +19,8 @@
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#define RAID_1_10_NAME "raid10"
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"
...@@ -2592,42 +2594,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
}
}
/*
* Used by fix_read_error() to decay the per rdev read_errors.
* We halve the read error count for every hour that has elapsed
* since the last recorded read error.
*
*/
static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
{
long cur_time_mon;
unsigned long hours_since_last;
unsigned int read_errors = atomic_read(&rdev->read_errors);
cur_time_mon = ktime_get_seconds();
if (rdev->last_read_error == 0) {
/* first time we've seen a read error */
rdev->last_read_error = cur_time_mon;
return;
}
hours_since_last = (long)(cur_time_mon -
rdev->last_read_error) / 3600;
rdev->last_read_error = cur_time_mon;
/*
* if hours_since_last is > the number of bits in read_errors
* just set read errors to 0. We do this to avoid
* overflowing the shift of read_errors by hours_since_last.
*/
if (hours_since_last >= 8 * sizeof(read_errors))
atomic_set(&rdev->read_errors, 0);
else
atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
}
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
int sectors, struct page *page, enum req_op op)
{
...@@ -2665,7 +2631,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
int sect = 0; /* Offset from r10_bio->sector */
int sectors = r10_bio->sectors, slot = r10_bio->read_slot;
struct md_rdev *rdev;
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[slot].devnum;
/* still own a reference to this rdev, so it cannot
...@@ -2678,15 +2643,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
more fix_read_error() attempts */
return;
check_decay_read_errors(mddev, rdev);
if (exceed_read_errors(mddev, rdev)) {
atomic_inc(&rdev->read_errors);
if (atomic_read(&rdev->read_errors) > max_read_errors) {
pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
mdname(mddev), rdev->bdev,
atomic_read(&rdev->read_errors), max_read_errors);
pr_notice("md/raid10:%s: %pg: Failing raid device\n",
mdname(mddev), rdev->bdev);
md_error(mddev, rdev);
r10_bio->devs[slot].bio = IO_BLOCKED;
return;
}
......
...@@ -2,15 +2,11 @@
/*
md_p.h : physical layout of Linux RAID devices
Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
You should have received a copy of the GNU General Public License
(for example /usr/src/linux/COPYING); if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _MD_P_H
...@@ -237,7 +233,7 @@ struct mdp_superblock_1 {
char set_name[32]; /* set and interpreted by user-space */
__le64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/
__le32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */
__le32 level; /* 0,1,4,5 */
__le32 layout; /* only for raid5 and raid10 currently */
__le64 size; /* used size of component devices, in 512byte sectors */
......
...@@ -2,15 +2,11 @@
/*
md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
Copyright (C) 1998 Ingo Molnar
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
You should have received a copy of the GNU General Public License
(for example /usr/src/linux/COPYING); if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _UAPI_MD_U_H
...@@ -107,11 +103,6 @@ typedef struct mdu_array_info_s {
} mdu_array_info_t;
/* non-obvious values for 'level' */
#define LEVEL_MULTIPATH (-4)
#define LEVEL_LINEAR (-1)
#define LEVEL_FAULTY (-5)
/* we need a value for 'no level specified' and 0
* means 'raid0', so we need something else. This is
* for internal use only
......