Commit ab6094f9 authored by Linus Torvalds

Merge penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/md-merge

into penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/linux
parents 8309f3a8 86711d5e
/*
linear.c : Multiple Devices driver for Linux
Copyright (C) 1994-96 Marc ZYNGIER
Copyright (C) 1994-96 Marc ZYNGIER
<zyngier@ufr-info-p7.ibp.fr> or
<maz@gloups.fdn.fr>
......@@ -33,39 +33,45 @@ static int linear_run (mddev_t *mddev)
linear_conf_t *conf;
struct linear_hash *table;
mdk_rdev_t *rdev;
int size, i, j, nb_zone;
int size, i, nb_zone, cnt;
unsigned int curr_offset;
struct list_head *tmp;
MOD_INC_USE_COUNT;
conf = kmalloc (sizeof (*conf), GFP_KERNEL);
if (!conf)
goto out;
memset(conf, 0, sizeof(*conf));
mddev->private = conf;
if (md_check_ordering(mddev)) {
printk("linear: disks are not ordered, aborting!\n");
goto out;
}
/*
* Find the smallest device.
*/
conf->smallest = NULL;
curr_offset = 0;
ITERATE_RDEV_ORDERED(mddev,rdev,j) {
cnt = 0;
ITERATE_RDEV(mddev,rdev,tmp) {
int j = rdev->sb->this_disk.raid_disk;
dev_info_t *disk = conf->disks + j;
if (j < 0 || j > mddev->sb->raid_disks || disk->bdev) {
printk("linear: disk numbering problem. Aborting!\n");
goto out;
}
disk->dev = rdev->dev;
disk->bdev = rdev->bdev;
atomic_inc(&rdev->bdev->bd_count);
disk->size = rdev->size;
disk->offset = curr_offset;
curr_offset += disk->size;
if (!conf->smallest || (disk->size < conf->smallest->size))
conf->smallest = disk;
cnt++;
}
if (cnt != mddev->sb->raid_disks) {
printk("linear: not enough drives present. Aborting!\n");
goto out;
}
nb_zone = conf->nr_zones =
......@@ -81,10 +87,13 @@ static int linear_run (mddev_t *mddev)
* Here we generate the linear hash table
*/
table = conf->hash_table;
i = 0;
size = 0;
for (j = 0; j < mddev->nb_dev; j++) {
dev_info_t *disk = conf->disks + j;
curr_offset = 0;
for (i = 0; i < cnt; i++) {
dev_info_t *disk = conf->disks + i;
disk->offset = curr_offset;
curr_offset += disk->size;
if (size < 0) {
table[-1].dev1 = disk;
......@@ -130,12 +139,13 @@ static int linear_stop (mddev_t *mddev)
return 0;
}
static int linear_make_request (mddev_t *mddev, int rw, struct bio *bio)
static int linear_make_request (request_queue_t *q, struct bio *bio)
{
linear_conf_t *conf = mddev_to_conf(mddev);
struct linear_hash *hash;
dev_info_t *tmp_dev;
long block;
mddev_t *mddev = q->queuedata;
linear_conf_t *conf = mddev_to_conf(mddev);
struct linear_hash *hash;
dev_info_t *tmp_dev;
long block;
block = bio->bi_sector >> 1;
hash = conf->hash_table + (block / conf->smallest->size);
......@@ -186,7 +196,7 @@ static int linear_status (char *page, mddev_t *mddev)
}
sz += sprintf(page+sz, "\n");
#endif
sz += sprintf(page+sz, " %dk rounding", mddev->param.chunk_size/1024);
sz += sprintf(page+sz, " %dk rounding", mddev->sb->chunk_size/1024);
return sz;
}
......
......@@ -107,7 +107,7 @@ static ctl_table raid_root_table[] = {
* subsystems want to have a pre-defined structure
*/
struct hd_struct md_hd_struct[MAX_MD_DEVS];
static int md_maxreadahead[MAX_MD_DEVS];
static void md_recover_arrays(void);
static mdk_thread_t *md_recovery_thread;
int md_size[MAX_MD_DEVS];
......@@ -129,93 +129,111 @@ static struct gendisk md_gendisk=
/*
* Allows iteration over all existing md arrays
* all_mddevs_lock protects this list as well as mddev_map.
*/
static LIST_HEAD(all_mddevs);
static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
/*
* The mapping between kdev and mddev is not necessarily a simple
* one! Eg. HSM uses several sub-devices to implement Logical
* Volumes. All these sub-devices map to the same mddev.
* iterates through all used mddevs in the system.
* We take care to grab the all_mddevs_lock whenever navigating
* the list, and to always hold a refcount when unlocked.
* Any code which breaks out of this loop will own
* a reference to the current mddev and must mddev_put it.
*/
dev_mapping_t mddev_map[MAX_MD_DEVS];
/*
 * ITERATE_MDDEV(mddev, tmp) expands to a for-loop header over all_mddevs.
 *
 *  init:      take all_mddevs_lock, point tmp at the first list node and
 *             set mddev to NULL.
 *  condition: (entered with the lock held) if tmp is not the list head,
 *             call mddev_get() on the entry at tmp; drop the lock; if a
 *             previous mddev is set, mddev_put() it; set mddev to the
 *             entry at tmp; continue while tmp is not the list head.
 *  step:      re-take all_mddevs_lock and advance tmp to tmp->next.
 *
 * The loop body therefore runs with all_mddevs_lock released. When the
 * loop terminates normally, mddev has been set to list_entry() of the list
 * head itself, so it must not be dereferenced after the loop.
 */
#define ITERATE_MDDEV(mddev,tmp) \
\
for (spin_lock(&all_mddevs_lock), \
(tmp = all_mddevs.next), \
(mddev = NULL); \
(void)(tmp != &all_mddevs && \
mddev_get(list_entry(tmp, mddev_t, all_mddevs))),\
spin_unlock(&all_mddevs_lock), \
(mddev ? mddev_put(mddev):(void)NULL), \
(mddev = list_entry(tmp, mddev_t, all_mddevs)), \
(tmp != &all_mddevs); \
spin_lock(&all_mddevs_lock), \
(tmp = tmp->next) \
)
static mddev_t *mddev_map[MAX_MD_DEVS];
/*
 * Request handler that unconditionally fails the bio it is given:
 * the bio is completed through bio_io_error() and 0 is returned.
 * The queue argument is not used.
 */
static int md_fail_request (request_queue_t *q, struct bio *bio)
{
	int ret = 0;

	bio_io_error(bio);

	return ret;
}
void add_mddev_mapping(mddev_t * mddev, kdev_t dev, void *data)
static inline mddev_t *mddev_get(mddev_t *mddev)
{
unsigned int minor = minor(dev);
if (major(dev) != MD_MAJOR) {
MD_BUG();
return;
}
if (mddev_map[minor].mddev) {
MD_BUG();
return;
}
mddev_map[minor].mddev = mddev;
mddev_map[minor].data = data;
atomic_inc(&mddev->active);
return mddev;
}
void del_mddev_mapping(mddev_t * mddev, kdev_t dev)
static void mddev_put(mddev_t *mddev)
{
unsigned int minor = minor(dev);
if (major(dev) != MD_MAJOR) {
MD_BUG();
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
if (!mddev->sb && list_empty(&mddev->disks)) {
list_del(&mddev->all_mddevs);
mddev_map[mdidx(mddev)] = NULL;
kfree(mddev);
MOD_DEC_USE_COUNT;
}
if (mddev_map[minor].mddev != mddev) {
MD_BUG();
return;
}
mddev_map[minor].mddev = NULL;
mddev_map[minor].data = NULL;
spin_unlock(&all_mddevs_lock);
}
static int md_make_request (request_queue_t *q, struct bio *bio)
static mddev_t * mddev_find(int unit)
{
mddev_t *mddev = kdev_to_mddev(to_kdev_t(bio->bi_bdev->bd_dev));
mddev_t *mddev, *new = NULL;
if (mddev && mddev->pers)
return mddev->pers->make_request(mddev, bio_rw(bio), bio);
else {
bio_io_error(bio);
return 0;
retry:
spin_lock(&all_mddevs_lock);
if (mddev_map[unit]) {
mddev = mddev_get(mddev_map[unit]);
spin_unlock(&all_mddevs_lock);
if (new)
kfree(new);
return mddev;
}
}
static mddev_t * alloc_mddev(kdev_t dev)
{
mddev_t *mddev;
if (major(dev) != MD_MAJOR) {
MD_BUG();
return 0;
if (new) {
mddev_map[unit] = new;
list_add(&new->all_mddevs, &all_mddevs);
spin_unlock(&all_mddevs_lock);
MOD_INC_USE_COUNT;
return new;
}
mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
if (!mddev)
spin_unlock(&all_mddevs_lock);
new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
if (!new)
return NULL;
memset(mddev, 0, sizeof(*mddev));
memset(new, 0, sizeof(*new));
mddev->__minor = minor(dev);
init_MUTEX(&mddev->reconfig_sem);
init_MUTEX(&mddev->recovery_sem);
init_MUTEX(&mddev->resync_sem);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
atomic_set(&mddev->active, 0);
new->__minor = unit;
init_MUTEX(&new->reconfig_sem);
INIT_LIST_HEAD(&new->disks);
INIT_LIST_HEAD(&new->all_mddevs);
atomic_set(&new->active, 1);
/*
* The 'base' mddev is the one with data NULL.
* personalities can create additional mddevs
* if necessary.
*/
add_mddev_mapping(mddev, dev, 0);
list_add(&mddev->all_mddevs, &all_mddevs);
goto retry;
}
MOD_INC_USE_COUNT;
/*
 * Take mddev->reconfig_sem with down_interruptible() and hand back its
 * result unchanged.  Callers in this file treat a non-zero return as
 * "lock not taken".
 */
static inline int mddev_lock(mddev_t * mddev)
{
	int rc = down_interruptible(&mddev->reconfig_sem);

	return rc;
}
return mddev;
/*
 * Try to take mddev->reconfig_sem via down_trylock() and return that
 * call's result unchanged.
 */
static inline int mddev_trylock(mddev_t * mddev)
{
	int rc = down_trylock(&mddev->reconfig_sem);

	return rc;
}
/*
 * Release mddev->reconfig_sem (counterpart of mddev_lock() and
 * mddev_trylock()).
 */
static inline void mddev_unlock(mddev_t * mddev)
{
	up(&mddev->reconfig_sem);
}
mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
......@@ -249,13 +267,12 @@ char * partition_name(kdev_t dev)
struct gendisk *hd;
static char nomem [] = "<nomem>";
dev_name_t *dname;
struct list_head *tmp = device_names.next;
struct list_head *tmp;
while (tmp != &device_names) {
list_for_each(tmp, &device_names) {
dname = list_entry(tmp, dev_name_t, list);
if (kdev_same(dname->dev, dev))
return dname->name;
tmp = tmp->next;
}
dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
......@@ -275,7 +292,6 @@ char * partition_name(kdev_t dev)
}
dname->dev = dev;
INIT_LIST_HEAD(&dname->list);
list_add(&dname->list, &device_names);
return dname->name;
......@@ -326,69 +342,6 @@ static unsigned int zoned_raid_size(mddev_t *mddev)
return 0;
}
/*
* We check whether all devices are numbered from 0 to nb_dev-1. The
* order is guaranteed even after device name changes.
*
* Some personalities (raid0, linear) use this. Personalities that
* provide data have to be able to deal with loss of individual
* disks, so they do their checking themselves.
*/
/*
 * Verify that the member devices of @mddev form a complete, healthy,
 * uniquely numbered set.
 *
 * Checks, in order:
 *   1. no member device is marked faulty;
 *   2. the number of devices on the list equals mddev->nb_dev
 *      (MD_BUG() otherwise);
 *   3. mddev->nb_dev equals the superblock's raid_disks;
 *   4. every descriptor number 0 .. nb_dev-1 is used by exactly one device.
 *
 * Returns 0 when all checks pass, 1 as soon as any check fails.
 */
int md_check_ordering(mddev_t *mddev)
{
	int nr, hits;
	mdk_rdev_t *rdev;
	struct list_head *tmp;

	/* 1. every device has to be fully functional */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			printk(KERN_ERR "md: md%d's device %s faulty, aborting.\n",
				mdidx(mddev), partition_name(rdev->dev));
			return 1;
		}
	}

	/* 2. the list length must agree with the cached device count */
	hits = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		hits++;
	}
	if (hits != mddev->nb_dev) {
		MD_BUG();
		return 1;
	}

	/* 3. the array must have exactly as many disks as it asks for */
	if (mddev->nb_dev != mddev->sb->raid_disks) {
		printk(KERN_ERR "md: md%d, array needs %d disks, has %d, aborting.\n",
			mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
		return 1;
	}

	/* 4. each slot number must be claimed exactly once */
	for (nr = 0; nr < mddev->nb_dev; nr++) {
		hits = 0;
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (rdev->desc_nr == nr)
				hits++;
		}
		if (hits == 1)
			continue;

		if (hits == 0)
			printk(KERN_ERR "md: md%d, missing disk #%d, aborting.\n",
				mdidx(mddev), nr);
		else
			printk(KERN_ERR "md: md%d, too many disks #%d, aborting.\n",
				mdidx(mddev), nr);
		return 1;
	}

	return 0;
}
static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
{
if (disk_active(disk)) {
......@@ -618,8 +571,7 @@ static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
list_add(&rdev->same_set, &mddev->disks);
rdev->mddev = mddev;
mddev->nb_dev++;
printk(KERN_INFO "md: bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev));
}
static void unbind_rdev_from_array(mdk_rdev_t * rdev)
......@@ -628,11 +580,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
MD_BUG();
return;
}
list_del(&rdev->same_set);
INIT_LIST_HEAD(&rdev->same_set);
rdev->mddev->nb_dev--;
printk(KERN_INFO "md: unbind<%s,%d>\n", partition_name(rdev->dev),
rdev->mddev->nb_dev);
list_del_init(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev));
rdev->mddev = NULL;
}
......@@ -682,13 +631,11 @@ static void export_rdev(mdk_rdev_t * rdev)
MD_BUG();
unlock_rdev(rdev);
free_disk_sb(rdev);
list_del(&rdev->all);
INIT_LIST_HEAD(&rdev->all);
if (rdev->pending.next != &rdev->pending) {
list_del_init(&rdev->all);
if (!list_empty(&rdev->pending)) {
printk(KERN_INFO "md: (%s was pending)\n",
partition_name(rdev->dev));
list_del(&rdev->pending);
INIT_LIST_HEAD(&rdev->pending);
list_del_init(&rdev->pending);
}
#ifndef MODULE
md_autodetect_dev(rdev->dev);
......@@ -722,7 +669,7 @@ static void export_array(mddev_t *mddev)
}
kick_rdev_from_array(rdev);
}
if (mddev->nb_dev)
if (!list_empty(&mddev->disks))
MD_BUG();
}
......@@ -736,21 +683,6 @@ static void free_mddev(mddev_t *mddev)
export_array(mddev);
md_size[mdidx(mddev)] = 0;
md_hd_struct[mdidx(mddev)].nr_sects = 0;
/*
* Make sure nobody else is using this mddev
* (careful, we rely on the global kernel lock here)
*/
while (atomic_read(&mddev->resync_sem.count) != 1)
schedule();
while (atomic_read(&mddev->recovery_sem.count) != 1)
schedule();
del_mddev_mapping(mddev, mk_kdev(MD_MAJOR, mdidx(mddev)));
list_del(&mddev->all_mddevs);
INIT_LIST_HEAD(&mddev->all_mddevs);
kfree(mddev);
MOD_DEC_USE_COUNT;
}
#undef BAD_CSUM
......@@ -892,12 +824,10 @@ static mdk_rdev_t * find_rdev_all(kdev_t dev)
struct list_head *tmp;
mdk_rdev_t *rdev;
tmp = all_raid_disks.next;
while (tmp != &all_raid_disks) {
list_for_each(tmp, &all_raid_disks) {
rdev = list_entry(tmp, mdk_rdev_t, all);
if (kdev_same(rdev->dev, dev))
return rdev;
tmp = tmp->next;
}
return NULL;
}
......@@ -993,12 +923,13 @@ static int sync_sbs(mddev_t * mddev)
return 0;
}
int md_update_sb(mddev_t * mddev)
void __md_update_sb(mddev_t * mddev)
{
int err, count = 100;
struct list_head *tmp;
mdk_rdev_t *rdev;
mddev->sb_dirty = 0;
repeat:
mddev->sb->utime = CURRENT_TIME;
if (!(++mddev->sb->events_lo))
......@@ -1020,7 +951,7 @@ int md_update_sb(mddev_t * mddev)
* nonpersistent superblocks
*/
if (mddev->sb->not_persistent)
return 0;
return;
printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
mdidx(mddev));
......@@ -1048,9 +979,18 @@ int md_update_sb(mddev_t * mddev)
}
printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
}
return 0;
}
/*
 * Locked wrapper around __md_update_sb().
 *
 * Takes the mddev lock with mddev_lock(); if that fails, nothing is done.
 * With the lock held, __md_update_sb() is called only when
 * mddev->sb_dirty is set.  The lock is released before returning.
 */
void md_update_sb(mddev_t *mddev)
{
	if (mddev_lock(mddev) == 0) {
		if (mddev->sb_dirty)
			__md_update_sb(mddev);
		mddev_unlock(mddev);
	}
}
/*
* Import a device. If 'on_disk', then sanity check the superblock
*
......@@ -1122,6 +1062,7 @@ static int md_import_device(kdev_t newdev, int on_disk)
}
list_add(&rdev->all, &all_raid_disks);
INIT_LIST_HEAD(&rdev->pending);
INIT_LIST_HEAD(&rdev->same_set);
if (rdev->faulty && rdev->sb)
free_disk_sb(rdev);
......@@ -1574,7 +1515,6 @@ static int device_size_calculation(mddev_t * mddev)
if (sb->level == -3)
readahead = 0;
}
md_maxreadahead[mdidx(mddev)] = readahead;
printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
mdidx(mddev), readahead*(PAGE_SIZE/1024));
......@@ -1605,7 +1545,7 @@ static int do_md_run(mddev_t * mddev)
mdk_rdev_t *rdev;
if (!mddev->nb_dev) {
if (list_empty(&mddev->disks)) {
MD_BUG();
return -EINVAL;
}
......@@ -1630,9 +1570,6 @@ static int do_md_run(mddev_t * mddev)
chunk_size = mddev->sb->chunk_size;
pnum = level_to_pers(mddev->sb->level);
mddev->param.chunk_size = chunk_size;
mddev->param.personality = pnum;
if ((pnum != MULTIPATH) && (pnum != RAID1)) {
if (!chunk_size) {
/*
......@@ -1712,6 +1649,9 @@ static int do_md_run(mddev_t * mddev)
}
mddev->pers = pers[pnum];
blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
mddev->queue.queuedata = mddev;
err = mddev->pers->run(mddev);
if (err) {
printk(KERN_ERR "md: pers->run() failed ...\n");
......@@ -1719,9 +1659,15 @@ static int do_md_run(mddev_t * mddev)
return -EINVAL;
}
mddev->sb->state &= ~(1 << MD_SB_CLEAN);
md_update_sb(mddev);
mddev->in_sync = (mddev->sb->state & (1<<MD_SB_CLEAN));
/* if personality doesn't have "sync_request", then
* a dirty array doesn't mean anything
*/
if (mddev->pers->sync_request)
mddev->sb->state &= ~(1 << MD_SB_CLEAN);
__md_update_sb(mddev);
md_recover_arrays();
/*
* md_size has units of 1K blocks, which are
* twice as large as sectors.
......@@ -1736,21 +1682,21 @@ static int do_md_run(mddev_t * mddev)
#undef TOO_BIG_CHUNKSIZE
#undef BAD_CHUNKSIZE
#define OUT(x) do { err = (x); goto out; } while (0)
static int restart_array(mddev_t *mddev)
{
int err = 0;
int err;
/*
* Complain if it has no devices
*/
if (!mddev->nb_dev)
OUT(-ENXIO);
err = -ENXIO;
if (list_empty(&mddev->disks))
goto out;
if (mddev->pers) {
err = -EBUSY;
if (!mddev->ro)
OUT(-EBUSY);
goto out;
mddev->ro = 0;
set_device_ro(mddev_to_kdev(mddev), 0);
......@@ -1761,8 +1707,7 @@ static int restart_array(mddev_t *mddev)
* Kick recovery or resync if necessary
*/
md_recover_arrays();
if (mddev->pers->restart_resync)
mddev->pers->restart_resync(mddev);
err = 0;
} else {
printk(KERN_ERR "md: md%d has no personality assigned.\n",
mdidx(mddev));
......@@ -1780,49 +1725,43 @@ static int restart_array(mddev_t *mddev)
static int do_md_stop(mddev_t * mddev, int ro)
{
int err = 0, resync_interrupted = 0;
int err = 0;
kdev_t dev = mddev_to_kdev(mddev);
if (atomic_read(&mddev->active)>1) {
printk(STILL_IN_USE, mdidx(mddev));
OUT(-EBUSY);
err = -EBUSY;
goto out;
}
if (mddev->pers) {
/*
* It is safe to call stop here, it only frees private
* data. Also, it tells us if a device is unstoppable
* (eg. resyncing is in progress)
*/
if (mddev->pers->stop_resync)
if (mddev->pers->stop_resync(mddev))
resync_interrupted = 1;
if (mddev->recovery_running)
md_interrupt_thread(md_recovery_thread);
/*
* This synchronizes with signal delivery to the
* resync or reconstruction thread. It also nicely
* hangs the process if some reconstruction has not
* finished.
*/
down(&mddev->recovery_sem);
up(&mddev->recovery_sem);
if (mddev->sync_thread) {
if (mddev->recovery_running > 0)
mddev->recovery_running = -EINTR;
md_unregister_thread(mddev->sync_thread);
mddev->sync_thread = NULL;
if (mddev->spare) {
mddev->pers->diskop(mddev, &mddev->spare,
DISKOP_SPARE_INACTIVE);
mddev->spare = NULL;
}
}
invalidate_device(dev, 1);
if (ro) {
err = -ENXIO;
if (mddev->ro)
OUT(-ENXIO);
goto out;
mddev->ro = 1;
} else {
if (mddev->ro)
set_device_ro(dev, 0);
if (mddev->pers->stop(mddev)) {
err = -EBUSY;
if (mddev->ro)
set_device_ro(dev, 1);
OUT(-EBUSY);
goto out;
}
if (mddev->ro)
mddev->ro = 0;
......@@ -1832,11 +1771,11 @@ static int do_md_stop(mddev_t * mddev, int ro)
* mark it clean only if there was no resync
* interrupted.
*/
if (!mddev->recovery_running && !resync_interrupted) {
if (mddev->in_sync) {
printk(KERN_INFO "md: marking sb clean...\n");
mddev->sb->state |= 1 << MD_SB_CLEAN;
}
md_update_sb(mddev);
__md_update_sb(mddev);
}
if (ro)
set_device_ro(dev, 1);
......@@ -1848,15 +1787,13 @@ static int do_md_stop(mddev_t * mddev, int ro)
if (!ro) {
printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
free_mddev(mddev);
} else
printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
err = 0;
out:
return err;
}
#undef OUT
/*
* We have to safely support old arrays too.
*/
......@@ -1877,7 +1814,7 @@ static void autorun_array(mddev_t *mddev)
struct list_head *tmp;
int err;
if (mddev->disks.prev == &mddev->disks) {
if (list_empty(&mddev->disks)) {
MD_BUG();
return;
}
......@@ -1912,17 +1849,15 @@ static void autorun_array(mddev_t *mddev)
*
* If "unit" is allocated, then bump its reference count
*/
static void autorun_devices(kdev_t countdev)
static void autorun_devices(void)
{
struct list_head candidates;
struct list_head *tmp;
mdk_rdev_t *rdev0, *rdev;
mddev_t *mddev;
kdev_t md_kdev;
printk(KERN_INFO "md: autorun ...\n");
while (pending_raid_disks.next != &pending_raid_disks) {
while (!list_empty(&pending_raid_disks)) {
rdev0 = list_entry(pending_raid_disks.next,
mdk_rdev_t, pending);
......@@ -1946,29 +1881,34 @@ static void autorun_devices(kdev_t countdev)
* mostly sane superblocks. It's time to allocate the
* mddev.
*/
md_kdev = mk_kdev(MD_MAJOR, rdev0->sb->md_minor);
mddev = kdev_to_mddev(md_kdev);
if (mddev) {
printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
mdidx(mddev), partition_name(rdev0->dev));
ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
export_rdev(rdev);
continue;
}
mddev = alloc_mddev(md_kdev);
mddev = mddev_find(rdev0->sb->md_minor);
if (!mddev) {
printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
break;
}
if (kdev_same(md_kdev, countdev))
atomic_inc(&mddev->active);
printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
bind_rdev_to_array(rdev, mddev);
list_del(&rdev->pending);
INIT_LIST_HEAD(&rdev->pending);
if (mddev_lock(mddev))
printk(KERN_WARNING "md: md%d locked, cannot run\n",
mdidx(mddev));
else if (mddev->sb || !list_empty(&mddev->disks)) {
printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
mdidx(mddev), partition_name(rdev0->dev));
mddev_unlock(mddev);
} else {
printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
bind_rdev_to_array(rdev, mddev);
list_del_init(&rdev->pending);
}
autorun_array(mddev);
mddev_unlock(mddev);
}
autorun_array(mddev);
/* on success, candidates will be empty, on error
* it won't...
*/
ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
export_rdev(rdev);
mddev_put(mddev);
}
printk(KERN_INFO "md: ... autorun DONE.\n");
}
......@@ -2005,7 +1945,7 @@ static void autorun_devices(kdev_t countdev)
#define AUTORUNNING KERN_INFO \
"md: auto-running md%d.\n"
static int autostart_array(kdev_t startdev, kdev_t countdev)
static int autostart_array(kdev_t startdev)
{
int err = -EINVAL, i;
mdp_super_t *sb = NULL;
......@@ -2065,7 +2005,7 @@ static int autostart_array(kdev_t startdev, kdev_t countdev)
/*
* possibly return codes
*/
autorun_devices(countdev);
autorun_devices();
return 0;
abort:
......@@ -2191,7 +2131,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
MD_BUG();
return -EINVAL;
}
if (mddev->nb_dev) {
if (!list_empty(&mddev->disks)) {
mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
mdk_rdev_t, same_set);
if (!uuid_equal(rdev0, rdev)) {
......@@ -2346,8 +2286,7 @@ static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
remove_descriptor(disk, mddev->sb);
kick_rdev_from_array(rdev);
mddev->sb_dirty = 1;
md_update_sb(mddev);
__md_update_sb(mddev);
return 0;
busy:
......@@ -2458,9 +2397,7 @@ static int hot_add_disk(mddev_t * mddev, kdev_t dev)
mddev->sb->spare_disks++;
mddev->sb->working_disks++;
mddev->sb_dirty = 1;
md_update_sb(mddev);
__md_update_sb(mddev);
/*
* Kick recovery, maybe this spare has to be added to the
......@@ -2520,36 +2457,6 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
}
#undef SET_SB
/*
 * Placeholder: not implemented.  Logs "md: not yet" and always returns
 * -EINVAL; neither argument is used.
 *
 * The message is newline-terminated so the next printk record starts on
 * its own line.
 */
static int set_disk_info(mddev_t * mddev, void * arg)
{
	printk(KERN_INFO "md: not yet\n");
	return -EINVAL;
}
/*
 * Placeholder: not implemented.  Logs "md: not yet" and always returns
 * -EINVAL; the argument is not used.
 *
 * The message is newline-terminated so the next printk record starts on
 * its own line.
 */
static int clear_array(mddev_t * mddev)
{
	printk(KERN_INFO "md: not yet\n");
	return -EINVAL;
}
/*
 * Placeholder: not implemented.  Logs "md: not yet" and always returns
 * -EINVAL; the argument is not used.
 *
 * The message is newline-terminated so the next printk record starts on
 * its own line.
 */
static int write_raid_info(mddev_t * mddev)
{
	printk(KERN_INFO "md: not yet\n");
	return -EINVAL;
}
/*
 * Placeholder: not implemented.  Logs "md: not yet" and always returns
 * -EINVAL; the argument is not used.
 *
 * The message is newline-terminated so the next printk record starts on
 * its own line.
 */
static int protect_array(mddev_t * mddev)
{
	printk(KERN_INFO "md: not yet\n");
	return -EINVAL;
}
/*
 * Placeholder: not implemented.  Logs "md: not yet" and always returns
 * -EINVAL; the argument is not used.
 *
 * The message is newline-terminated so the next printk record starts on
 * its own line.
 */
static int unprotect_array(mddev_t * mddev)
{
	printk(KERN_INFO "md: not yet\n");
	return -EINVAL;
}
static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
{
mdk_rdev_t *rdev;
......@@ -2595,7 +2502,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
case PRINT_RAID_DEBUG:
err = 0;
md_print_devices();
goto done_unlock;
goto done;
#ifndef MODULE
case RAID_AUTORUN:
......@@ -2632,40 +2539,30 @@ static int md_ioctl(struct inode *inode, struct file *file,
* Commands creating/starting a new array:
*/
mddev = kdev_to_mddev(dev);
mddev = inode->i_bdev->bd_inode->u.generic_ip;
switch (cmd)
{
case SET_ARRAY_INFO:
case START_ARRAY:
if (mddev) {
printk(KERN_WARNING "md: array md%d already exists!\n",
mdidx(mddev));
err = -EEXIST;
goto abort;
}
default:;
if (!mddev) {
BUG();
goto abort;
}
err = mddev_lock(mddev);
if (err) {
printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",
err, cmd);
goto abort;
}
switch (cmd)
{
case SET_ARRAY_INFO:
mddev = alloc_mddev(dev);
if (!mddev) {
err = -ENOMEM;
goto abort;
}
atomic_inc(&mddev->active);
/*
* alloc_mddev() should possibly self-lock.
*/
err = lock_mddev(mddev);
if (err) {
printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
err, cmd);
goto abort;
if (!list_empty(&mddev->disks)) {
printk(KERN_WARNING "md: array md%d already has disks!\n",
mdidx(mddev));
err = -EBUSY;
goto abort_unlock;
}
if (mddev->sb) {
printk(KERN_WARNING "md: array md%d already has a superblock!\n",
mdidx(mddev));
......@@ -2690,13 +2587,13 @@ static int md_ioctl(struct inode *inode, struct file *file,
/*
* possibly make it lock the array ...
*/
err = autostart_array(val_to_kdev(arg), dev);
err = autostart_array(val_to_kdev(arg));
if (err) {
printk(KERN_WARNING "md: autostart %s failed!\n",
partition_name(val_to_kdev(arg)));
goto abort;
goto abort_unlock;
}
goto done;
goto done_unlock;
default:;
}
......@@ -2704,16 +2601,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
/*
* Commands querying/configuring an existing array:
*/
if (!mddev) {
err = -ENODEV;
goto abort;
}
err = lock_mddev(mddev);
if (err) {
printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
goto abort;
}
/* if we don't have a superblock yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
err = -ENODEV;
......@@ -2738,8 +2625,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
goto done_unlock;
case STOP_ARRAY:
if (!(err = do_md_stop (mddev, 0)))
mddev = NULL;
err = do_md_stop (mddev, 0);
goto done_unlock;
case STOP_ARRAY_RO:
......@@ -2784,10 +2670,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
switch (cmd)
{
case CLEAR_ARRAY:
err = clear_array(mddev);
goto done_unlock;
case ADD_NEW_DISK:
{
mdu_disk_info_t info;
......@@ -2808,35 +2690,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
err = hot_add_disk(mddev, val_to_kdev(arg));
goto done_unlock;
case SET_DISK_INFO:
err = set_disk_info(mddev, (void *)arg);
goto done_unlock;
case WRITE_RAID_INFO:
err = write_raid_info(mddev);
goto done_unlock;
case UNPROTECT_ARRAY:
err = unprotect_array(mddev);
goto done_unlock;
case PROTECT_ARRAY:
err = protect_array(mddev);
goto done_unlock;
case SET_DISK_FAULTY:
err = set_disk_faulty(mddev, val_to_kdev(arg));
goto done_unlock;
case RUN_ARRAY:
{
/* The data is never used....
mdu_param_t param;
err = copy_from_user(&param, (mdu_param_t *)arg,
sizeof(param));
if (err)
goto abort_unlock;
*/
err = do_md_run (mddev);
/*
* we have to clean up the mess if
......@@ -2845,8 +2704,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
*/
if (err) {
mddev->sb_dirty = 0;
if (!do_md_stop (mddev, 0))
mddev = NULL;
do_md_stop (mddev, 0);
}
goto done_unlock;
}
......@@ -2861,8 +2719,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
done_unlock:
abort_unlock:
if (mddev)
unlock_mddev(mddev);
mddev_unlock(mddev);
return err;
done:
......@@ -2875,19 +2732,34 @@ static int md_ioctl(struct inode *inode, struct file *file,
static int md_open(struct inode *inode, struct file *file)
{
/*
* Always succeed, but increment the usage count
* Succeed if we can find or allocate a mddev structure.
*/
mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
if (mddev)
atomic_inc(&mddev->active);
return (0);
mddev_t *mddev = mddev_find(minor(inode->i_rdev));
int err = -ENOMEM;
if (!mddev)
goto out;
if ((err = mddev_lock(mddev)))
goto put;
err = 0;
mddev_unlock(mddev);
inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
put:
mddev_put(mddev);
out:
return err;
}
static int md_release(struct inode *inode, struct file * file)
{
mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
if (mddev)
atomic_dec(&mddev->active);
mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;
if (!mddev)
BUG();
mddev_put(mddev);
return 0;
}
......@@ -2918,6 +2790,7 @@ int md_thread(void * arg)
*/
daemonize();
reparent_to_init();
sprintf(current->comm, thread->name);
current->exit_signal = SIGCHLD;
......@@ -2941,17 +2814,10 @@ int md_thread(void * arg)
complete(thread->event);
while (thread->run) {
void (*run)(void *data);
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&thread->wqueue, &wait);
set_task_state(current, TASK_INTERRUPTIBLE);
if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
dprintk("md: thread %p went to sleep.\n", thread);
schedule();
dprintk("md: thread %p woke up.\n", thread);
}
current->state = TASK_RUNNING;
remove_wait_queue(&thread->wqueue, &wait);
wait_event_interruptible(thread->wqueue,
test_bit(THREAD_WAKEUP, &thread->flags));
clear_bit(THREAD_WAKEUP, &thread->flags);
run = thread->run;
......@@ -3026,7 +2892,7 @@ void md_unregister_thread(mdk_thread_t *thread)
kfree(thread);
}
void md_recover_arrays(void)
static void md_recover_arrays(void)
{
if (!md_recovery_thread) {
MD_BUG();
......@@ -3042,7 +2908,7 @@ int md_error(mddev_t *mddev, struct block_device *bdev)
kdev_t rdev = to_kdev_t(bdev->bd_dev);
dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
major(dev),minor(dev),major(rdev),minor(rdev),
MD_MAJOR,mdidx(mddev),major(rdev),minor(rdev),
__builtin_return_address(0),__builtin_return_address(1),
__builtin_return_address(2),__builtin_return_address(3));
......@@ -3055,17 +2921,14 @@ int md_error(mddev_t *mddev, struct block_device *bdev)
return 0;
if (!mddev->pers->error_handler
|| mddev->pers->error_handler(mddev,rdev) <= 0) {
free_disk_sb(rrdev);
rrdev->faulty = 1;
} else
return 1;
/*
* if recovery was running, stop it now.
*/
if (mddev->pers->stop_resync)
mddev->pers->stop_resync(mddev);
if (mddev->recovery_running)
md_interrupt_thread(md_recovery_thread);
if (mddev->recovery_running)
mddev->recovery_running = -EIO;
md_recover_arrays();
return 0;
......@@ -3080,7 +2943,7 @@ static int status_unused(char * page)
sz += sprintf(page + sz, "unused devices: ");
ITERATE_RDEV_ALL(rdev,tmp) {
if (!rdev->same_set.next && !rdev->same_set.prev) {
if (list_empty(&rdev->same_set)) {
/*
* The device is not yet used by any array.
*/
......@@ -3123,18 +2986,9 @@ static int status_resync(char * page, mddev_t * mddev)
sz += sprintf(page + sz, ".");
sz += sprintf(page + sz, "] ");
}
if (!mddev->recovery_running)
/*
* true resync
*/
sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)",
res/10, res % 10, resync, max_blocks);
else
/*
* recovery ...
*/
sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)",
res/10, res % 10, resync, max_blocks);
sz += sprintf(page + sz, " %s =%3lu.%lu%% (%lu/%lu)",
(mddev->spare ? "recovery" : "resync"),
res/10, res % 10, resync, max_blocks);
/*
* We do not want to overflow, so the order of operands and
......@@ -3172,7 +3026,7 @@ static int md_status_read_proc(char *page, char **start, off_t off,
sz += sprintf(page+sz, "\n");
ITERATE_MDDEV(mddev,tmp) {
ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
mddev->pers ? "" : "in");
if (mddev->pers) {
......@@ -3192,7 +3046,7 @@ static int md_status_read_proc(char *page, char **start, off_t off,
size += rdev->size;
}
if (mddev->nb_dev) {
if (!list_empty(&mddev->disks)) {
if (mddev->pers)
sz += sprintf(page + sz, "\n %d blocks",
md_size[mdidx(mddev)]);
......@@ -3202,19 +3056,20 @@ static int md_status_read_proc(char *page, char **start, off_t off,
if (!mddev->pers) {
sz += sprintf(page+sz, "\n");
mddev_unlock(mddev);
continue;
}
sz += mddev->pers->status (page+sz, mddev);
sz += sprintf(page+sz, "\n ");
if (mddev->curr_resync) {
if (mddev->curr_resync > 1)
sz += status_resync (page+sz, mddev);
} else {
if (atomic_read(&mddev->resync_sem.count) != 1)
else if (mddev->curr_resync == 1)
sz += sprintf(page + sz, " resync=DELAYED");
}
sz += sprintf(page + sz, "\n");
mddev_unlock(mddev);
}
sz += status_unused(page + sz);
......@@ -3315,60 +3170,70 @@ static int is_mddev_idle(mddev_t *mddev)
return idle;
}
DECLARE_WAIT_QUEUE_HEAD(resync_wait);
/*
 * Account for completed sync I/O.
 *
 * @mddev:  array the I/O belonged to
 * @blocks: number of 512-byte blocks that have finished; subtracted from
 *          mddev->recovery_active
 * @ok:     zero if the I/O failed
 *
 * Waiters on mddev->recovery_wait are always woken.  On failure,
 * recovery_running is set to -EIO and md_recover_arrays() is called.
 */
void md_done_sync(mddev_t *mddev, int blocks, int ok)
{
	/* "blocks" more 512-byte blocks have finished syncing */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);

	if (ok)
		return;

	/* stop recovery, signal do_sync .... */
	mddev->recovery_running = -EIO;
	md_recover_arrays();
}
DECLARE_WAIT_QUEUE_HEAD(resync_wait);
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
static void md_do_sync(void *data)
{
mddev_t *mddev = data;
mddev_t *mddev2;
unsigned int max_sectors, currspeed = 0,
j, window, err, serialize;
j, window, err;
unsigned long mark[SYNC_MARKS];
unsigned long mark_cnt[SYNC_MARKS];
int last_mark,m;
struct list_head *tmp;
unsigned long last_check;
/* just in case the thread restarts... */
if (mddev->recovery_running <= 0)
return;
err = down_interruptible(&mddev->resync_sem);
if (err)
goto out_nolock;
/* we overload curr_resync somewhat here.
* 0 == not engaged in resync at all
* 2 == checking that there is no conflict with another sync
* 1 == like 2, but have yielded to allow conflicting resync to
* commence
* other == active in resync - this many blocks
*/
do {
mddev->curr_resync = 2;
recheck:
serialize = 0;
ITERATE_MDDEV(mddev2,tmp) {
if (mddev2 == mddev)
continue;
if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
printk(KERN_INFO "md: delaying resync of md%d until md%d "
"has finished resync (they share one or more physical units)\n",
mdidx(mddev), mdidx(mddev2));
serialize = 1;
break;
}
}
if (serialize) {
interruptible_sleep_on(&resync_wait);
if (signal_pending(current)) {
flush_curr_signals();
err = -EINTR;
goto out;
ITERATE_MDDEV(mddev2,tmp) {
if (mddev2 == mddev)
continue;
if (mddev2->curr_resync &&
match_mddev_units(mddev,mddev2)) {
printk(KERN_INFO "md: delaying resync of md%d until md%d "
"has finished resync (they share one or more physical units)\n",
mdidx(mddev), mdidx(mddev2));
if (mddev < mddev2) /* arbitrarily yield */
mddev->curr_resync = 1;
if (wait_event_interruptible(resync_wait,
mddev2->curr_resync < 2)) {
flush_curr_signals();
err = -EINTR;
mddev_put(mddev2);
goto out;
}
}
}
goto recheck;
}
} while (mddev->curr_resync < 2);
mddev->curr_resync = 1;
max_sectors = mddev->sb->size << 1;
printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
......@@ -3406,7 +3271,7 @@ int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
}
atomic_add(sectors, &mddev->recovery_active);
j += sectors;
mddev->curr_resync = j;
if (j>1) mddev->curr_resync = j;
if (last_check + window > j)
continue;
......@@ -3432,7 +3297,6 @@ int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
/*
* got a signal, exit.
*/
mddev->curr_resync = 0;
printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
flush_curr_signals();
err = -EINTR;
......@@ -3467,106 +3331,116 @@ int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
*/
out:
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
up(&mddev->resync_sem);
out_nolock:
/* tell personality that we are finished */
mddev->pers->sync_request(mddev, max_sectors, 1);
mddev->curr_resync = 0;
wake_up(&resync_wait);
return err;
if (err)
mddev->recovery_running = err;
if (mddev->recovery_running > 0)
mddev->recovery_running = 0;
if (mddev->recovery_running == 0)
mddev->in_sync = 1;
md_recover_arrays();
}
/*
* This is a kernel thread which syncs a spare disk with the active array
*
* the amount of foolproofing might seem to be a tad excessive, but an
* early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
* of my root partition with the first 0.5 gigs of my /home partition ... so
* i'm a bit nervous ;)
* This is the kernel thread that watches all md arrays for re-sync action
* that might be needed.
* It does not do any resync itself, but rather "forks" off other threads
* to do that as needed.
* When it is determined that resync is needed, we set "->recovery_running" and
* create a thread at ->sync_thread.
* When the thread finishes it clears recovery_running (or sets an error)
* and wakes up this thread, which will reap the thread and finish up.
*/
void md_do_recovery(void *data)
{
int err;
mddev_t *mddev;
mdp_super_t *sb;
mdp_disk_t *spare;
struct list_head *tmp;
printk(KERN_INFO "md: recovery thread got woken up ...\n");
restart:
ITERATE_MDDEV(mddev,tmp) {
dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
sb = mddev->sb;
if (!sb)
continue;
if (mddev->recovery_running)
continue;
if (sb->active_disks == sb->raid_disks)
continue;
if (!sb->spare_disks) {
printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
"-- continuing in degraded mode\n", mdidx(mddev));
continue;
}
/*
* now here we get the spare and resync it.
*/
spare = get_spare(mddev);
if (!spare)
continue;
printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
mdidx(mddev), partition_name(mk_kdev(spare->major,spare->minor)));
if (!mddev->pers->diskop)
continue;
if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
continue;
down(&mddev->recovery_sem);
mddev->recovery_running = 1;
err = md_do_sync(mddev, spare);
if (err == -EIO) {
printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
mdidx(mddev), partition_name(mk_kdev(spare->major,spare->minor)));
if (!disk_faulty(spare)) {
mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
mark_disk_faulty(spare);
mark_disk_nonsync(spare);
mark_disk_inactive(spare);
sb->spare_disks--;
sb->working_disks--;
sb->failed_disks++;
if (!sb || !mddev->pers || !mddev->pers->diskop || mddev->ro)
goto unlock;
if (mddev->recovery_running > 0)
/* resync/recovery still happening */
goto unlock;
if (mddev->sync_thread) {
/* resync has finished, collect result */
md_unregister_thread(mddev->sync_thread);
mddev->sync_thread = NULL;
if (mddev->recovery_running < 0) {
/* some sort of failure.
* If we were doing a reconstruction,
* we need to retrieve the spare
*/
if (mddev->spare) {
mddev->pers->diskop(mddev, &mddev->spare,
DISKOP_SPARE_INACTIVE);
mddev->spare = NULL;
}
} else {
/* success...*/
if (mddev->spare) {
mddev->pers->diskop(mddev, &mddev->spare,
DISKOP_SPARE_ACTIVE);
mark_disk_sync(mddev->spare);
mark_disk_active(mddev->spare);
sb->active_disks++;
sb->spare_disks--;
mddev->spare = NULL;
}
}
} else
if (disk_faulty(spare))
mddev->pers->diskop(mddev, &spare,
DISKOP_SPARE_INACTIVE);
if (err == -EINTR || err == -ENOMEM) {
/*
* Recovery got interrupted, or ran out of mem ...
* signal back that we have finished using the array.
*/
mddev->pers->diskop(mddev, &spare,
DISKOP_SPARE_INACTIVE);
up(&mddev->recovery_sem);
__md_update_sb(mddev);
mddev->recovery_running = 0;
continue;
} else {
wake_up(&resync_wait);
goto unlock;
}
if (mddev->recovery_running) {
/* that's odd.. */
mddev->recovery_running = 0;
up(&mddev->recovery_sem);
wake_up(&resync_wait);
}
if (!disk_faulty(spare)) {
/*
* the SPARE_ACTIVE diskop possibly changes the
* pointer too
*/
mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
mark_disk_sync(spare);
mark_disk_active(spare);
sb->active_disks++;
sb->spare_disks--;
if (sb->active_disks < sb->raid_disks) {
mddev->spare = get_spare(mddev);
if (!mddev->spare)
printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
"-- continuing in degraded mode\n", mdidx(mddev));
else
printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
mdidx(mddev), partition_name(mk_kdev(mddev->spare->major,mddev->spare->minor)));
}
if (!mddev->spare && mddev->in_sync) {
/* nothing we can do ... */
goto unlock;
}
if (mddev->pers->sync_request) {
mddev->sync_thread = md_register_thread(md_do_sync,
mddev,
"md_resync");
if (!mddev->sync_thread) {
printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev));
if (mddev->spare)
mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_INACTIVE);
mddev->spare = NULL;
mddev->recovery_running = 0;
} else {
if (mddev->spare)
mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_WRITE);
mddev->recovery_running = 1;
md_wakeup_thread(mddev->sync_thread);
}
}
mddev->sb_dirty = 1;
md_update_sb(mddev);
goto restart;
unlock:
mddev_unlock(mddev);
}
printk(KERN_INFO "md: recovery thread finished ...\n");
dprintk(KERN_INFO "md: recovery thread finished ...\n");
}
......@@ -3582,7 +3456,8 @@ int md_notify_reboot(struct notifier_block *this,
return NOTIFY_DONE;
ITERATE_MDDEV(mddev,tmp)
do_md_stop (mddev, 1);
if (mddev_trylock(mddev)==0)
do_md_stop (mddev, 1);
/*
* certain more exotic SCSI devices are known to be
* volatile wrt too early system reboots. While the
......@@ -3606,7 +3481,6 @@ static void md_geninit(void)
for(i = 0; i < MAX_MD_DEVS; i++) {
md_size[i] = 0;
md_maxreadahead[i] = 32;
}
blk_size[MAJOR_NR] = md_size;
......@@ -3617,6 +3491,18 @@ static void md_geninit(void)
#endif
}
/*
 * Pick the request queue for the md device "dev": the array's own queue
 * when a personality is attached (mddev->pers set), otherwise the default
 * queue of the md major.  BUG()s if mddev_find() returns nothing or the
 * array's active count is below 2.
 */
request_queue_t * md_queue_proc(kdev_t dev)
{
	request_queue_t *queue;
	mddev_t *array = mddev_find(minor(dev));

	if (array == NULL || atomic_read(&array->active) < 2)
		BUG();

	queue = array->pers ? &array->queue : BLK_DEFAULT_QUEUE(MAJOR_NR);

	/* the caller must hold a reference... */
	mddev_put(array);
	return queue;
}
int __init md_init(void)
{
static char * name = "mdrecoveryd";
......@@ -3641,8 +3527,9 @@ int __init md_init(void)
S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
}
/* forward all md request to md_make_request */
blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request);
/* all requests on an uninitialised device get failed... */
blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
blk_dev[MAJOR_NR].queue = md_queue_proc;
add_gendisk(&md_gendisk);
......@@ -3720,7 +3607,7 @@ static void autostart_arrays(void)
}
dev_cnt = 0;
autorun_devices(to_kdev_t(-1));
autorun_devices();
}
static struct {
......@@ -3859,17 +3746,27 @@ void __init md_setup_drive(void)
if (!md_setup_args.device_set[minor])
continue;
if (mddev_map[minor].mddev) {
printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
mddev = mddev_find(minor);
if (!mddev) {
printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
continue;
}
if (mddev_lock(mddev)) {
printk(KERN_WARNING
"md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
"md: Ignoring md=%d, cannot lock!\n",
minor);
mddev_put(mddev);
continue;
}
printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
mddev = alloc_mddev(mk_kdev(MD_MAJOR,minor));
if (!mddev) {
printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
if (mddev->sb || !list_empty(&mddev->disks)) {
printk(KERN_WARNING
"md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
minor);
mddev_unlock(mddev);
mddev_put(mddev);
continue;
}
if (md_setup_args.pers[minor]) {
......@@ -3923,6 +3820,8 @@ void __init md_setup_drive(void)
do_md_stop(mddev, 0);
printk(KERN_WARNING "md: starting md%d failed\n", minor);
}
mddev_unlock(mddev);
mddev_put(mddev);
}
}
......@@ -3973,9 +3872,10 @@ int init_module(void)
static void free_device_names(void)
{
while (device_names.next != &device_names) {
struct list_head *tmp = device_names.next;
list_del(tmp);
while (!list_empty(&device_names)) {
struct dname *tmp = list_entry(device_names.next,
dev_name_t, list);
list_del(&tmp->list);
kfree(tmp);
}
}
......@@ -4006,10 +3906,8 @@ EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(partition_name);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_do_sync);
EXPORT_SYMBOL(md_sync_acct);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_recover_arrays);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_update_sb);
......@@ -4017,7 +3915,5 @@ EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_print_devices);
EXPORT_SYMBOL(find_rdev_nr);
EXPORT_SYMBOL(md_interrupt_thread);
EXPORT_SYMBOL(mddev_map);
EXPORT_SYMBOL(md_check_ordering);
EXPORT_SYMBOL(get_spare);
MODULE_LICENSE("GPL");
......@@ -244,27 +244,19 @@ static int multipath_read_balance (multipath_conf_t *conf)
return 0;
}
static int multipath_make_request (mddev_t *mddev, int rw, struct bio * bio)
static int multipath_make_request (request_queue_t *q, struct bio * bio)
{
mddev_t *mddev = q->queuedata;
multipath_conf_t *conf = mddev_to_conf(mddev);
struct bio *real_bio;
struct multipath_bh * mp_bh;
struct multipath_info *multipath;
/*
* make_request() can abort the operation when READA is being
* used and no empty request is available.
*
* Currently, just replace the command with READ/WRITE.
*/
if (rw == READA)
rw = READ;
mp_bh = multipath_alloc_mpbh (conf);
mp_bh->master_bio = bio;
mp_bh->mddev = mddev;
mp_bh->cmd = rw;
mp_bh->cmd = bio_data_dir(bio);
/*
* read balancing logic:
......@@ -273,7 +265,7 @@ static int multipath_make_request (mddev_t *mddev, int rw, struct bio * bio)
real_bio = bio_clone(bio, GFP_NOIO);
real_bio->bi_bdev = multipath->bdev;
real_bio->bi_rw = rw;
real_bio->bi_rw = bio_data_dir(bio);
real_bio->bi_end_io = multipath_end_request;
real_bio->bi_private = mp_bh;
mp_bh->bio = real_bio;
......@@ -708,7 +700,6 @@ static void multipathd (void *data)
mddev = mp_bh->mddev;
if (mddev->sb_dirty) {
printk(KERN_INFO "dirty sb detected, updating.\n");
mddev->sb_dirty = 0;
md_update_sb(mddev);
}
bio = mp_bh->bio;
......
......@@ -29,21 +29,26 @@
static int create_strip_zones (mddev_t *mddev)
{
int i, c, j, j1, j2;
int i, c, j;
unsigned long current_offset, curr_zone_offset;
raid0_conf_t *conf = mddev_to_conf(mddev);
mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
struct list_head *tmp1, *tmp2;
struct strip_zone *zone;
int cnt;
/*
* The number of 'same size groups'
*/
conf->nr_strip_zones = 0;
ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
ITERATE_RDEV(mddev,rdev1,tmp1) {
printk("raid0: looking at %s\n", partition_name(rdev1->dev));
c = 0;
ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
printk("raid0: comparing %s(%ld) with %s(%ld)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size);
ITERATE_RDEV(mddev,rdev2,tmp2) {
printk("raid0: comparing %s(%ld) with %s(%ld)\n",
partition_name(rdev1->dev), rdev1->size,
partition_name(rdev2->dev), rdev2->size);
if (rdev2 == rdev1) {
printk("raid0: END\n");
break;
......@@ -51,7 +56,7 @@ static int create_strip_zones (mddev_t *mddev)
if (rdev2->size == rdev1->size)
{
/*
* Not unique, dont count it as a new
* Not unique, don't count it as a new
* group
*/
printk("raid0: EQUAL\n");
......@@ -66,29 +71,62 @@ static int create_strip_zones (mddev_t *mddev)
printk("raid0: %d zones\n", conf->nr_strip_zones);
}
}
printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
conf->nr_strip_zones);
if (!conf->strip_zone)
return 1;
memset(conf->strip_zone, 0,sizeof(struct strip_zone)*
conf->nr_strip_zones);
/* The first zone must contain all devices, so here we check that
* there is a proper alignment of slots to devices and find them all
*/
zone = &conf->strip_zone[0];
cnt = 0;
smallest = NULL;
ITERATE_RDEV(mddev, rdev1, tmp1) {
int j = rdev1->sb->this_disk.raid_disk;
if (j < 0 || j >= mddev->sb->raid_disks) {
printk("raid0: bad disk number %d - aborting!\n", j);
goto abort;
}
if (zone->dev[j]) {
printk("raid0: multiple devices for %d - aborting!\n", j);
goto abort;
}
zone->dev[j] = rdev1;
if (!smallest || (rdev1->size <smallest->size))
smallest = rdev1;
cnt++;
}
if (cnt != mddev->sb->raid_disks) {
printk("raid0: too few disks (%d of %d) - aborting!\n", cnt,
mddev->sb->raid_disks);
goto abort;
}
zone->nb_dev = cnt;
zone->size = smallest->size * cnt;
zone->zone_offset = 0;
conf->smallest = NULL;
current_offset = 0;
curr_zone_offset = 0;
conf->smallest = zone;
current_offset = smallest->size;
curr_zone_offset = zone->size;
for (i = 0; i < conf->nr_strip_zones; i++)
/* now do the other zones */
for (i = 1; i < conf->nr_strip_zones; i++)
{
struct strip_zone *zone = conf->strip_zone + i;
zone = conf->strip_zone + i;
printk("raid0: zone %d\n", i);
zone->dev_offset = current_offset;
smallest = NULL;
c = 0;
ITERATE_RDEV_ORDERED(mddev,rdev,j) {
for (j=0; j<cnt; j++) {
rdev = conf->strip_zone[0].dev[j];
printk("raid0: checking %s ...", partition_name(rdev->dev));
if (rdev->size > current_offset)
{
......@@ -118,6 +156,9 @@ static int create_strip_zones (mddev_t *mddev)
}
printk("raid0: done.\n");
return 0;
abort:
vfree(conf->strip_zone);
return 1;
}
static int raid0_run (mddev_t *mddev)
......@@ -132,11 +173,6 @@ static int raid0_run (mddev_t *mddev)
goto out;
mddev->private = (void *)conf;
if (md_check_ordering(mddev)) {
printk("raid0: disks are not ordered, aborting!\n");
goto out_free_conf;
}
if (create_strip_zones (mddev))
goto out_free_conf;
......@@ -225,8 +261,9 @@ static int raid0_stop (mddev_t *mddev)
* Of course, those facts may not be valid anymore (and surely won't...)
* Hey guys, there's some work out there ;-)
*/
static int raid0_make_request (mddev_t *mddev, int rw, struct bio *bio)
static int raid0_make_request (request_queue_t *q, struct bio *bio)
{
mddev_t *mddev = q->queuedata;
unsigned int sect_in_chunk, chunksize_bits, chunk_size;
raid0_conf_t *conf = mddev_to_conf(mddev);
struct raid0_hash *hash;
......@@ -234,7 +271,7 @@ static int raid0_make_request (mddev_t *mddev, int rw, struct bio *bio)
mdk_rdev_t *tmp_dev;
unsigned long chunk, block, rsect;
chunk_size = mddev->param.chunk_size >> 10;
chunk_size = mddev->sb->chunk_size >> 10;
chunksize_bits = ffz(~chunk_size);
block = bio->bi_sector >> 1;
hash = conf->hash_table + block / conf->smallest->size;
......@@ -323,7 +360,7 @@ static int raid0_status (char *page, mddev_t *mddev)
conf->strip_zone[j].size);
}
#endif
sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024);
sz += sprintf(page + sz, " %dk chunks", mddev->sb->chunk_size/1024);
return sz;
}
......
......@@ -334,7 +334,7 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
* device if no resync is going on, or below the resync window.
* We take the first readable disk when above the resync window.
*/
if (conf->resync_mirrors && (this_sector + sectors >= conf->next_resync)) {
if (!conf->mddev->in_sync && (this_sector + sectors >= conf->next_resync)) {
/* make sure that disk is operational */
new_disk = 0;
while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) {
......@@ -434,8 +434,9 @@ static void resume_device(conf_t *conf)
spin_unlock_irq(&conf->resync_lock);
}
static int make_request(mddev_t *mddev, int rw, struct bio * bio)
static int make_request(request_queue_t *q, struct bio * bio)
{
mddev_t *mddev = q->queuedata;
conf_t *conf = mddev_to_conf(mddev);
mirror_info_t *mirror;
r1bio_t *r1_bio;
......@@ -456,20 +457,16 @@ static int make_request(mddev_t *mddev, int rw, struct bio * bio)
* make_request() can abort the operation when READA is being
* used and no empty request is available.
*
* Currently, just replace the command with READ.
*/
if (rw == READA)
rw = READ;
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
r1_bio->master_bio = bio;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector;
r1_bio->cmd = rw;
r1_bio->cmd = bio_data_dir(bio);
if (rw == READ) {
if (r1_bio->cmd == READ) {
/*
* read balancing logic:
*/
......@@ -483,7 +480,7 @@ static int make_request(mddev_t *mddev, int rw, struct bio * bio)
read_bio->bi_sector = r1_bio->sector;
read_bio->bi_bdev = mirror->bdev;
read_bio->bi_end_io = end_request;
read_bio->bi_rw = rw;
read_bio->bi_rw = r1_bio->cmd;
read_bio->bi_private = r1_bio;
generic_make_request(read_bio);
......@@ -507,7 +504,7 @@ static int make_request(mddev_t *mddev, int rw, struct bio * bio)
mbio->bi_sector = r1_bio->sector;
mbio->bi_bdev = conf->mirrors[i].bdev;
mbio->bi_end_io = end_request;
mbio->bi_rw = rw;
mbio->bi_rw = r1_bio->cmd;
mbio->bi_private = r1_bio;
sum_bios++;
......@@ -656,6 +653,9 @@ static void close_sync(conf_t *conf)
if (conf->barrier) BUG();
if (waitqueue_active(&conf->wait_idle)) BUG();
if (waitqueue_active(&conf->wait_resume)) BUG();
mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL;
}
static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
......@@ -772,7 +772,6 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
* Deactivate a spare disk:
*/
case DISKOP_SPARE_INACTIVE:
close_sync(conf);
sdisk = conf->mirrors + spare_disk;
sdisk->operational = 0;
sdisk->write_only = 0;
......@@ -785,7 +784,6 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
* property)
*/
case DISKOP_SPARE_ACTIVE:
close_sync(conf);
sdisk = conf->mirrors + spare_disk;
fdisk = conf->mirrors + failed_disk;
......@@ -919,10 +917,6 @@ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
}
abort:
spin_unlock_irq(&conf->device_lock);
if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) {
mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL;
}
print_conf(conf);
return err;
......@@ -1012,7 +1006,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
* we read from here, no need to write
*/
continue;
if (i < conf->raid_disks && !conf->resync_mirrors)
if (i < conf->raid_disks && mddev->in_sync)
/*
* don't need to write this we are just rebuilding
*/
......@@ -1088,7 +1082,6 @@ static void raid1d(void *data)
conf = mddev_to_conf(mddev);
if (mddev->sb_dirty) {
printk(KERN_INFO "raid1: dirty sb detected, updating.\n");
mddev->sb_dirty = 0;
md_update_sb(mddev);
}
bio = r1_bio->master_bio;
......@@ -1118,31 +1111,6 @@ static void raid1d(void *data)
spin_unlock_irqrestore(&retry_list_lock, flags);
}
/*
 * Private kernel thread that reconstructs the mirrors after an unclean
 * shutdown.
 *
 * Does nothing when resync_mirrors is 0 or 2.  Otherwise it runs
 * md_do_sync() under recovery_sem, clears resync_mirrors only if the sync
 * returned 0, and calls close_sync() in either case.
 */
static void raid1syncd(void *data)
{
	conf_t *conf = data;
	mddev_t *mddev = conf->mddev;

	if (conf->resync_mirrors == 0 || conf->resync_mirrors == 2)
		return;

	down(&mddev->recovery_sem);

	/* Only if everything went Ok. */
	if (md_do_sync(mddev, NULL) == 0)
		conf->resync_mirrors = 0;

	close_sync(conf);
	up(&mddev->recovery_sem);
}
static int init_resync(conf_t *conf)
{
......@@ -1177,9 +1145,16 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
sector_t max_sector, nr_sectors;
int disk, partial;
if (!sector_nr)
if (sector_nr == 0)
if (init_resync(conf))
return -ENOMEM;
max_sector = mddev->sb->size << 1;
if (sector_nr >= max_sector) {
close_sync(conf);
return 0;
}
/*
* If there is non-resync activity waiting for us then
* put in a delay to throttle resync.
......@@ -1216,10 +1191,6 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
r1_bio->sector = sector_nr;
r1_bio->cmd = SPECIAL;
max_sector = mddev->sb->size << 1;
if (sector_nr >= max_sector)
BUG();
bio = r1_bio->master_bio;
nr_sectors = RESYNC_BLOCK_SIZE >> 9;
if (max_sector - sector_nr < nr_sectors)
......@@ -1302,7 +1273,6 @@ static int run(mddev_t *mddev)
mdp_disk_t *descriptor;
mdk_rdev_t *rdev;
struct list_head *tmp;
int start_recovery = 0;
MOD_INC_USE_COUNT;
......@@ -1454,10 +1424,6 @@ static int run(mddev_t *mddev)
conf->last_used = j;
if (conf->working_disks != sb->raid_disks) {
printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
start_recovery = 1;
}
{
const char * name = "raid1d";
......@@ -1469,20 +1435,6 @@ static int run(mddev_t *mddev)
}
}
if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
(conf->working_disks > 1)) {
const char * name = "raid1syncd";
conf->resync_thread = md_register_thread(raid1syncd, conf, name);
if (!conf->resync_thread) {
printk(THREAD_ERROR, mdidx(mddev));
goto out_free_conf;
}
printk(START_RESYNC, mdidx(mddev));
conf->resync_mirrors = 1;
md_wakeup_thread(conf->resync_thread);
}
/*
* Regenerate the "device is in sync with the raid set" bit for
......@@ -1499,10 +1451,6 @@ static int run(mddev_t *mddev)
}
sb->active_disks = conf->working_disks;
if (start_recovery)
md_recover_arrays();
printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
/*
* Ok, everything is just fine now
......@@ -1522,47 +1470,12 @@ static int run(mddev_t *mddev)
return -EIO;
}
/*
 * If a resync thread exists and resync_mirrors is non-zero, set
 * resync_mirrors to 2, interrupt the thread and return 1.
 * Returns 0 in every other case.
 */
static int stop_resync(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);

	if (!conf->resync_thread || !conf->resync_mirrors)
		return 0;

	conf->resync_mirrors = 2;
	md_interrupt_thread(conf->resync_thread);
	printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
	return 1;
}
/*
 * If resync_mirrors is non-zero, set it back to 1, wake the resync thread
 * and return 1.  Returns 0 when resync_mirrors is zero, or (after
 * MD_BUG()) when it is non-zero but no resync thread exists.
 */
static int restart_resync(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);

	if (!conf->resync_mirrors)
		return 0;

	if (!conf->resync_thread) {
		MD_BUG();
		return 0;
	}

	conf->resync_mirrors = 1;
	md_wakeup_thread(conf->resync_thread);
	return 1;
}
static int stop(mddev_t *mddev)
{
conf_t *conf = mddev_to_conf(mddev);
int i;
md_unregister_thread(conf->thread);
if (conf->resync_thread)
md_unregister_thread(conf->resync_thread);
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
for (i = 0; i < MD_SB_DISKS; i++)
......@@ -1583,8 +1496,6 @@ static mdk_personality_t raid1_personality =
status: status,
error_handler: error,
diskop: diskop,
stop_resync: stop_resync,
restart_resync: restart_resync,
sync_request: sync_request
};
......
......@@ -634,7 +634,6 @@ static void copy_data(int frombio, struct bio *bio,
else
page_offset = (signed)(sector - bio->bi_sector) * -512;
bio_for_each_segment(bvl, bio, i) {
char *ba = __bio_kmap(bio, i);
int len = bio_iovec_idx(bio,i)->bv_len;
int clen;
int b_offset = 0;
......@@ -649,13 +648,16 @@ static void copy_data(int frombio, struct bio *bio,
clen = STRIPE_SIZE - page_offset;
else clen = len;
if (len > 0) {
if (clen > 0) {
char *ba = __bio_kmap(bio, i);
if (frombio)
memcpy(pa+page_offset, ba+b_offset, clen);
else
memcpy(ba+b_offset, pa+page_offset, clen);
}
__bio_kunmap(bio, i);
__bio_kunmap(bio, i);
}
if (clen < len) /* hit end of page */
break;
page_offset += len;
}
}
......@@ -810,6 +812,8 @@ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx,
spin_unlock_irq(&conf->device_lock);
spin_unlock(&sh->lock);
PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
if (forwrite) {
/* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
......@@ -823,8 +827,6 @@ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx,
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
}
PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
}
......@@ -1036,7 +1038,7 @@ static void handle_stripe(struct stripe_head *sh)
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (conf->disks[i].operational
/* && !(conf->resync_parity && i == sh->pd_idx) */
/* && !(!mddev->insync && i == sh->pd_idx) */
)
rmw++;
else rmw += 2*disks; /* cannot read it */
......@@ -1226,14 +1228,15 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
}
static void raid5_unplug_device(void *data)
{
raid5_conf_t *conf = (raid5_conf_t *)data;
request_queue_t *q = data;
mddev_t *mddev = q->queuedata;
raid5_conf_t *conf = mddev_to_conf(mddev);
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
raid5_activate_delayed(conf);
conf->plugged = 0;
if (blk_remove_plug(q))
raid5_activate_delayed(conf);
md_wakeup_thread(conf->thread);
spin_unlock_irqrestore(&conf->device_lock, flags);
......@@ -1242,31 +1245,21 @@ static void raid5_unplug_device(void *data)
static inline void raid5_plug_device(raid5_conf_t *conf)
{
spin_lock_irq(&conf->device_lock);
if (list_empty(&conf->delayed_list))
if (!conf->plugged) {
conf->plugged = 1;
queue_task(&conf->plug_tq, &tq_disk);
}
blk_plug_device(&conf->mddev->queue);
spin_unlock_irq(&conf->device_lock);
}
static int make_request (mddev_t *mddev, int rw, struct bio * bi)
static int make_request (request_queue_t *q, struct bio * bi)
{
raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
mddev_t *mddev = q->queuedata;
raid5_conf_t *conf = mddev_to_conf(mddev);
const unsigned int raid_disks = conf->raid_disks;
const unsigned int data_disks = raid_disks - 1;
unsigned int dd_idx, pd_idx;
sector_t new_sector;
sector_t logical_sector, last_sector;
int read_ahead = 0;
struct stripe_head *sh;
if (rw == READA) {
rw = READ;
read_ahead=1;
}
logical_sector = bi->bi_sector & ~(STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
......@@ -1281,10 +1274,10 @@ static int make_request (mddev_t *mddev, int rw, struct bio * bi)
PRINTK("raid5: make_request, sector %ul logical %ul\n",
new_sector, logical_sector);
sh = get_active_stripe(conf, new_sector, pd_idx, read_ahead);
sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
if (sh) {
add_stripe_bio(sh, bi, dd_idx, rw);
add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
raid5_plug_device(conf);
handle_stripe(sh);
......@@ -1311,6 +1304,10 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
int raid_disks = conf->raid_disks;
int data_disks = raid_disks-1;
if (sector_nr >= mddev->sb->size <<1)
/* just being told to finish up .. nothing to do */
return 0;
first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
......@@ -1343,17 +1340,15 @@ static void raid5d (void *data)
handled = 0;
if (mddev->sb_dirty) {
mddev->sb_dirty = 0;
if (mddev->sb_dirty)
md_update_sb(mddev);
}
spin_lock_irq(&conf->device_lock);
while (1) {
struct list_head *first;
if (list_empty(&conf->handle_list) &&
atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
!conf->plugged &&
!blk_queue_plugged(&mddev->queue) &&
!list_empty(&conf->delayed_list))
raid5_activate_delayed(conf);
......@@ -1382,31 +1377,6 @@ static void raid5d (void *data)
PRINTK("--- raid5d inactive\n");
}
/*
 * Private kernel thread for parity reconstruction after an unclean
 * shutdown.  Reconstruction on spare drives in case of a failed drive
 * is done by the generic mdsyncd.
 *
 * Does nothing when resync_parity is 0 or 2.  Otherwise it runs
 * md_do_sync() under recovery_sem and clears resync_parity only if the
 * sync returned 0.
 */
static void raid5syncd (void *data)
{
	raid5_conf_t *conf = data;
	mddev_t *mddev = conf->mddev;
	int err;

	if (conf->resync_parity == 0 || conf->resync_parity == 2)
		return;

	down(&mddev->recovery_sem);
	err = md_do_sync(mddev, NULL);
	if (!err)
		conf->resync_parity = 0;
	up(&mddev->recovery_sem);

	/* report the outcome only after the semaphore has been released */
	if (err)
		printk("raid5: resync aborted!\n");
	else
		printk("raid5: resync finished.\n");
}
static int run (mddev_t *mddev)
{
raid5_conf_t *conf;
......@@ -1416,7 +1386,6 @@ static int run (mddev_t *mddev)
mdk_rdev_t *rdev;
struct disk_info *disk;
struct list_head *tmp;
int start_recovery = 0;
MOD_INC_USE_COUNT;
......@@ -1444,10 +1413,7 @@ static int run (mddev_t *mddev)
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
conf->plugged = 0;
conf->plug_tq.sync = 0;
conf->plug_tq.routine = &raid5_unplug_device;
conf->plug_tq.data = conf;
mddev->queue.unplug_fn = raid5_unplug_device;
PRINTK("raid5: run(md%d) called.\n", mdidx(mddev));
......@@ -1571,9 +1537,10 @@ static int run (mddev_t *mddev)
goto abort;
}
if (conf->working_disks != sb->raid_disks) {
printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
start_recovery = 1;
if (conf->failed_disks == 1 &&
!(sb->state & (1<<MD_SB_CLEAN))) {
printk(KERN_ERR "raid5: cannot start dirty degraded array for md%d\n", mdidx(mddev));
goto abort;
}
{
......@@ -1587,10 +1554,11 @@ static int run (mddev_t *mddev)
}
memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
if (grow_stripes(conf, conf->max_nr_stripes)) {
printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
shrink_stripes(conf);
md_unregister_thread(conf->thread);
goto abort;
} else
printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
......@@ -1615,23 +1583,6 @@ static int run (mddev_t *mddev)
else
printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
const char * name = "raid5syncd";
conf->resync_thread = md_register_thread(raid5syncd, conf,name);
if (!conf->resync_thread) {
printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
goto abort;
}
printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
conf->resync_parity = 1;
md_wakeup_thread(conf->resync_thread);
}
print_raid5_conf(conf);
if (start_recovery)
md_recover_arrays();
print_raid5_conf(conf);
/* Ok, everything is just fine now */
......@@ -1650,48 +1601,12 @@ static int run (mddev_t *mddev)
return -EIO;
}
/*
 * Ask an in-progress raid5 parity resync to stop.
 *
 * Returns 1 when a resync was pending: resync_parity is set to 2 and
 * md_interrupt_thread() is called on the resync thread.  Returns 0 when
 * there is no resync thread or resync_parity is zero.
 */
static int stop_resync (mddev_t *mddev)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);
	mdk_thread_t *syncd = conf->resync_thread;

	/* Nothing to interrupt: no resync thread, or no resync pending. */
	if (!syncd || !conf->resync_parity)
		return 0;

	/* NOTE(review): 2 presumably means "interrupted, restart later" -- confirm in raid5syncd */
	conf->resync_parity = 2;
	md_interrupt_thread(syncd);
	printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
	return 1;
}
/*
 * Resume a parity resync that is still marked as pending.
 *
 * Returns 1 after setting resync_parity back to 1 and waking the resync
 * thread.  Returns 0 when resync_parity is zero (nothing to restart), or
 * when a resync is pending but no resync thread exists, which is reported
 * through MD_BUG().
 */
static int restart_resync (mddev_t *mddev)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);

	if (!conf->resync_parity) {
		printk("raid5: no restart-resync needed.\n");
		return 0;
	}

	/* A pending resync without a thread to run it is an internal error. */
	if (!conf->resync_thread) {
		MD_BUG();
		return 0;
	}

	printk("raid5: waking up raid5resync.\n");
	conf->resync_parity = 1;
	md_wakeup_thread(conf->resync_thread);
	return 1;
}
static int stop (mddev_t *mddev)
{
raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
if (conf->resync_thread)
md_unregister_thread(conf->resync_thread);
md_unregister_thread(conf->thread);
shrink_stripes(conf);
free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
......@@ -2066,8 +1981,6 @@ static mdk_personality_t raid5_personality=
status: status,
error_handler: error,
diskop: diskop,
stop_resync: stop_resync,
restart_resync: restart_resync,
sync_request: sync_request
};
......
......@@ -63,8 +63,6 @@
extern int md_size[MAX_MD_DEVS];
extern struct hd_struct md_hd_struct[MAX_MD_DEVS];
extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev);
extern char * partition_name (kdev_t dev);
extern inline char * bdev_partition_name (struct block_device *bdev)
{
......@@ -77,14 +75,9 @@ extern mdk_thread_t * md_register_thread (void (*run) (void *data),
extern void md_unregister_thread (mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_interrupt_thread (mdk_thread_t *thread);
extern int md_update_sb (mddev_t *mddev);
extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
extern void md_update_sb (mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_sync_acct(kdev_t dev, unsigned long nr_sectors);
extern void md_recover_arrays (void);
extern int md_check_ordering (mddev_t *mddev);
extern int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x);
extern int md_error (mddev_t *mddev, struct block_device *bdev);
extern int md_run_setup(void);
......
......@@ -64,24 +64,6 @@ typedef struct mdk_rdev_s mdk_rdev_t;
#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
/*
 * Maps a kdev to an mddev/subdev.  Entries live in mddev_map[], which
 * kdev_to_mddev() indexes by the device's minor number.
 *
 * mddev: the array that owns this minor.
 * data:  opaque per-mapping pointer.  How 'data' is handled is up to
 *        the personality (e.g. HSM uses this to identify individual LVs).
 */
typedef struct dev_mapping_s {
mddev_t *mddev;
void *data;
} dev_mapping_t;
extern dev_mapping_t mddev_map [MAX_MD_DEVS];
/*
 * Look up the mddev registered for an md device number.
 *
 * Calls BUG() if 'dev' does not carry MD_MAJOR; otherwise returns the
 * mddev stored in the mddev_map[] slot selected by the minor number.
 */
static inline mddev_t * kdev_to_mddev (kdev_t dev)
{
	mddev_t *owner;

	if (major(dev) != MD_MAJOR) {
		BUG();
	}

	owner = mddev_map[minor(dev)].mddev;
	return owner;
}
/*
* options passed in raidrun:
*/
......@@ -196,31 +178,38 @@ struct mddev_s
mdk_personality_t *pers;
int __minor;
mdp_super_t *sb;
int nb_dev;
struct list_head disks;
int sb_dirty;
mdu_param_t param;
int ro;
struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
unsigned long curr_resync; /* blocks scheduled */
unsigned long resync_mark; /* a recent timestamp */
unsigned long resync_mark_cnt;/* blocks written at resync_mark */
char *name;
/* recovery_running is 0 for no recovery/resync,
* 1 for active recovery
* 2 for active resync
* -error for an error (e.g. -EINTR)
* it can only be set > 0 under reconfig_sem
*/
int recovery_running;
int in_sync; /* know to not need resync */
struct semaphore reconfig_sem;
struct semaphore recovery_sem;
struct semaphore resync_sem;
atomic_t active;
mdp_disk_t *spare;
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
request_queue_t queue; /* for plugging ... */
struct list_head all_mddevs;
};
struct mdk_personality_s
{
char *name;
int (*make_request)(mddev_t *mddev, int rw, struct bio *bio);
int (*make_request)(request_queue_t *q, struct bio *bio);
int (*run)(mddev_t *mddev);
int (*stop)(mddev_t *mddev);
int (*status)(char *page, mddev_t *mddev);
......@@ -237,9 +226,6 @@ struct mdk_personality_s
* SPARE_ACTIVE expects such a change)
*/
int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
int (*stop_resync)(mddev_t *mddev);
int (*restart_resync)(mddev_t *mddev);
int (*sync_request)(mddev_t *mddev, sector_t sector_nr, int go_faster);
};
......@@ -279,13 +265,6 @@ extern mdp_disk_t *get_spare(mddev_t *mddev);
#define ITERATE_RDEV(mddev,rdev,tmp) \
ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
/*
 * Like ITERATE_RDEV, but assumes that the device has rdev->desc_nr numbered
 * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
 *
 * 'i' is the loop counter (left equal to nb_dev when the loop finishes)
 * and 'rdev' receives find_rdev_nr(mddev, i) on each pass.  Note that
 * find_rdev_nr() is also evaluated once for i == nb_dev, just before the
 * bound check ends the loop.
 *
 * Every macro argument is parenthesized so that expression arguments
 * (e.g. '*pp' or '&array') bind correctly to '->nb_dev'.
 */
#define ITERATE_RDEV_ORDERED(mddev,rdev,i)				\
	for ((i) = 0; (rdev) = find_rdev_nr((mddev), (i)), (i) < (mddev)->nb_dev; (i)++)
/*
* Iterates through all 'RAID managed disks'
......@@ -299,26 +278,6 @@ extern mdp_disk_t *get_spare(mddev_t *mddev);
#define ITERATE_RDEV_PENDING(rdev,tmp) \
ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
/*
 * Iterates through all used mddevs in the system (the all_mddevs list).
 *
 * 'mddev' receives each list entry in turn; 'tmp' is the list cursor.
 * The loop condition does three things in order: it fetches the entry
 * at 'tmp', steps 'tmp' to the following node, and then stops if the
 * node it just fetched was the list head itself (tmp->prev ==
 * &all_mddevs after the step).  Because 'tmp' has already moved on when
 * the body runs, the body must not treat 'tmp' as the current entry.
 */
#define ITERATE_MDDEV(mddev,tmp) \
\
for (tmp = all_mddevs.next; \
mddev = list_entry(tmp, mddev_t, all_mddevs), \
tmp = tmp->next, tmp->prev != &all_mddevs \
; )
/*
 * Acquire the array's reconfig_sem with down_interruptible() and hand
 * its return value back to the caller unchanged.
 */
static inline int lock_mddev (mddev_t * mddev)
{
	int rc = down_interruptible(&mddev->reconfig_sem);

	return rc;
}
/*
 * Release the array's reconfig_sem; counterpart of lock_mddev().
 */
static inline void unlock_mddev (mddev_t *mddev)
{
	up(&mddev->reconfig_sem);
}
/*
 * Swap the contents of two lvalues, going through a temporary that has
 * the type of 'x'.  Wrapped in do { } while (0) so it behaves as a
 * single statement.
 */
#define xchg_values(x,y)			\
	do {					\
		__typeof__(x) __tmp = x;	\
		x = y;				\
		y = __tmp;			\
	} while (0)
......
......@@ -33,8 +33,7 @@ struct r1_private_data_s {
int working_disks;
int last_used;
sector_t next_seq_sect;
mdk_thread_t *thread, *resync_thread;
int resync_mirrors;
mdk_thread_t *thread;
mirror_info_t *spare;
spinlock_t device_lock;
......
......@@ -177,7 +177,7 @@ struct stripe_head {
* is put on a "delayed" queue until there are no stripes currently
* in a pre-read phase. Further, if the "delayed" queue is empty when
* a stripe is put on it then we "plug" the queue and do not process it
* until an unplg call is made. (the tq_disk list is run).
* until an unplug call is made. (blk_run_queues is run).
*
* When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
* it to the count of prereading stripes.
......@@ -205,12 +205,11 @@ struct disk_info {
struct raid5_private_data {
struct stripe_head **stripe_hashtbl;
mddev_t *mddev;
mdk_thread_t *thread, *resync_thread;
mdk_thread_t *thread;
struct disk_info disks[MD_SB_DISKS];
struct disk_info *spare;
int chunk_size, level, algorithm;
int raid_disks, working_disks, failed_disks;
int resync_parity;
int max_nr_stripes;
struct list_head handle_list; /* stripes needing handling */
......@@ -229,9 +228,6 @@ struct raid5_private_data {
* waiting for 25% to be free
*/
spinlock_t device_lock;
int plugged;
struct tq_struct plug_tq;
};
typedef struct raid5_private_data raid5_conf_t;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment