Commit d571b483 authored by Neil Brown's avatar Neil Brown Committed by Linus Torvalds

[PATCH] md: factor out MD superblock handling code

Define an interface for interpreting and updating superblocks
so we can more easily define new formats.

With this patch, (almost) all superblock layout information is
locating in a small set of routines dedicated to superblock
handling.  This will allow us to provide a similar set for
a different format.

The two exceptions are:
 1/ autostart_array where the devices listed in the superblock
    are searched for.
 2/ raid5 'knows' the maximum number of devices for
     compute_parity.

These will be addressed in a later patch.
parent 6932d2d5
...@@ -307,8 +307,6 @@ static int alloc_disk_sb(mdk_rdev_t * rdev) ...@@ -307,8 +307,6 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
printk(OUT_OF_MEM); printk(OUT_OF_MEM);
return -EINVAL; return -EINVAL;
} }
rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
clear_page(rdev->sb);
return 0; return 0;
} }
...@@ -317,7 +315,7 @@ static void free_disk_sb(mdk_rdev_t * rdev) ...@@ -317,7 +315,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
{ {
if (rdev->sb_page) { if (rdev->sb_page) {
page_cache_release(rdev->sb_page); page_cache_release(rdev->sb_page);
rdev->sb = NULL; rdev->sb_loaded = 0;
rdev->sb_page = NULL; rdev->sb_page = NULL;
rdev->sb_offset = 0; rdev->sb_offset = 0;
rdev->size = 0; rdev->size = 0;
...@@ -365,10 +363,12 @@ static int read_disk_sb(mdk_rdev_t * rdev) ...@@ -365,10 +363,12 @@ static int read_disk_sb(mdk_rdev_t * rdev)
{ {
sector_t sb_offset; sector_t sb_offset;
if (!rdev->sb) { if (!rdev->sb_page) {
MD_BUG(); MD_BUG();
return -EINVAL; return -EINVAL;
} }
if (rdev->sb_loaded)
return 0;
/* /*
* Calculate the position of the superblock, * Calculate the position of the superblock,
...@@ -381,8 +381,7 @@ static int read_disk_sb(mdk_rdev_t * rdev) ...@@ -381,8 +381,7 @@ static int read_disk_sb(mdk_rdev_t * rdev)
if (!sync_page_io(rdev->bdev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) if (!sync_page_io(rdev->bdev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
goto fail; goto fail;
rdev->sb_loaded = 1;
printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
return 0; return 0;
fail: fail:
...@@ -390,6 +389,56 @@ static int read_disk_sb(mdk_rdev_t * rdev) ...@@ -390,6 +389,56 @@ static int read_disk_sb(mdk_rdev_t * rdev)
return -EINVAL; return -EINVAL;
} }
static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
(sb1->set_uuid1 == sb2->set_uuid1) &&
(sb1->set_uuid2 == sb2->set_uuid2) &&
(sb1->set_uuid3 == sb2->set_uuid3))
return 1;
return 0;
}
static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
int ret;
mdp_super_t *tmp1, *tmp2;
tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
if (!tmp1 || !tmp2) {
ret = 0;
printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
goto abort;
}
*tmp1 = *sb1;
*tmp2 = *sb2;
/*
* nr_disks is not constant
*/
tmp1->nr_disks = 0;
tmp2->nr_disks = 0;
if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
ret = 0;
else
ret = 1;
abort:
if (tmp1)
kfree(tmp1);
if (tmp2)
kfree(tmp2);
return ret;
}
static unsigned int calc_sb_csum(mdp_super_t * sb) static unsigned int calc_sb_csum(mdp_super_t * sb)
{ {
unsigned int disk_csum, csum; unsigned int disk_csum, csum;
...@@ -402,39 +451,284 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) ...@@ -402,39 +451,284 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
} }
/* /*
* Check one RAID superblock for generic plausibility * Handle superblock details.
* We want to be able to handle multiple superblock formats
* so we have a common interface to them all, and an array of
* different handlers.
* We rely on user-space to write the initial superblock, and support
* reading and updating of superblocks.
* Interface methods are:
* int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev)
* loads and validates a superblock on dev.
* if refdev != NULL, compare superblocks on both devices
* Return:
* 0 - dev has a superblock that is compatible with refdev
* 1 - dev has a superblock that is compatible and newer than refdev
* so dev should be used as the refdev in future
* -EINVAL superblock incompatible or invalid
* -othererror e.g. -EIO
*
* int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
* Verify that dev is acceptable into mddev.
* The first time, mddev->raid_disks will be 0, and data from
* dev should be merged in. Subsequent calls check that dev
* is new enough. Return 0 or -EINVAL
*
* void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
* Update the superblock for rdev with data in mddev
* This does not write to disc.
*
*/ */
static int check_disk_sb(mdk_rdev_t * rdev) struct super_type {
char *name;
struct module *owner;
int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev);
int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
};
/*
* load_super for 0.90.0
*/
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
{ {
mdp_super_t *sb; mdp_super_t *sb;
int ret = -EINVAL; int ret;
sb = rdev->sb; ret = read_disk_sb(rdev);
if (!sb) { if (ret) return ret;
MD_BUG();
goto abort; ret = -EINVAL;
}
sb = (mdp_super_t*)page_address(rdev->sb_page);
if (sb->md_magic != MD_SB_MAGIC) { if (sb->md_magic != MD_SB_MAGIC) {
printk(BAD_MAGIC, bdev_partition_name(rdev->bdev)); printk(BAD_MAGIC, bdev_partition_name(rdev->bdev));
goto abort; goto abort;
} }
if (sb->major_version != 0 ||
sb->minor_version != 90) {
printk(KERN_WARNING "Bad version number %d.%d on %s\n",
sb->major_version, sb->minor_version,
bdev_partition_name(rdev->bdev));
goto abort;
}
if (sb->md_minor >= MAX_MD_DEVS) { if (sb->md_minor >= MAX_MD_DEVS) {
printk(BAD_MINOR, bdev_partition_name(rdev->bdev), sb->md_minor); printk(BAD_MINOR, bdev_partition_name(rdev->bdev), sb->md_minor);
goto abort; goto abort;
} }
if (sb->raid_disks <= 0)
goto abort;
if (calc_sb_csum(sb) != sb->sb_csum) { if (calc_sb_csum(sb) != sb->sb_csum) {
printk(BAD_CSUM, bdev_partition_name(rdev->bdev)); printk(BAD_CSUM, bdev_partition_name(rdev->bdev));
goto abort; goto abort;
} }
ret = 0;
abort: rdev->preferred_minor = sb->md_minor;
if (refdev == 0)
ret = 1;
else {
__u64 ev1, ev2;
mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
if (!uuid_equal(refsb, sb)) {
printk(KERN_WARNING "md: %s has different UUID to %s\n",
bdev_partition_name(rdev->bdev),
bdev_partition_name(refdev->bdev));
goto abort;
}
if (!sb_equal(refsb, sb)) {
printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
bdev_partition_name(rdev->bdev),
bdev_partition_name(refdev->bdev));
goto abort;
}
ev1 = md_event(sb);
ev2 = md_event(refsb);
if (ev1 > ev2)
ret = 1;
else
ret = 0;
}
abort:
return ret; return ret;
} }
/*
* validate_super for 0.90.0
*/
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
mdp_disk_t *desc;
mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
if (mddev->raid_disks == 0) {
mddev->major_version = sb->major_version;
mddev->minor_version = sb->minor_version;
mddev->patch_version = sb->patch_version;
mddev->persistent = ! sb->not_persistent;
mddev->chunk_size = sb->chunk_size;
mddev->ctime = sb->ctime;
mddev->utime = sb->utime;
mddev->level = sb->level;
mddev->layout = sb->layout;
mddev->raid_disks = sb->raid_disks;
mddev->state = sb->state;
mddev->size = sb->size;
mddev->events = md_event(sb);
memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
mddev->max_disks = MD_SB_DISKS;
} else {
__u64 ev1;
ev1 = md_event(sb);
++ev1;
if (ev1 < mddev->events)
return -EINVAL;
}
if (mddev->level != LEVEL_MULTIPATH) {
rdev->desc_nr = sb->this_disk.number;
rdev->raid_disk = -1;
rdev->in_sync = rdev->faulty = 0;
desc = sb->disks + rdev->desc_nr;
if (desc->state & (1<<MD_DISK_FAULTY))
rdev->faulty = 1;
else if (desc->state & (1<<MD_DISK_SYNC) &&
desc->raid_disk < mddev->raid_disks) {
rdev->in_sync = 1;
rdev->raid_disk = desc->raid_disk;
}
}
return 0;
}
/*
* sync_super for 0.90.0
*/
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
mdp_super_t *sb;
struct list_head *tmp;
mdk_rdev_t *rdev2;
int next_spare = mddev->raid_disks;
/* make rdev->sb match mddev data..
*
* 1/ zero out disks
* 2/ Add info for each disk, keeping track of highest desc_nr
* 3/ any empty disks < highest become removed
*
* disks[0] gets initialised to REMOVED because
* we cannot be sure from other fields if it has
* been initialised or not.
*/
int highest = 0;
int i;
int active=0, working=0,failed=0,spare=0,nr_disks=0;
sb = (mdp_super_t*)page_address(rdev->sb_page);
memset(sb, 0, sizeof(*sb));
sb->md_magic = MD_SB_MAGIC;
sb->major_version = mddev->major_version;
sb->minor_version = mddev->minor_version;
sb->patch_version = mddev->patch_version;
sb->gvalid_words = 0; /* ignored */
memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
memcpy(&sb->set_uuid3, mddev->uuid+12,4);
sb->ctime = mddev->ctime;
sb->level = mddev->level;
sb->size = mddev->size;
sb->raid_disks = mddev->raid_disks;
sb->md_minor = mddev->__minor;
sb->not_persistent = !mddev->persistent;
sb->utime = mddev->utime;
sb->state = mddev->state;
sb->events_hi = (mddev->events>>32);
sb->events_lo = (u32)mddev->events;
sb->layout = mddev->layout;
sb->chunk_size = mddev->chunk_size;
sb->disks[0].state = (1<<MD_DISK_REMOVED);
ITERATE_RDEV(mddev,rdev2,tmp) {
mdp_disk_t *d;
if (rdev2->raid_disk >= 0)
rdev2->desc_nr = rdev2->raid_disk;
else
rdev2->desc_nr = next_spare++;
d = &sb->disks[rdev2->desc_nr];
nr_disks++;
d->number = rdev2->desc_nr;
d->major = MAJOR(rdev2->bdev->bd_dev);
d->minor = MINOR(rdev2->bdev->bd_dev);
if (rdev2->raid_disk >= 0)
d->raid_disk = rdev2->raid_disk;
else
d->raid_disk = rdev2->desc_nr; /* compatability */
if (rdev2->faulty) {
d->state = (1<<MD_DISK_FAULTY);
failed++;
} else if (rdev2->in_sync) {
d->state = (1<<MD_DISK_ACTIVE);
d->state |= (1<<MD_DISK_SYNC);
active++;
working++;
} else {
d->state = 0;
spare++;
working++;
}
if (rdev2->desc_nr > highest)
highest = rdev2->desc_nr;
}
/* now set the "removed" bit on any non-trailing holes */
for (i=0; i<highest; i++) {
mdp_disk_t *d = &sb->disks[i];
if (d->state == 0 && d->number == 0) {
d->number = i;
d->raid_disk = i;
d->state = (1<<MD_DISK_REMOVED);
}
}
sb->nr_disks = nr_disks;
sb->active_disks = active;
sb->working_disks = working;
sb->failed_disks = failed;
sb->spare_disks = spare;
sb->this_disk = sb->disks[rdev->desc_nr];
sb->sb_csum = calc_sb_csum(sb);
}
struct super_type super_types[] = {
[0] = {
.name = "0.90.0",
.owner = THIS_MODULE,
.load_super = super_90_load,
.validate_super = super_90_validate,
.sync_super = super_90_sync,
},
};
static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
{ {
struct list_head *tmp; struct list_head *tmp;
...@@ -618,9 +912,9 @@ static void print_rdev(mdk_rdev_t *rdev) ...@@ -618,9 +912,9 @@ static void print_rdev(mdk_rdev_t *rdev)
printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ", printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ",
bdev_partition_name(rdev->bdev), bdev_partition_name(rdev->bdev),
(unsigned long long)rdev->size, rdev->faulty, rdev->in_sync, rdev->desc_nr); (unsigned long long)rdev->size, rdev->faulty, rdev->in_sync, rdev->desc_nr);
if (rdev->sb) { if (rdev->sb_loaded) {
printk(KERN_INFO "md: rdev superblock:\n"); printk(KERN_INFO "md: rdev superblock:\n");
print_sb(rdev->sb); print_sb((mdp_super_t*)page_address(rdev->sb_page));
} else } else
printk(KERN_INFO "md: no rdev superblock!\n"); printk(KERN_INFO "md: no rdev superblock!\n");
} }
...@@ -648,61 +942,13 @@ void md_print_devices(void) ...@@ -648,61 +942,13 @@ void md_print_devices(void)
printk("\n"); printk("\n");
} }
static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
int ret;
mdp_super_t *tmp1, *tmp2;
tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
if (!tmp1 || !tmp2) {
ret = 0;
printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
goto abort;
}
*tmp1 = *sb1;
*tmp2 = *sb2;
/*
* nr_disks is not constant
*/
tmp1->nr_disks = 0;
tmp2->nr_disks = 0;
if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
ret = 0;
else
ret = 1;
abort:
if (tmp1)
kfree(tmp1);
if (tmp2)
kfree(tmp2);
return ret;
}
static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
{
if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
(rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
(rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
(rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
return 1;
return 0;
}
static int write_disk_sb(mdk_rdev_t * rdev) static int write_disk_sb(mdk_rdev_t * rdev)
{ {
sector_t sb_offset; sector_t sb_offset;
sector_t size; sector_t size;
if (!rdev->sb) { if (!rdev->sb_loaded) {
MD_BUG(); MD_BUG();
return 1; return 1;
} }
...@@ -710,10 +956,6 @@ static int write_disk_sb(mdk_rdev_t * rdev) ...@@ -710,10 +956,6 @@ static int write_disk_sb(mdk_rdev_t * rdev)
MD_BUG(); MD_BUG();
return 1; return 1;
} }
if (rdev->sb->md_magic != MD_SB_MAGIC) {
MD_BUG();
return 1;
}
sb_offset = calc_dev_sboffset(rdev->bdev); sb_offset = calc_dev_sboffset(rdev->bdev);
if (rdev->sb_offset != sb_offset) { if (rdev->sb_offset != sb_offset) {
...@@ -751,116 +993,10 @@ static int write_disk_sb(mdk_rdev_t * rdev) ...@@ -751,116 +993,10 @@ static int write_disk_sb(mdk_rdev_t * rdev)
static void sync_sbs(mddev_t * mddev) static void sync_sbs(mddev_t * mddev)
{ {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
mdp_super_t *sb;
struct list_head *tmp; struct list_head *tmp;
int next_spare = mddev->raid_disks;
/* make all rdev->sb match mddev data..
* we setup the data in the first rdev and copy it
* to the others.
*
* 1/ zero out disks
* 2/ Add info for each disk, keeping track of highest desc_nr
* 3/ any empty disks < highest become removed
*
* disks[0] gets initialised to REMOVED because
* we cannot be sure from other fields if it has
* been initialised or not.
*/
int highest = 0;
int i;
int active=0, working=0,failed=0,spare=0,nr_disks=0;
if (list_empty(&mddev->disks)) {
MD_BUG();
return;
}
rdev = list_entry(mddev->disks.next, mdk_rdev_t, same_set);
sb = rdev->sb;
memset(sb, 0, sizeof(*sb));
sb->md_magic = MD_SB_MAGIC;
sb->major_version = mddev->major_version;
sb->minor_version = mddev->minor_version;
sb->patch_version = mddev->patch_version;
sb->gvalid_words = 0; /* ignored */
memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
memcpy(&sb->set_uuid3, mddev->uuid+12,4);
sb->ctime = mddev->ctime;
sb->level = mddev->level;
sb->size = mddev->size;
sb->raid_disks = mddev->raid_disks;
sb->md_minor = mddev->__minor;
sb->not_persistent = !mddev->persistent;
sb->utime = mddev->utime;
sb->state = mddev->state;
sb->events_hi = (mddev->events>>32);
sb->events_lo = (u32)mddev->events;
sb->layout = mddev->layout; ITERATE_RDEV(mddev,rdev,tmp)
sb->chunk_size = mddev->chunk_size; super_90_sync(mddev, rdev);
sb->disks[0].state = (1<<MD_DISK_REMOVED);
ITERATE_RDEV(mddev,rdev,tmp) {
mdp_disk_t *d;
if (rdev->raid_disk >= 0)
rdev->desc_nr = rdev->raid_disk;
else
rdev->desc_nr = next_spare++;
d = &sb->disks[rdev->desc_nr];
nr_disks++;
d->number = rdev->desc_nr;
d->major = MAJOR(rdev->bdev->bd_dev);
d->minor = MINOR(rdev->bdev->bd_dev);
if (rdev->raid_disk >= 0)
d->raid_disk = rdev->raid_disk;
else
d->raid_disk = rdev->desc_nr; /* compatability */
if (rdev->faulty) {
d->state = (1<<MD_DISK_FAULTY);
failed++;
} else if (rdev->in_sync) {
d->state = (1<<MD_DISK_ACTIVE);
d->state |= (1<<MD_DISK_SYNC);
active++;
working++;
} else {
d->state = 0;
spare++;
working++;
}
if (rdev->desc_nr > highest)
highest = rdev->desc_nr;
}
/* now set the "removed" bit on any non-trailing holes */
for (i=0; i<highest; i++) {
mdp_disk_t *d = &sb->disks[i];
if (d->state == 0 && d->number == 0) {
d->number = i;
d->raid_disk = i;
d->state = (1<<MD_DISK_REMOVED);
}
}
sb->nr_disks = nr_disks;
sb->active_disks = active;
sb->working_disks = working;
sb->failed_disks = failed;
sb->spare_disks = spare;
ITERATE_RDEV(mddev,rdev,tmp) {
mdp_super_t *this_sb;
this_sb = rdev->sb;
if (this_sb != sb)
*this_sb = *sb;
this_sb->this_disk = this_sb->disks[rdev->desc_nr];
this_sb->sb_csum = calc_sb_csum(this_sb);
}
} }
static void md_update_sb(mddev_t * mddev) static void md_update_sb(mddev_t * mddev)
...@@ -903,8 +1039,6 @@ static void md_update_sb(mddev_t * mddev) ...@@ -903,8 +1039,6 @@ static void md_update_sb(mddev_t * mddev)
printk("%s ", bdev_partition_name(rdev->bdev)); printk("%s ", bdev_partition_name(rdev->bdev));
if (!rdev->faulty) { if (!rdev->faulty) {
printk("[events: %08lx]",
(unsigned long)rdev->sb->events_lo);
err += write_disk_sb(rdev); err += write_disk_sb(rdev);
} else } else
printk(")\n"); printk(")\n");
...@@ -968,13 +1102,14 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk) ...@@ -968,13 +1102,14 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk)
} }
if (on_disk) { if (on_disk) {
if ((err = read_disk_sb(rdev))) { err = super_90_load(rdev, NULL);
printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", if (err == -EINVAL) {
printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
bdev_partition_name(rdev->bdev)); bdev_partition_name(rdev->bdev));
goto abort_free; goto abort_free;
} }
if ((err = check_disk_sb(rdev))) { if (err < 0) {
printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
bdev_partition_name(rdev->bdev)); bdev_partition_name(rdev->bdev));
goto abort_free; goto abort_free;
} }
...@@ -984,7 +1119,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk) ...@@ -984,7 +1119,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk)
return rdev; return rdev;
abort_free: abort_free:
if (rdev->sb) { if (rdev->sb_page) {
if (rdev->bdev) if (rdev->bdev)
unlock_rdev(rdev); unlock_rdev(rdev);
free_disk_sb(rdev); free_disk_sb(rdev);
...@@ -1014,155 +1149,39 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk) ...@@ -1014,155 +1149,39 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk)
static int analyze_sbs(mddev_t * mddev) static int analyze_sbs(mddev_t * mddev)
{ {
int out_of_date = 0, i; int i;
struct list_head *tmp; struct list_head *tmp;
mdk_rdev_t *rdev, *freshest; mdk_rdev_t *rdev, *freshest;
mdp_super_t *sb;
/*
* Verify the RAID superblock on each real device
*/
ITERATE_RDEV(mddev,rdev,tmp) {
if (rdev->faulty) {
MD_BUG();
goto abort;
}
if (!rdev->sb) {
MD_BUG();
goto abort;
}
if (check_disk_sb(rdev))
goto abort;
}
/* freshest = NULL;
* The superblock constant part has to be the same ITERATE_RDEV(mddev,rdev,tmp)
* for all disks in the array. switch (super_90_load(rdev, freshest)) {
*/ case 1:
sb = NULL; freshest = rdev;
break;
ITERATE_RDEV(mddev,rdev,tmp) { case 0:
if (!sb) { break;
sb = rdev->sb; default:
continue;
}
if (!sb_equal(sb, rdev->sb)) {
printk(INCONSISTENT, bdev_partition_name(rdev->bdev)); printk(INCONSISTENT, bdev_partition_name(rdev->bdev));
kick_rdev_from_array(rdev); kick_rdev_from_array(rdev);
continue;
} }
}
/*
* OK, we have all disks and the array is ready to run. Let's
* find the freshest superblock, that one will be the superblock
* that represents the whole array.
*/
freshest = NULL;
ITERATE_RDEV(mddev,rdev,tmp) { super_90_validate(mddev, freshest);
__u64 ev1, ev2;
/*
* if the checksum is invalid, use the superblock
* only as a last resort. (decrease it's age by
* one event)
*/
if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
if (rdev->sb->events_lo || rdev->sb->events_hi)
if (!(rdev->sb->events_lo--))
rdev->sb->events_hi--;
}
printk(KERN_INFO "md: %s's event counter: %08lx\n",
bdev_partition_name(rdev->bdev),
(unsigned long)rdev->sb->events_lo);
if (!freshest) {
freshest = rdev;
continue;
}
/*
* Find the newest superblock version
*/
ev1 = md_event(rdev->sb);
ev2 = md_event(freshest->sb);
if (ev1 != ev2) {
out_of_date = 1;
if (ev1 > ev2)
freshest = rdev;
}
}
if (out_of_date) {
printk(OUT_OF_DATE);
printk(KERN_INFO "md: freshest: %s\n", bdev_partition_name(freshest->bdev));
}
sb = freshest->sb;
mddev->major_version = sb->major_version;
mddev->minor_version = sb->minor_version;
mddev->patch_version = sb->patch_version;
mddev->persistent = ! sb->not_persistent;
mddev->chunk_size = sb->chunk_size;
mddev->ctime = sb->ctime;
mddev->utime = sb->utime;
mddev->level = sb->level;
mddev->layout = sb->layout;
mddev->raid_disks = sb->raid_disks;
mddev->state = sb->state;
mddev->size = sb->size;
mddev->events = md_event(sb);
memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
/*
* at this point we have picked the 'best' superblock
* from all available superblocks.
* now we validate this superblock and kick out possibly
* failed disks.
*/
ITERATE_RDEV(mddev,rdev,tmp) {
/*
* Kick all non-fresh devices
*/
__u64 ev1;
ev1 = md_event(rdev->sb);
++ev1;
if (ev1 < mddev->events) {
printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
bdev_partition_name(rdev->bdev));
kick_rdev_from_array(rdev);
continue;
}
}
/* set rdev->desc_nr for each device.
* for MULTIPATH, we just us sequential number as
* nothing else is meaningful
*/
i = 0; i = 0;
ITERATE_RDEV(mddev,rdev,tmp) { ITERATE_RDEV(mddev,rdev,tmp) {
if (rdev != freshest)
if (super_90_validate(mddev, rdev)) {
printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
bdev_partition_name(rdev->bdev));
kick_rdev_from_array(rdev);
continue;
}
if (mddev->level == LEVEL_MULTIPATH) { if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++; rdev->desc_nr = i++;
rdev->raid_disk = rdev->desc_nr; rdev->raid_disk = rdev->desc_nr;
rdev->in_sync = 1; rdev->in_sync = 1;
} else {
mdp_disk_t *desc;
rdev->desc_nr = rdev->sb->this_disk.number;
desc = sb->disks + rdev->desc_nr;
rdev->raid_disk = -1;
rdev->in_sync = rdev->faulty = 0;
if (desc->state & (1<<MD_DISK_FAULTY)) {
rdev->faulty = 1;
kick_rdev_from_array(rdev);
} else if (desc->state & (1<<MD_DISK_SYNC) &&
desc->raid_disk < mddev->raid_disks) {
rdev->in_sync = 1;
rdev->raid_disk = desc->raid_disk;
}
} }
} }
...@@ -1579,20 +1598,6 @@ static int do_md_stop(mddev_t * mddev, int ro) ...@@ -1579,20 +1598,6 @@ static int do_md_stop(mddev_t * mddev, int ro)
return err; return err;
} }
/*
* We have to safely support old arrays too.
*/
int detect_old_array(mdp_super_t *sb)
{
if (sb->major_version > 0)
return 0;
if (sb->minor_version >= 90)
return 0;
return -EINVAL;
}
static void autorun_array(mddev_t *mddev) static void autorun_array(mddev_t *mddev)
{ {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
...@@ -1648,25 +1653,18 @@ static void autorun_devices(void) ...@@ -1648,25 +1653,18 @@ static void autorun_devices(void)
printk(KERN_INFO "md: considering %s ...\n", bdev_partition_name(rdev0->bdev)); printk(KERN_INFO "md: considering %s ...\n", bdev_partition_name(rdev0->bdev));
INIT_LIST_HEAD(&candidates); INIT_LIST_HEAD(&candidates);
ITERATE_RDEV_PENDING(rdev,tmp) { ITERATE_RDEV_PENDING(rdev,tmp)
if (uuid_equal(rdev0, rdev)) { if (super_90_load(rdev, rdev0) >= 0) {
if (!sb_equal(rdev0->sb, rdev->sb)) {
printk(KERN_WARNING
"md: %s has same UUID as %s, but superblocks differ ...\n",
bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev));
continue;
}
printk(KERN_INFO "md: adding %s ...\n", bdev_partition_name(rdev->bdev)); printk(KERN_INFO "md: adding %s ...\n", bdev_partition_name(rdev->bdev));
list_move(&rdev->same_set, &candidates); list_move(&rdev->same_set, &candidates);
} }
}
/* /*
* now we have a set of devices, with all of them having * now we have a set of devices, with all of them having
* mostly sane superblocks. It's time to allocate the * mostly sane superblocks. It's time to allocate the
* mddev. * mddev.
*/ */
mddev = mddev_find(rdev0->sb->md_minor); mddev = mddev_find(rdev0->preferred_minor);
if (!mddev) { if (!mddev) {
printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
break; break;
...@@ -1748,15 +1746,6 @@ static int autostart_array(dev_t startdev) ...@@ -1748,15 +1746,6 @@ static int autostart_array(dev_t startdev)
} }
list_add(&start_rdev->same_set, &pending_raid_disks); list_add(&start_rdev->same_set, &pending_raid_disks);
sb = start_rdev->sb;
err = detect_old_array(sb);
if (err) {
printk(KERN_WARNING "md: array version is too old to be autostarted ,"
"use raidtools 0.90 mkraid --upgrade to upgrade the array "
"without data loss!\n");
goto abort;
}
for (i = 0; i < MD_SB_DISKS; i++) { for (i = 0; i < MD_SB_DISKS; i++) {
mdp_disk_t *desc; mdp_disk_t *desc;
...@@ -1875,8 +1864,6 @@ static int get_disk_info(mddev_t * mddev, void * arg) ...@@ -1875,8 +1864,6 @@ static int get_disk_info(mddev_t * mddev, void * arg)
return -EFAULT; return -EFAULT;
nr = info.number; nr = info.number;
if (nr >= MD_SB_DISKS)
return -EINVAL;
rdev = find_rdev_nr(mddev, nr); rdev = find_rdev_nr(mddev, nr);
if (rdev) { if (rdev) {
...@@ -1918,18 +1905,13 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -1918,18 +1905,13 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
if (!list_empty(&mddev->disks)) { if (!list_empty(&mddev->disks)) {
mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
mdk_rdev_t, same_set); mdk_rdev_t, same_set);
if (!uuid_equal(rdev0, rdev)) { int err = super_90_load(rdev, NULL);
if (err < 0) {
printk(KERN_WARNING "md: %s has different UUID to %s\n", printk(KERN_WARNING "md: %s has different UUID to %s\n",
bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev)); bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev));
export_rdev(rdev); export_rdev(rdev);
return -EINVAL; return -EINVAL;
} }
if (!sb_equal(rdev0->sb, rdev->sb)) {
printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev));
export_rdev(rdev);
return -EINVAL;
}
} }
bind_rdev_to_array(rdev, mddev); bind_rdev_to_array(rdev, mddev);
return 0; return 0;
...@@ -2080,11 +2062,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) ...@@ -2080,11 +2062,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
rdev->size = size; rdev->size = size;
rdev->sb_offset = calc_dev_sboffset(rdev->bdev); rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
for (i = mddev->raid_disks; i < MD_SB_DISKS; i++) for (i = mddev->raid_disks; i < mddev->max_disks; i++)
if (find_rdev_nr(mddev,i)==NULL) if (find_rdev_nr(mddev,i)==NULL)
break; break;
if (i == MD_SB_DISKS) { if (i == mddev->max_disks) {
printk(KERN_WARNING "md%d: can not hot-add to full array!\n", printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
mdidx(mddev)); mdidx(mddev));
err = -EBUSY; err = -EBUSY;
......
...@@ -59,7 +59,7 @@ static void mp_pool_free(void *mpb, void *data) ...@@ -59,7 +59,7 @@ static void mp_pool_free(void *mpb, void *data)
static int multipath_map (mddev_t *mddev, mdk_rdev_t **rdevp) static int multipath_map (mddev_t *mddev, mdk_rdev_t **rdevp)
{ {
multipath_conf_t *conf = mddev_to_conf(mddev); multipath_conf_t *conf = mddev_to_conf(mddev);
int i, disks = MD_SB_DISKS; int i, disks = mddev->max_disks;
/* /*
* Later we do read balancing on the read side * Later we do read balancing on the read side
...@@ -147,7 +147,7 @@ static int multipath_read_balance (multipath_conf_t *conf) ...@@ -147,7 +147,7 @@ static int multipath_read_balance (multipath_conf_t *conf)
{ {
int disk; int disk;
for (disk = 0; disk < MD_SB_DISKS; disk++) { for (disk = 0; disk < conf->mddev->max_disks; disk++) {
mdk_rdev_t *rdev = conf->multipaths[disk].rdev; mdk_rdev_t *rdev = conf->multipaths[disk].rdev;
if (rdev && rdev->in_sync) if (rdev && rdev->in_sync)
return disk; return disk;
...@@ -259,7 +259,7 @@ static void print_multipath_conf (multipath_conf_t *conf) ...@@ -259,7 +259,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
printk(" --- wd:%d rd:%d\n", conf->working_disks, printk(" --- wd:%d rd:%d\n", conf->working_disks,
conf->raid_disks); conf->raid_disks);
for (i = 0; i < MD_SB_DISKS; i++) { for (i = 0; i < conf->mddev->max_disks; i++) {
tmp = conf->multipaths + i; tmp = conf->multipaths + i;
if (tmp->rdev) if (tmp->rdev)
printk(" disk%d, o:%d, dev:%s\n", printk(" disk%d, o:%d, dev:%s\n",
......
...@@ -151,8 +151,9 @@ struct mdk_rdev_s ...@@ -151,8 +151,9 @@ struct mdk_rdev_s
struct block_device *bdev; /* block device handle */ struct block_device *bdev; /* block device handle */
struct page *sb_page; struct page *sb_page;
mdp_super_t *sb; int sb_loaded;
sector_t sb_offset; sector_t sb_offset;
int preferred_minor; /* autorun support */
/* A device can be in one of three states based on two flags: /* A device can be in one of three states based on two flags:
* Not working: faulty==1 in_sync==0 * Not working: faulty==1 in_sync==0
...@@ -196,6 +197,7 @@ struct mddev_s ...@@ -196,6 +197,7 @@ struct mddev_s
time_t ctime, utime; time_t ctime, utime;
int level, layout; int level, layout;
int raid_disks; int raid_disks;
int max_disks;
unsigned long state; unsigned long state;
sector_t size; /* used size of component devices */ sector_t size; /* used size of component devices */
__u64 events; __u64 events;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment