Commit 1524c2f2 authored by Linus Torvalds's avatar Linus Torvalds

Merge bk://kernel.bkbits.net/jgarzik/random-2.5

into home.transmeta.com:/home/torvalds/v2.5/linux
parents 1c6604f1 65511c2b
Tools that manage md devices can be found at Tools that manage md devices can be found at
http://www.<country>.kernel.org/pub/linux/daemons/raid/.... http://www.<country>.kernel.org/pub/linux/utils/raid/....
You can boot (if you selected boot support in the configuration) with your md You can boot with your md device with the following kernel command
device with the following kernel command lines: lines:
for old raid arrays without persistent superblocks: for old raid arrays without persistent superblocks:
md=<md device no.>,<raid level>,<chunk size factor>,<fault level>,dev0,dev1,...,devn md=<md device no.>,<raid level>,<chunk size factor>,<fault level>,dev0,dev1,...,devn
...@@ -34,3 +34,63 @@ A possible loadlin line (Harald Hoyer <HarryH@Royal.Net>) looks like this: ...@@ -34,3 +34,63 @@ A possible loadlin line (Harald Hoyer <HarryH@Royal.Net>) looks like this:
e:\loadlin\loadlin e:\zimage root=/dev/md0 md=0,0,4,0,/dev/hdb2,/dev/hdc3 ro e:\loadlin\loadlin e:\zimage root=/dev/md0 md=0,0,4,0,/dev/hdb2,/dev/hdc3 ro
-------------------------------
The md driver can support a variety of different superblock formats.
(It doesn't yet, but it can)
The kernel does *NOT* autodetect which format superblock is being
used. It must be told.
Superblock format '0' is treated differently to others for legacy
reasons.
General Rules - apply for all superblock formats
------------------------------------------------
An array is 'created' by writing appropriate superblocks to all
devices.
It is 'assembled' by associating each of these devices with a
particular md virtual device. Once it is completely assembled, it can
be accessed.
An array should be created by a user-space tool. This will write
superblocks to all devices. It will usually mark the array as
'unclean', or with some devices missing so that the kernel md driver
can create appropriate redundancy (copying in raid1, parity
calculation in raid4/5).
When an array is assembled, it is first initialised with the
SET_ARRAY_INFO ioctl. This contains, in particular, a major and minor
version number. The major version number selects which superblock
format is to be used. The minor number might be used to tune handling
of the format, such as suggesting where on each device to look for the
superblock.
Then each device is added using the ADD_NEW_DISK ioctl. This
provides, in particular, a major and minor number identifying the
device to add.
The array is started with the RUN_ARRAY ioctl.
Once started, new devices can be added. They should have an
appropriate superblock written to them, and then be passed in with
ADD_NEW_DISK.
Devices that have failed or are not yet active can be detached from an
array using HOT_REMOVE_DISK.
Specific Rules that apply to format-0 super block arrays, and
arrays with no superblock (non-persistent).
-------------------------------------------------------------
An array can be 'created' by describing the array (level, chunksize
etc) in a SET_ARRAY_INFO ioctl. This must have major_version==0 and
raid_disks != 0.
Then uninitialised devices can be added with ADD_NEW_DISK. The
structure passed to ADD_NEW_DISK must specify the state of the device
and its role in the array.
Once started with RUN_ARRAY, uninitialised spares can be added with
HOT_ADD_DISK.
...@@ -75,5 +75,6 @@ int __init platform_bus_init(void) ...@@ -75,5 +75,6 @@ int __init platform_bus_init(void)
return bus_register(&platform_bus_type); return bus_register(&platform_bus_type);
} }
EXPORT_SYMBOL(platform_bus_type);
EXPORT_SYMBOL(platform_device_register); EXPORT_SYMBOL(platform_device_register);
EXPORT_SYMBOL(platform_device_unregister); EXPORT_SYMBOL(platform_device_unregister);
...@@ -203,36 +203,34 @@ static int linear_make_request (request_queue_t *q, struct bio *bio) ...@@ -203,36 +203,34 @@ static int linear_make_request (request_queue_t *q, struct bio *bio)
return 0; return 0;
} }
bio->bi_bdev = tmp_dev->rdev->bdev; bio->bi_bdev = tmp_dev->rdev->bdev;
bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1); bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1) + tmp_dev->rdev->data_offset;
return 1; return 1;
} }
static int linear_status (char *page, mddev_t *mddev) static void linear_status (struct seq_file *seq, mddev_t *mddev)
{ {
int sz = 0;
#undef MD_DEBUG #undef MD_DEBUG
#ifdef MD_DEBUG #ifdef MD_DEBUG
int j; int j;
linear_conf_t *conf = mddev_to_conf(mddev); linear_conf_t *conf = mddev_to_conf(mddev);
sz += sprintf(page+sz, " "); seq_printf(seq, " ");
for (j = 0; j < conf->nr_zones; j++) for (j = 0; j < conf->nr_zones; j++)
{ {
sz += sprintf(page+sz, "[%s", seq_printf(seq, "[%s",
bdev_partition_name(conf->hash_table[j].dev0->rdev->bdev)); bdev_partition_name(conf->hash_table[j].dev0->rdev->bdev));
if (conf->hash_table[j].dev1) if (conf->hash_table[j].dev1)
sz += sprintf(page+sz, "/%s] ", seq_printf(seq, "/%s] ",
bdev_partition_name(conf->hash_table[j].dev1->rdev->bdev)); bdev_partition_name(conf->hash_table[j].dev1->rdev->bdev));
else else
sz += sprintf(page+sz, "] "); seq_printf(seq, "] ");
} }
sz += sprintf(page+sz, "\n"); seq_printf(seq, "\n");
#endif #endif
sz += sprintf(page+sz, " %dk rounding", mddev->chunk_size/1024); seq_printf(seq, " %dk rounding", mddev->chunk_size/1024);
return sz;
} }
......
...@@ -124,9 +124,6 @@ static ctl_table raid_root_table[] = { ...@@ -124,9 +124,6 @@ static ctl_table raid_root_table[] = {
{ .ctl_name = 0 } { .ctl_name = 0 }
}; };
static void md_recover_arrays(void);
static mdk_thread_t *md_recovery_thread;
sector_t md_size[MAX_MD_DEVS]; sector_t md_size[MAX_MD_DEVS];
static struct block_device_operations md_fops; static struct block_device_operations md_fops;
...@@ -222,6 +219,7 @@ static mddev_t * mddev_find(int unit) ...@@ -222,6 +219,7 @@ static mddev_t * mddev_find(int unit)
init_MUTEX(&new->reconfig_sem); init_MUTEX(&new->reconfig_sem);
INIT_LIST_HEAD(&new->disks); INIT_LIST_HEAD(&new->disks);
INIT_LIST_HEAD(&new->all_mddevs); INIT_LIST_HEAD(&new->all_mddevs);
init_timer(&new->safemode_timer);
atomic_set(&new->active, 1); atomic_set(&new->active, 1);
blk_queue_make_request(&new->queue, md_fail_request); blk_queue_make_request(&new->queue, md_fail_request);
...@@ -272,40 +270,35 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) ...@@ -272,40 +270,35 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
return NULL; return NULL;
} }
static sector_t calc_dev_sboffset(struct block_device *bdev) inline static sector_t calc_dev_sboffset(struct block_device *bdev)
{ {
sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
return MD_NEW_SIZE_BLOCKS(size); return MD_NEW_SIZE_BLOCKS(size);
} }
static sector_t calc_dev_size(struct block_device *bdev, mddev_t *mddev) static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{ {
sector_t size; sector_t size;
if (mddev->persistent) size = rdev->sb_offset;
size = calc_dev_sboffset(bdev);
else if (chunk_size)
size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; size &= ~((sector_t)chunk_size/1024 - 1);
if (mddev->chunk_size)
size &= ~((sector_t)mddev->chunk_size/1024 - 1);
return size; return size;
} }
static sector_t zoned_raid_size(mddev_t *mddev) static sector_t zoned_raid_size(mddev_t *mddev)
{ {
sector_t mask;
mdk_rdev_t * rdev; mdk_rdev_t * rdev;
struct list_head *tmp; struct list_head *tmp;
/* /*
* do size and offset calculations. * do size and offset calculations.
*/ */
mask = ~((sector_t)mddev->chunk_size/1024 - 1);
ITERATE_RDEV(mddev,rdev,tmp) { ITERATE_RDEV(mddev,rdev,tmp)
rdev->size &= mask;
md_size[mdidx(mddev)] += rdev->size; md_size[mdidx(mddev)] += rdev->size;
}
return 0; return 0;
} }
...@@ -389,7 +382,6 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size, ...@@ -389,7 +382,6 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
static int read_disk_sb(mdk_rdev_t * rdev) static int read_disk_sb(mdk_rdev_t * rdev)
{ {
sector_t sb_offset;
if (!rdev->sb_page) { if (!rdev->sb_page) {
MD_BUG(); MD_BUG();
...@@ -398,16 +390,8 @@ static int read_disk_sb(mdk_rdev_t * rdev) ...@@ -398,16 +390,8 @@ static int read_disk_sb(mdk_rdev_t * rdev)
if (rdev->sb_loaded) if (rdev->sb_loaded)
return 0; return 0;
/*
* Calculate the position of the superblock,
* it's at the end of the disk.
*
* It also happens to be a multiple of 4Kb.
*/
sb_offset = calc_dev_sboffset(rdev->bdev);
rdev->sb_offset = sb_offset;
if (!sync_page_io(rdev->bdev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
goto fail; goto fail;
rdev->sb_loaded = 1; rdev->sb_loaded = 1;
return 0; return 0;
...@@ -486,7 +470,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) ...@@ -486,7 +470,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
* We rely on user-space to write the initial superblock, and support * We rely on user-space to write the initial superblock, and support
* reading and updating of superblocks. * reading and updating of superblocks.
* Interface methods are: * Interface methods are:
* int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev) * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
* loads and validates a superblock on dev. * loads and validates a superblock on dev.
* if refdev != NULL, compare superblocks on both devices * if refdev != NULL, compare superblocks on both devices
* Return: * Return:
...@@ -511,7 +495,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) ...@@ -511,7 +495,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
struct super_type { struct super_type {
char *name; char *name;
struct module *owner; struct module *owner;
int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev); int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
}; };
...@@ -519,10 +503,20 @@ struct super_type { ...@@ -519,10 +503,20 @@ struct super_type {
/* /*
* load_super for 0.90.0 * load_super for 0.90.0
*/ */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev) static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{ {
mdp_super_t *sb; mdp_super_t *sb;
int ret; int ret;
sector_t sb_offset;
/*
* Calculate the position of the superblock,
* it's at the end of the disk.
*
* It also happens to be a multiple of 4Kb.
*/
sb_offset = calc_dev_sboffset(rdev->bdev);
rdev->sb_offset = sb_offset;
ret = read_disk_sb(rdev); ret = read_disk_sb(rdev);
if (ret) return ret; if (ret) return ret;
...@@ -557,6 +551,12 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev) ...@@ -557,6 +551,12 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
} }
rdev->preferred_minor = sb->md_minor; rdev->preferred_minor = sb->md_minor;
rdev->data_offset = 0;
if (sb->level == MULTIPATH)
rdev->desc_nr = -1;
else
rdev->desc_nr = sb->this_disk.number;
if (refdev == 0) if (refdev == 0)
ret = 1; ret = 1;
...@@ -582,7 +582,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev) ...@@ -582,7 +582,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
else else
ret = 0; ret = 0;
} }
rdev->size = calc_dev_size(rdev, sb->chunk_size);
abort: abort:
return ret; return ret;
...@@ -597,7 +597,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -597,7 +597,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
if (mddev->raid_disks == 0) { if (mddev->raid_disks == 0) {
mddev->major_version = sb->major_version; mddev->major_version = 0;
mddev->minor_version = sb->minor_version; mddev->minor_version = sb->minor_version;
mddev->patch_version = sb->patch_version; mddev->patch_version = sb->patch_version;
mddev->persistent = ! sb->not_persistent; mddev->persistent = ! sb->not_persistent;
...@@ -634,7 +634,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -634,7 +634,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
return -EINVAL; return -EINVAL;
} }
if (mddev->level != LEVEL_MULTIPATH) { if (mddev->level != LEVEL_MULTIPATH) {
rdev->desc_nr = sb->this_disk.number;
rdev->raid_disk = -1; rdev->raid_disk = -1;
rdev->in_sync = rdev->faulty = 0; rdev->in_sync = rdev->faulty = 0;
desc = sb->disks + rdev->desc_nr; desc = sb->disks + rdev->desc_nr;
...@@ -704,10 +703,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -704,10 +703,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->recovery_cp = mddev->recovery_cp; sb->recovery_cp = mddev->recovery_cp;
sb->cp_events_hi = (mddev->events>>32); sb->cp_events_hi = (mddev->events>>32);
sb->cp_events_lo = (u32)mddev->events; sb->cp_events_lo = (u32)mddev->events;
if (mddev->recovery_cp == MaxSector) { if (mddev->recovery_cp == MaxSector)
printk(KERN_INFO "md: marking sb clean...\n");
sb->state = (1<< MD_SB_CLEAN); sb->state = (1<< MD_SB_CLEAN);
}
} else } else
sb->recovery_cp = 0; sb->recovery_cp = 0;
...@@ -717,7 +714,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -717,7 +714,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->disks[0].state = (1<<MD_DISK_REMOVED); sb->disks[0].state = (1<<MD_DISK_REMOVED);
ITERATE_RDEV(mddev,rdev2,tmp) { ITERATE_RDEV(mddev,rdev2,tmp) {
mdp_disk_t *d; mdp_disk_t *d;
if (rdev2->raid_disk >= 0) if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
rdev2->desc_nr = rdev2->raid_disk; rdev2->desc_nr = rdev2->raid_disk;
else else
rdev2->desc_nr = next_spare++; rdev2->desc_nr = next_spare++;
...@@ -726,7 +723,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -726,7 +723,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
d->number = rdev2->desc_nr; d->number = rdev2->desc_nr;
d->major = MAJOR(rdev2->bdev->bd_dev); d->major = MAJOR(rdev2->bdev->bd_dev);
d->minor = MINOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev);
if (rdev2->raid_disk >= 0) if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty)
d->raid_disk = rdev2->raid_disk; d->raid_disk = rdev2->raid_disk;
else else
d->raid_disk = rdev2->desc_nr; /* compatibility */ d->raid_disk = rdev2->desc_nr; /* compatibility */
...@@ -766,6 +763,210 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -766,6 +763,210 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->sb_csum = calc_sb_csum(sb); sb->sb_csum = calc_sb_csum(sb);
} }
/*
* version 1 superblock
*/
/*
 * Compute the checksum of a version-1 superblock.
 *
 * The stored sb_csum field is zeroed while the checksum is taken so that
 * it does not feed into its own value, and is restored before returning.
 * The checksummed length is 256 bytes plus two bytes for every device
 * slot counted by sb->max_dev.
 *
 * sb->max_dev is converted with le32_to_cpu(), the same way
 * super_1_load() reads it; using the raw field would give a wrong
 * length on big-endian hosts.
 */
static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
	unsigned int disk_csum, csum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	csum = csum_partial((void *)sb, size, 0);
	sb->sb_csum = disk_csum;
	return csum;
}
/*
 * load_super for version-1 superblocks.
 *
 * Derives the superblock location from minor_version, reads it through
 * read_disk_sb(), sanity-checks it, and fills in rdev->sb_offset,
 * rdev->preferred_minor, rdev->data_offset and rdev->size.
 *
 * If refdev is non-NULL the superblock is also compared against the one
 * already loaded on refdev.
 *
 * Return:
 *   1        - refdev is NULL, or this device has a higher event count
 *              than refdev
 *   0        - superblock is compatible with refdev but not newer
 *   -EINVAL  - unknown minor_version, bad magic/version/checksum,
 *              mismatch with refdev, or device too small for data_size
 *   other    - any non-zero value returned by read_disk_sb()
 */
static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_offset;

	/*
	 * Calculate the position of the superblock.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
		sb_offset -= 8*2;
		/* round down to a 4K (8 sector) boundary: clear the low 3 bits */
		sb_offset &= ~(sector_t)(4*2-1);
		/* convert from sectors to K */
		sb_offset /= 2;
		break;
	case 1:
		sb_offset = 0;
		break;
	case 2:
		sb_offset = 4;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_offset = sb_offset;

	ret = read_disk_sb(rdev);
	if (ret) return ret;

	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
	    sb->feature_map != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk(BAD_CSUM, bdev_partition_name(rdev->bdev));
		return -EINVAL;
	}
	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);

	/*
	 * Record the freshness verdict in 'ret' rather than returning
	 * straight away, so that rdev->size is still computed (and the
	 * data_size check still applied) for the freshest device.
	 */
	if (!refdev)
		ret = 1;
	else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb =
			(struct mdp_superblock_1*)page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different superblock to %s\n",
				bdev_partition_name(rdev->bdev),
				bdev_partition_name(refdev->bdev));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}

	/* space available for data, in K */
	if (minor_version)
		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
	else
		rdev->size = rdev->sb_offset;
	if (rdev->size < le64_to_cpu(sb->data_size)/2)
		return -EINVAL;
	rdev->size = le64_to_cpu(sb->data_size)/2;
	if (le32_to_cpu(sb->chunksize))
		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
	return ret;
}
/*
 * validate_super for version-1 superblocks.
 *
 * When the array has not been described yet (mddev->raid_disks == 0) the
 * array-wide fields of mddev are filled in from this device's superblock.
 * Otherwise the device is rejected with -EINVAL if its event count, plus
 * one, is still below mddev->events.
 *
 * For every level except LEVEL_MULTIPATH the device's slot number is
 * taken from the superblock and its role entry is decoded into
 * rdev->in_sync, rdev->faulty and rdev->raid_disk.
 *
 * Returns 0 on success, -EINVAL if the device is too far out of date.
 */
static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
	int role;

	if (mddev->raid_disks != 0) {
		/* array already described: only check freshness */
		__u64 dev_events = le64_to_cpu(sb->events);

		dev_events++;
		if (dev_events < mddev->events)
			return -EINVAL;
	} else {
		/* first device seen: describe the array from its superblock */
		mddev->major_version = 1;
		mddev->minor_version = 0;
		mddev->patch_version = 0;
		mddev->persistent = 1;
		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->size = (u32)le64_to_cpu(sb->size);
		mddev->events = le64_to_cpu(sb->events);
		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);
		mddev->max_disks = (4096-256)/2;
	}

	if (mddev->level == LEVEL_MULTIPATH)
		return 0;

	rdev->desc_nr = le32_to_cpu(sb->dev_number);
	role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);

	if (role == 0xffff) {
		/* spare */
		rdev->in_sync = 0;
		rdev->faulty = 0;
		rdev->raid_disk = -1;
	} else if (role == 0xfffe) {
		/* faulty */
		rdev->in_sync = 0;
		rdev->faulty = 1;
		rdev->raid_disk = -1;
	} else {
		/* active member: the role value is its raid_disk slot */
		rdev->in_sync = 1;
		rdev->faulty = 0;
		rdev->raid_disk = role;
	}
	return 0;
}
/*
 * sync_super for version-1 superblocks.
 *
 * Rewrites the in-memory superblock held in rdev->sb_page so that it
 * reflects the current state of mddev and of every device bound to it:
 * clears the feature map and padding, refreshes utime/events/
 * resync_offset, rebuilds the dev_roles table, and finally recomputes
 * the checksum so the block passes the check in super_1_load() when it
 * is read back.
 */
static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	int max_dev, i;

	/* make rdev->sb match mddev and rdev data. */
	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	memset(sb->pad1, 0, sizeof(sb->pad1));
	memset(sb->pad2, 0, sizeof(sb->pad2));
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else
		sb->resync_offset = cpu_to_le64(0);

	/*
	 * max_dev is a slot count (super_1_load() bounds it by the number
	 * of role entries that fit, and the checksum covers max_dev
	 * entries), so it must be one more than the highest desc_nr.
	 */
	max_dev = 0;
	ITERATE_RDEV(mddev,rdev2,tmp)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	sb->max_dev = cpu_to_le32(max_dev);

	/* default every slot to 'faulty'; bound devices are filled in below */
	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(0xfffe);

	ITERATE_RDEV(mddev,rdev2,tmp) {
		i = rdev2->desc_nr;
		if (rdev2->faulty)
			sb->dev_roles[i] = cpu_to_le16(0xfffe);
		else if (rdev2->in_sync)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(0xffff);
	}

	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */

	/* contents changed, so the stored checksum must be refreshed */
	sb->sb_csum = calc_sb_1_csum(sb);
}
struct super_type super_types[] = { struct super_type super_types[] = {
[0] = { [0] = {
.name = "0.90.0", .name = "0.90.0",
...@@ -774,10 +975,15 @@ struct super_type super_types[] = { ...@@ -774,10 +975,15 @@ struct super_type super_types[] = {
.validate_super = super_90_validate, .validate_super = super_90_validate,
.sync_super = super_90_sync, .sync_super = super_90_sync,
}, },
[1] = {
.name = "md-1",
.owner = THIS_MODULE,
.load_super = super_1_load,
.validate_super = super_1_validate,
.sync_super = super_1_sync,
},
}; };
static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
{ {
struct list_head *tmp; struct list_head *tmp;
...@@ -804,13 +1010,13 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) ...@@ -804,13 +1010,13 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
static LIST_HEAD(pending_raid_disks); static LIST_HEAD(pending_raid_disks);
static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{ {
mdk_rdev_t *same_pdev; mdk_rdev_t *same_pdev;
if (rdev->mddev) { if (rdev->mddev) {
MD_BUG(); MD_BUG();
return; return -EINVAL;
} }
same_pdev = match_dev_unit(mddev, rdev); same_pdev = match_dev_unit(mddev, rdev);
if (same_pdev) if (same_pdev)
...@@ -820,9 +1026,25 @@ static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) ...@@ -820,9 +1026,25 @@ static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
mdidx(mddev), bdev_partition_name(rdev->bdev), mdidx(mddev), bdev_partition_name(rdev->bdev),
bdev_partition_name(same_pdev->bdev)); bdev_partition_name(same_pdev->bdev));
/* Verify rdev->desc_nr is unique.
* If it is -1, assign a free number, else
* check number is not in use
*/
if (rdev->desc_nr < 0) {
int choice = 0;
if (mddev->pers) choice = mddev->raid_disks;
while (find_rdev_nr(mddev, choice))
choice++;
rdev->desc_nr = choice;
} else {
if (find_rdev_nr(mddev, rdev->desc_nr))
return -EBUSY;
}
list_add(&rdev->same_set, &mddev->disks); list_add(&rdev->same_set, &mddev->disks);
rdev->mddev = mddev; rdev->mddev = mddev;
printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev)); printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev));
return 0;
} }
static void unbind_rdev_from_array(mdk_rdev_t * rdev) static void unbind_rdev_from_array(mdk_rdev_t * rdev)
...@@ -910,6 +1132,7 @@ static void export_array(mddev_t *mddev) ...@@ -910,6 +1132,7 @@ static void export_array(mddev_t *mddev)
if (!list_empty(&mddev->disks)) if (!list_empty(&mddev->disks))
MD_BUG(); MD_BUG();
mddev->raid_disks = 0; mddev->raid_disks = 0;
mddev->major_version = 0;
} }
#undef BAD_CSUM #undef BAD_CSUM
...@@ -994,8 +1217,6 @@ void md_print_devices(void) ...@@ -994,8 +1217,6 @@ void md_print_devices(void)
static int write_disk_sb(mdk_rdev_t * rdev) static int write_disk_sb(mdk_rdev_t * rdev)
{ {
sector_t sb_offset;
sector_t size;
if (!rdev->sb_loaded) { if (!rdev->sb_loaded) {
MD_BUG(); MD_BUG();
...@@ -1006,35 +1227,12 @@ static int write_disk_sb(mdk_rdev_t * rdev) ...@@ -1006,35 +1227,12 @@ static int write_disk_sb(mdk_rdev_t * rdev)
return 1; return 1;
} }
sb_offset = calc_dev_sboffset(rdev->bdev); dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdev_partition_name(rdev->bdev),
if (rdev->sb_offset != sb_offset) { (unsigned long long)rdev->sb_offset);
printk(KERN_INFO "%s's sb offset has changed from %llu to %llu, skipping\n",
bdev_partition_name(rdev->bdev),
(unsigned long long)rdev->sb_offset,
(unsigned long long)sb_offset);
goto skip;
}
/*
* If the disk went offline meanwhile and it's just a spare, then
* its size has changed to zero silently, and the MD code does
* not yet know that it's faulty.
*/
size = calc_dev_size(rdev->bdev, rdev->mddev);
if (size != rdev->size) {
printk(KERN_INFO "%s's size has changed from %llu to %llu since import, skipping\n",
bdev_partition_name(rdev->bdev),
(unsigned long long)rdev->size,
(unsigned long long)size);
goto skip;
}
printk(KERN_INFO "(write) %s's sb offset: %llu\n", bdev_partition_name(rdev->bdev), (unsigned long long)sb_offset);
if (!sync_page_io(rdev->bdev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
goto fail;
skip:
return 0; return 0;
fail:
printk("md: write_disk_sb failed for device %s\n", bdev_partition_name(rdev->bdev)); printk("md: write_disk_sb failed for device %s\n", bdev_partition_name(rdev->bdev));
return 1; return 1;
} }
...@@ -1045,7 +1243,8 @@ static void sync_sbs(mddev_t * mddev) ...@@ -1045,7 +1243,8 @@ static void sync_sbs(mddev_t * mddev)
struct list_head *tmp; struct list_head *tmp;
ITERATE_RDEV(mddev,rdev,tmp) { ITERATE_RDEV(mddev,rdev,tmp) {
super_90_sync(mddev, rdev); super_types[mddev->major_version].
sync_super(mddev, rdev);
rdev->sb_loaded = 1; rdev->sb_loaded = 1;
} }
} }
...@@ -1079,20 +1278,20 @@ static void md_update_sb(mddev_t * mddev) ...@@ -1079,20 +1278,20 @@ static void md_update_sb(mddev_t * mddev)
if (!mddev->persistent) if (!mddev->persistent)
return; return;
printk(KERN_INFO "md: updating md%d RAID superblock on device (in sync %d)\n", dprintk(KERN_INFO "md: updating md%d RAID superblock on device (in sync %d)\n",
mdidx(mddev),mddev->in_sync); mdidx(mddev),mddev->in_sync);
err = 0; err = 0;
ITERATE_RDEV(mddev,rdev,tmp) { ITERATE_RDEV(mddev,rdev,tmp) {
printk(KERN_INFO "md: "); dprintk(KERN_INFO "md: ");
if (rdev->faulty) if (rdev->faulty)
printk("(skipping faulty "); dprintk("(skipping faulty ");
printk("%s ", bdev_partition_name(rdev->bdev)); dprintk("%s ", bdev_partition_name(rdev->bdev));
if (!rdev->faulty) { if (!rdev->faulty) {
err += write_disk_sb(rdev); err += write_disk_sb(rdev);
} else } else
printk(")\n"); dprintk(")\n");
if (!err && mddev->level == LEVEL_MULTIPATH) if (!err && mddev->level == LEVEL_MULTIPATH)
/* only need to write one superblock... */ /* only need to write one superblock... */
break; break;
...@@ -1107,7 +1306,7 @@ static void md_update_sb(mddev_t * mddev) ...@@ -1107,7 +1306,7 @@ static void md_update_sb(mddev_t * mddev)
} }
/* /*
* Import a device. If 'on_disk', then sanity check the superblock * Import a device. If 'super_format' >= 0, then sanity check the superblock
* *
* mark the device faulty if: * mark the device faulty if:
* *
...@@ -1116,7 +1315,7 @@ static void md_update_sb(mddev_t * mddev) ...@@ -1116,7 +1315,7 @@ static void md_update_sb(mddev_t * mddev)
* *
* a faulty rdev _never_ has rdev->sb set. * a faulty rdev _never_ has rdev->sb set.
*/ */
static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk) static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
{ {
int err; int err;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
...@@ -1141,6 +1340,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk) ...@@ -1141,6 +1340,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk)
rdev->desc_nr = -1; rdev->desc_nr = -1;
rdev->faulty = 0; rdev->faulty = 0;
rdev->in_sync = 0; rdev->in_sync = 0;
rdev->data_offset = 0;
atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->nr_pending, 0);
size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
...@@ -1152,8 +1352,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk) ...@@ -1152,8 +1352,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk)
goto abort_free; goto abort_free;
} }
if (on_disk) { if (super_format >= 0) {
err = super_90_load(rdev, NULL); err = super_types[super_format].
load_super(rdev, NULL, super_minor);
if (err == -EINVAL) { if (err == -EINVAL) {
printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
bdev_partition_name(rdev->bdev)); bdev_partition_name(rdev->bdev));
...@@ -1206,7 +1407,8 @@ static int analyze_sbs(mddev_t * mddev) ...@@ -1206,7 +1407,8 @@ static int analyze_sbs(mddev_t * mddev)
freshest = NULL; freshest = NULL;
ITERATE_RDEV(mddev,rdev,tmp) ITERATE_RDEV(mddev,rdev,tmp)
switch (super_90_load(rdev, freshest)) { switch (super_types[mddev->major_version].
load_super(rdev, freshest, mddev->minor_version)) {
case 1: case 1:
freshest = rdev; freshest = rdev;
break; break;
...@@ -1218,12 +1420,14 @@ static int analyze_sbs(mddev_t * mddev) ...@@ -1218,12 +1420,14 @@ static int analyze_sbs(mddev_t * mddev)
} }
super_90_validate(mddev, freshest); super_types[mddev->major_version].
validate_super(mddev, freshest);
i = 0; i = 0;
ITERATE_RDEV(mddev,rdev,tmp) { ITERATE_RDEV(mddev,rdev,tmp) {
if (rdev != freshest) if (rdev != freshest)
if (super_90_validate(mddev, rdev)) { if (super_types[mddev->major_version].
validate_super(mddev, rdev)) {
printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
bdev_partition_name(rdev->bdev)); bdev_partition_name(rdev->bdev));
kick_rdev_from_array(rdev); kick_rdev_from_array(rdev);
...@@ -1278,11 +1482,6 @@ static int device_size_calculation(mddev_t * mddev) ...@@ -1278,11 +1482,6 @@ static int device_size_calculation(mddev_t * mddev)
ITERATE_RDEV(mddev,rdev,tmp) { ITERATE_RDEV(mddev,rdev,tmp) {
if (rdev->faulty) if (rdev->faulty)
continue; continue;
if (rdev->size) {
MD_BUG();
continue;
}
rdev->size = calc_dev_size(rdev->bdev, mddev);
if (rdev->size < mddev->chunk_size / 1024) { if (rdev->size < mddev->chunk_size / 1024) {
printk(KERN_WARNING printk(KERN_WARNING
"md: Dev %s smaller than chunk_size: %lluk < %dk\n", "md: Dev %s smaller than chunk_size: %lluk < %dk\n",
...@@ -1380,6 +1579,16 @@ static struct gendisk *md_probe(dev_t dev, int *part, void *data) ...@@ -1380,6 +1579,16 @@ static struct gendisk *md_probe(dev_t dev, int *part, void *data)
return NULL; return NULL;
} }
void md_wakeup_thread(mdk_thread_t *thread);
static void md_safemode_timeout(unsigned long data)
{
mddev_t *mddev = (mddev_t *) data;
mddev->safemode = 1;
md_wakeup_thread(mddev->thread);
}
#define TOO_BIG_CHUNKSIZE KERN_ERR \ #define TOO_BIG_CHUNKSIZE KERN_ERR \
"too big chunk_size: %d > %d\n" "too big chunk_size: %d > %d\n"
...@@ -1521,13 +1730,14 @@ static int do_md_run(mddev_t * mddev) ...@@ -1521,13 +1730,14 @@ static int do_md_run(mddev_t * mddev)
} }
atomic_set(&mddev->writes_pending,0); atomic_set(&mddev->writes_pending,0);
mddev->safemode = 0; mddev->safemode = 0;
if (mddev->pers->sync_request) mddev->safemode_timer.function = md_safemode_timeout;
mddev->in_sync = 0; mddev->safemode_timer.data = (unsigned long) mddev;
else mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
mddev->in_sync = 1; mddev->in_sync = 1;
md_update_sb(mddev); md_update_sb(mddev);
md_recover_arrays(); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
set_capacity(disk, md_size[mdidx(mddev)]<<1); set_capacity(disk, md_size[mdidx(mddev)]<<1);
return (0); return (0);
} }
...@@ -1553,7 +1763,6 @@ static int restart_array(mddev_t *mddev) ...@@ -1553,7 +1763,6 @@ static int restart_array(mddev_t *mddev)
goto out; goto out;
mddev->safemode = 0; mddev->safemode = 0;
mddev->in_sync = 0;
md_update_sb(mddev); md_update_sb(mddev);
mddev->ro = 0; mddev->ro = 0;
set_disk_ro(disk, 0); set_disk_ro(disk, 0);
...@@ -1563,7 +1772,8 @@ static int restart_array(mddev_t *mddev) ...@@ -1563,7 +1772,8 @@ static int restart_array(mddev_t *mddev)
/* /*
* Kick recovery or resync if necessary * Kick recovery or resync if necessary
*/ */
md_recover_arrays(); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
err = 0; err = 0;
} else { } else {
printk(KERN_ERR "md: md%d has no personality assigned.\n", printk(KERN_ERR "md: md%d has no personality assigned.\n",
...@@ -1593,12 +1803,13 @@ static int do_md_stop(mddev_t * mddev, int ro) ...@@ -1593,12 +1803,13 @@ static int do_md_stop(mddev_t * mddev, int ro)
if (mddev->pers) { if (mddev->pers) {
if (mddev->sync_thread) { if (mddev->sync_thread) {
if (mddev->recovery_running > 0) set_bit(MD_RECOVERY_INTR, &mddev->recovery);
mddev->recovery_running = -1;
md_unregister_thread(mddev->sync_thread); md_unregister_thread(mddev->sync_thread);
mddev->sync_thread = NULL; mddev->sync_thread = NULL;
} }
del_timer_sync(&mddev->safemode_timer);
invalidate_device(mk_kdev(disk->major, disk->first_minor), 1); invalidate_device(mk_kdev(disk->major, disk->first_minor), 1);
if (ro) { if (ro) {
...@@ -1699,7 +1910,7 @@ static void autorun_devices(void) ...@@ -1699,7 +1910,7 @@ static void autorun_devices(void)
printk(KERN_INFO "md: considering %s ...\n", bdev_partition_name(rdev0->bdev)); printk(KERN_INFO "md: considering %s ...\n", bdev_partition_name(rdev0->bdev));
INIT_LIST_HEAD(&candidates); INIT_LIST_HEAD(&candidates);
ITERATE_RDEV_PENDING(rdev,tmp) ITERATE_RDEV_PENDING(rdev,tmp)
if (super_90_load(rdev, rdev0) >= 0) { if (super_90_load(rdev, rdev0, 0) >= 0) {
printk(KERN_INFO "md: adding %s ...\n", bdev_partition_name(rdev->bdev)); printk(KERN_INFO "md: adding %s ...\n", bdev_partition_name(rdev->bdev));
list_move(&rdev->same_set, &candidates); list_move(&rdev->same_set, &candidates);
} }
...@@ -1717,7 +1928,8 @@ static void autorun_devices(void) ...@@ -1717,7 +1928,8 @@ static void autorun_devices(void)
if (mddev_lock(mddev)) if (mddev_lock(mddev))
printk(KERN_WARNING "md: md%d locked, cannot run\n", printk(KERN_WARNING "md: md%d locked, cannot run\n",
mdidx(mddev)); mdidx(mddev));
else if (mddev->raid_disks || !list_empty(&mddev->disks)) { else if (mddev->raid_disks || mddev->major_version
|| !list_empty(&mddev->disks)) {
printk(KERN_WARNING "md: md%d already running, cannot run %s\n", printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
mdidx(mddev), bdev_partition_name(rdev0->bdev)); mdidx(mddev), bdev_partition_name(rdev0->bdev));
mddev_unlock(mddev); mddev_unlock(mddev);
...@@ -1725,7 +1937,8 @@ static void autorun_devices(void) ...@@ -1725,7 +1937,8 @@ static void autorun_devices(void)
printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
list_del_init(&rdev->same_set); list_del_init(&rdev->same_set);
bind_rdev_to_array(rdev, mddev); if (bind_rdev_to_array(rdev, mddev))
export_rdev(rdev);
} }
autorun_array(mddev); autorun_array(mddev);
mddev_unlock(mddev); mddev_unlock(mddev);
...@@ -1778,7 +1991,7 @@ static int autostart_array(dev_t startdev) ...@@ -1778,7 +1991,7 @@ static int autostart_array(dev_t startdev)
mdp_super_t *sb = NULL; mdp_super_t *sb = NULL;
mdk_rdev_t *start_rdev = NULL, *rdev; mdk_rdev_t *start_rdev = NULL, *rdev;
start_rdev = md_import_device(startdev, 1); start_rdev = md_import_device(startdev, 0, 0);
if (IS_ERR(start_rdev)) { if (IS_ERR(start_rdev)) {
printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev)); printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
return err; return err;
...@@ -1812,7 +2025,7 @@ static int autostart_array(dev_t startdev) ...@@ -1812,7 +2025,7 @@ static int autostart_array(dev_t startdev)
continue; continue;
if (dev == startdev) if (dev == startdev)
continue; continue;
rdev = md_import_device(dev, 1); rdev = md_import_device(dev, 0, 0);
if (IS_ERR(rdev)) { if (IS_ERR(rdev)) {
printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n", printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
partition_name(dev)); partition_name(dev));
...@@ -1874,10 +2087,9 @@ static int get_array_info(mddev_t * mddev, void * arg) ...@@ -1874,10 +2087,9 @@ static int get_array_info(mddev_t * mddev, void * arg)
} }
} }
info.major_version = mddev->major_version;
info.major_version = mddev->major_version; info.major_version = mddev->major_version;
info.minor_version = mddev->minor_version; info.minor_version = mddev->minor_version;
info.patch_version = mddev->patch_version; info.patch_version = 1;
info.ctime = mddev->ctime; info.ctime = mddev->ctime;
info.level = mddev->level; info.level = mddev->level;
info.size = mddev->size; info.size = mddev->size;
...@@ -1888,7 +2100,7 @@ static int get_array_info(mddev_t * mddev, void * arg) ...@@ -1888,7 +2100,7 @@ static int get_array_info(mddev_t * mddev, void * arg)
info.utime = mddev->utime; info.utime = mddev->utime;
info.state = 0; info.state = 0;
if (mddev->recovery_cp == MaxSector) if (mddev->in_sync)
info.state = (1<<MD_SB_CLEAN); info.state = (1<<MD_SB_CLEAN);
info.active_disks = active; info.active_disks = active;
info.working_disks = working; info.working_disks = working;
...@@ -1943,13 +2155,13 @@ static int get_disk_info(mddev_t * mddev, void * arg) ...@@ -1943,13 +2155,13 @@ static int get_disk_info(mddev_t * mddev, void * arg)
static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
{ {
sector_t size;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
dev_t dev; dev_t dev;
dev = MKDEV(info->major,info->minor); dev = MKDEV(info->major,info->minor);
if (!mddev->raid_disks) { if (!mddev->raid_disks) {
int err;
/* expecting a device which has a superblock */ /* expecting a device which has a superblock */
rdev = md_import_device(dev, 1); rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
if (IS_ERR(rdev)) { if (IS_ERR(rdev)) {
printk(KERN_WARNING "md: md_import_device returned %ld\n", PTR_ERR(rdev)); printk(KERN_WARNING "md: md_import_device returned %ld\n", PTR_ERR(rdev));
return PTR_ERR(rdev); return PTR_ERR(rdev);
...@@ -1957,7 +2169,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -1957,7 +2169,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
if (!list_empty(&mddev->disks)) { if (!list_empty(&mddev->disks)) {
mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
mdk_rdev_t, same_set); mdk_rdev_t, same_set);
int err = super_90_load(rdev, rdev0); int err = super_types[mddev->major_version]
.load_super(rdev, rdev0, mddev->minor_version);
if (err < 0) { if (err < 0) {
printk(KERN_WARNING "md: %s has different UUID to %s\n", printk(KERN_WARNING "md: %s has different UUID to %s\n",
bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev)); bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev));
...@@ -1965,12 +2178,52 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -1965,12 +2178,52 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
return -EINVAL; return -EINVAL;
} }
} }
bind_rdev_to_array(rdev, mddev); err = bind_rdev_to_array(rdev, mddev);
return 0; if (err)
export_rdev(rdev);
return err;
}
/*
* add_new_disk can be used once the array is assembled
* to add "hot spares". They must already have a superblock
* written
*/
if (mddev->pers) {
int err;
if (!mddev->pers->hot_add_disk) {
printk(KERN_WARNING "md%d: personality does not support diskops!\n",
mdidx(mddev));
return -EINVAL;
}
rdev = md_import_device(dev, mddev->major_version,
mddev->minor_version);
if (IS_ERR(rdev)) {
printk(KERN_WARNING "md: md_import_device returned %ld\n", PTR_ERR(rdev));
return PTR_ERR(rdev);
}
rdev->in_sync = 0; /* just to be sure */
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
export_rdev(rdev);
if (mddev->thread)
md_wakeup_thread(mddev->thread);
return err;
}
/* otherwise, add_new_disk is only allowed
* for major_version==0 superblocks
*/
if (mddev->major_version != 0) {
printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
mdidx(mddev));
return -EINVAL;
} }
if (!(info->state & (1<<MD_DISK_FAULTY))) { if (!(info->state & (1<<MD_DISK_FAULTY))) {
rdev = md_import_device (dev, 0); int err;
rdev = md_import_device (dev, -1, 0);
if (IS_ERR(rdev)) { if (IS_ERR(rdev)) {
printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", PTR_ERR(rdev));
return PTR_ERR(rdev); return PTR_ERR(rdev);
...@@ -1987,16 +2240,21 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -1987,16 +2240,21 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
else else
rdev->in_sync = 0; rdev->in_sync = 0;
bind_rdev_to_array(rdev, mddev); err = bind_rdev_to_array(rdev, mddev);
if (err) {
export_rdev(rdev);
return err;
}
if (!mddev->persistent) if (!mddev->persistent) {
printk(KERN_INFO "md: nonpersistent superblock ...\n"); printk(KERN_INFO "md: nonpersistent superblock ...\n");
rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
size = calc_dev_size(rdev->bdev, mddev); } else
rdev->sb_offset = calc_dev_sboffset(rdev->bdev); rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
rdev->size = calc_dev_size(rdev, mddev->chunk_size);
if (!mddev->size || (mddev->size > size)) if (!mddev->size || (mddev->size > rdev->size))
mddev->size = size; mddev->size = rdev->size;
} }
return 0; return 0;
...@@ -2066,7 +2324,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) ...@@ -2066,7 +2324,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
static int hot_add_disk(mddev_t * mddev, dev_t dev) static int hot_add_disk(mddev_t * mddev, dev_t dev)
{ {
int i, err; int err;
unsigned int size; unsigned int size;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
...@@ -2076,19 +2334,26 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) ...@@ -2076,19 +2334,26 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n", printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
partition_name(dev), mdidx(mddev)); partition_name(dev), mdidx(mddev));
if (mddev->major_version != 0) {
printk(KERN_WARNING "md%d: HOT_ADD may only be used with version-0 superblocks.\n",
mdidx(mddev));
return -EINVAL;
}
if (!mddev->pers->hot_add_disk) { if (!mddev->pers->hot_add_disk) {
printk(KERN_WARNING "md%d: personality does not support diskops!\n", printk(KERN_WARNING "md%d: personality does not support diskops!\n",
mdidx(mddev)); mdidx(mddev));
return -EINVAL; return -EINVAL;
} }
rdev = md_import_device (dev, 0); rdev = md_import_device (dev, -1, 0);
if (IS_ERR(rdev)) { if (IS_ERR(rdev)) {
printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", PTR_ERR(rdev));
return -EINVAL; return -EINVAL;
} }
size = calc_dev_size(rdev->bdev, mddev); rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
size = calc_dev_size(rdev, mddev->chunk_size);
rdev->size = size;
if (size < mddev->size) { if (size < mddev->size) {
printk(KERN_WARNING "md%d: disk size %llu blocks < array size %llu\n", printk(KERN_WARNING "md%d: disk size %llu blocks < array size %llu\n",
...@@ -2105,27 +2370,21 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) ...@@ -2105,27 +2370,21 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
goto abort_export; goto abort_export;
} }
rdev->in_sync = 0; rdev->in_sync = 0;
rdev->desc_nr = -1;
bind_rdev_to_array(rdev, mddev); bind_rdev_to_array(rdev, mddev);
/* /*
* The rest should better be atomic, we can have disk failures * The rest should better be atomic, we can have disk failures
* noticed in interrupt contexts ... * noticed in interrupt contexts ...
*/ */
rdev->size = size;
rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
for (i = mddev->raid_disks; i < mddev->max_disks; i++)
if (find_rdev_nr(mddev,i)==NULL)
break;
if (i == mddev->max_disks) { if (rdev->desc_nr == mddev->max_disks) {
printk(KERN_WARNING "md%d: can not hot-add to full array!\n", printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
mdidx(mddev)); mdidx(mddev));
err = -EBUSY; err = -EBUSY;
goto abort_unbind_export; goto abort_unbind_export;
} }
rdev->desc_nr = i;
rdev->raid_disk = -1; rdev->raid_disk = -1;
md_update_sb(mddev); md_update_sb(mddev);
...@@ -2134,7 +2393,8 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) ...@@ -2134,7 +2393,8 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
* Kick recovery, maybe this spare has to be added to the * Kick recovery, maybe this spare has to be added to the
* array immediately. * array immediately.
*/ */
md_recover_arrays(); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return 0; return 0;
...@@ -2146,9 +2406,37 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) ...@@ -2146,9 +2406,37 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
return err; return err;
} }
/*
* set_array_info is used in two different ways
* The original usage is when creating a new array.
* In this usage, raid_disks is > 0 and it together with
* level, size, not_persistent,layout,chunksize determine the
* shape of the array.
* This will always create an array with a type-0.90.0 superblock.
* The newer usage is when assembling an array.
* In this case raid_disks will be 0, and the major_version field is
* used to determine which style super-blocks are to be found on the devices.
* The minor and patch _version numbers are also kept in case the
* super_block handler wishes to interpret them.
*/
static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
{ {
if (info->raid_disks == 0) {
/* just setting version number for superblock loading */
if (info->major_version < 0 ||
info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
super_types[info->major_version].name == NULL) {
/* maybe try to auto-load a module? */
printk(KERN_INFO "md: superblock version %d not known\n",
info->major_version);
return -EINVAL;
}
mddev->major_version = info->major_version;
mddev->minor_version = info->minor_version;
mddev->patch_version = info->patch_version;
return 0;
}
mddev->major_version = MD_MAJOR_VERSION; mddev->major_version = MD_MAJOR_VERSION;
mddev->minor_version = MD_MINOR_VERSION; mddev->minor_version = MD_MINOR_VERSION;
mddev->patch_version = MD_PATCHLEVEL_VERSION; mddev->patch_version = MD_PATCHLEVEL_VERSION;
...@@ -2169,6 +2457,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) ...@@ -2169,6 +2457,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
mddev->layout = info->layout; mddev->layout = info->layout;
mddev->chunk_size = info->chunk_size; mddev->chunk_size = info->chunk_size;
mddev->max_disks = MD_SB_DISKS;
/* /*
...@@ -2282,9 +2571,11 @@ static int md_ioctl(struct inode *inode, struct file *file, ...@@ -2282,9 +2571,11 @@ static int md_ioctl(struct inode *inode, struct file *file,
err = -EBUSY; err = -EBUSY;
goto abort_unlock; goto abort_unlock;
} }
if (arg) { {
mdu_array_info_t info; mdu_array_info_t info;
if (copy_from_user(&info, (void*)arg, sizeof(info))) { if (!arg)
memset(&info, 0, sizeof(info));
else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
err = -EFAULT; err = -EFAULT;
goto abort_unlock; goto abort_unlock;
} }
...@@ -2473,12 +2764,6 @@ static struct block_device_operations md_fops = ...@@ -2473,12 +2764,6 @@ static struct block_device_operations md_fops =
.ioctl = md_ioctl, .ioctl = md_ioctl,
}; };
static inline void flush_curr_signals(void)
{
flush_signals(current);
}
int md_thread(void * arg) int md_thread(void * arg)
{ {
mdk_thread_t *thread = arg; mdk_thread_t *thread = arg;
...@@ -2489,7 +2774,7 @@ int md_thread(void * arg) ...@@ -2489,7 +2774,7 @@ int md_thread(void * arg)
* Detach thread * Detach thread
*/ */
daemonize(thread->name); daemonize(thread->name, mdidx(thread->mddev));
current->exit_signal = SIGCHLD; current->exit_signal = SIGCHLD;
allow_signal(SIGKILL); allow_signal(SIGKILL);
...@@ -2510,7 +2795,7 @@ int md_thread(void * arg) ...@@ -2510,7 +2795,7 @@ int md_thread(void * arg)
complete(thread->event); complete(thread->event);
while (thread->run) { while (thread->run) {
void (*run)(void *data); void (*run)(mddev_t *);
wait_event_interruptible(thread->wqueue, wait_event_interruptible(thread->wqueue,
test_bit(THREAD_WAKEUP, &thread->flags)); test_bit(THREAD_WAKEUP, &thread->flags));
...@@ -2521,11 +2806,11 @@ int md_thread(void * arg) ...@@ -2521,11 +2806,11 @@ int md_thread(void * arg)
run = thread->run; run = thread->run;
if (run) { if (run) {
run(thread->data); run(thread->mddev);
blk_run_queues(); blk_run_queues();
} }
if (signal_pending(current)) if (signal_pending(current))
flush_curr_signals(); flush_signals(current);
} }
complete(thread->event); complete(thread->event);
return 0; return 0;
...@@ -2538,8 +2823,8 @@ void md_wakeup_thread(mdk_thread_t *thread) ...@@ -2538,8 +2823,8 @@ void md_wakeup_thread(mdk_thread_t *thread)
wake_up(&thread->wqueue); wake_up(&thread->wqueue);
} }
mdk_thread_t *md_register_thread(void (*run) (void *), mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
void *data, const char *name) const char *name)
{ {
mdk_thread_t *thread; mdk_thread_t *thread;
int ret; int ret;
...@@ -2556,7 +2841,7 @@ mdk_thread_t *md_register_thread(void (*run) (void *), ...@@ -2556,7 +2841,7 @@ mdk_thread_t *md_register_thread(void (*run) (void *),
init_completion(&event); init_completion(&event);
thread->event = &event; thread->event = &event;
thread->run = run; thread->run = run;
thread->data = data; thread->mddev = mddev;
thread->name = name; thread->name = name;
ret = kernel_thread(md_thread, thread, 0); ret = kernel_thread(md_thread, thread, 0);
if (ret < 0) { if (ret < 0) {
...@@ -2591,16 +2876,6 @@ void md_unregister_thread(mdk_thread_t *thread) ...@@ -2591,16 +2876,6 @@ void md_unregister_thread(mdk_thread_t *thread)
kfree(thread); kfree(thread);
} }
static void md_recover_arrays(void)
{
if (!md_recovery_thread) {
MD_BUG();
return;
}
md_wakeup_thread(md_recovery_thread);
}
void md_error(mddev_t *mddev, mdk_rdev_t *rdev) void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
{ {
dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
...@@ -2618,33 +2893,34 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -2618,33 +2893,34 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
if (!mddev->pers->error_handler) if (!mddev->pers->error_handler)
return; return;
mddev->pers->error_handler(mddev,rdev); mddev->pers->error_handler(mddev,rdev);
md_recover_arrays(); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
} }
static int status_unused(char * page) /* seq_file implementation /proc/mdstat */
static void status_unused(struct seq_file *seq)
{ {
int sz = 0, i = 0; int i = 0;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
struct list_head *tmp; struct list_head *tmp;
sz += sprintf(page + sz, "unused devices: "); seq_printf(seq, "unused devices: ");
ITERATE_RDEV_PENDING(rdev,tmp) { ITERATE_RDEV_PENDING(rdev,tmp) {
i++; i++;
sz += sprintf(page + sz, "%s ", seq_printf(seq, "%s ",
bdev_partition_name(rdev->bdev)); bdev_partition_name(rdev->bdev));
} }
if (!i) if (!i)
sz += sprintf(page + sz, "<none>"); seq_printf(seq, "<none>");
sz += sprintf(page + sz, "\n"); seq_printf(seq, "\n");
return sz;
} }
static int status_resync(char * page, mddev_t * mddev) static void status_resync(struct seq_file *seq, mddev_t * mddev)
{ {
int sz = 0;
unsigned long max_blocks, resync, res, dt, db, rt; unsigned long max_blocks, resync, res, dt, db, rt;
resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
...@@ -2655,21 +2931,22 @@ static int status_resync(char * page, mddev_t * mddev) ...@@ -2655,21 +2931,22 @@ static int status_resync(char * page, mddev_t * mddev)
*/ */
if (!max_blocks) { if (!max_blocks) {
MD_BUG(); MD_BUG();
return 0; return;
} }
res = (resync/1024)*1000/(max_blocks/1024 + 1); res = (resync/1024)*1000/(max_blocks/1024 + 1);
{ {
int i, x = res/50, y = 20-x; int i, x = res/50, y = 20-x;
sz += sprintf(page + sz, "["); seq_printf(seq, "[");
for (i = 0; i < x; i++) for (i = 0; i < x; i++)
sz += sprintf(page + sz, "="); seq_printf(seq, "=");
sz += sprintf(page + sz, ">"); seq_printf(seq, ">");
for (i = 0; i < y; i++) for (i = 0; i < y; i++)
sz += sprintf(page + sz, "."); seq_printf(seq, ".");
sz += sprintf(page + sz, "] "); seq_printf(seq, "] ");
} }
sz += sprintf(page + sz, " %s =%3lu.%lu%% (%lu/%lu)", seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
(mddev->spares ? "recovery" : "resync"), (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
"resync" : "recovery"),
res/10, res % 10, resync, max_blocks); res/10, res % 10, resync, max_blocks);
/* /*
...@@ -2686,44 +2963,110 @@ static int status_resync(char * page, mddev_t * mddev) ...@@ -2686,44 +2963,110 @@ static int status_resync(char * page, mddev_t * mddev)
db = resync - (mddev->resync_mark_cnt/2); db = resync - (mddev->resync_mark_cnt/2);
rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
sz += sprintf(page + sz, " speed=%ldK/sec", db/dt); seq_printf(seq, " speed=%ldK/sec", db/dt);
}
/*
 * seq_file ->start callback.
 *
 * Maps the position *pos onto an iteration cookie:
 *   - position 0 yields (void*)1, the header record;
 *   - position k (k >= 1) yields the k-th entry of all_mddevs, with a
 *     reference taken via mddev_get();
 *   - a position past the end of the list yields (void*)2, the tail
 *     record;
 *   - any position above 0x10000 ends the iteration (NULL).
 */
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	loff_t remaining = *pos;
	struct list_head *node;
	void *cookie = (void *)2;	/* tail, unless an array is found */

	if (remaining > 0x10000)
		return NULL;

	if (remaining == 0)
		return (void *)1;	/* header */
	remaining--;

	spin_lock(&all_mddevs_lock);
	list_for_each(node, &all_mddevs) {
		if (remaining-- == 0) {
			mddev_t *mddev = list_entry(node, mddev_t, all_mddevs);

			/* pin the array before the list lock is dropped */
			mddev_get(mddev);
			cookie = mddev;
			break;
		}
	}
	spin_unlock(&all_mddevs_lock);

	return cookie;
}
static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct list_head *tmp;
mddev_t *next_mddev, *mddev = v;
++*pos;
if (v == (void*)2)
return NULL;
spin_lock(&all_mddevs_lock);
if (v == (void*)1)
tmp = all_mddevs.next;
else
tmp = mddev->all_mddevs.next;
if (tmp != &all_mddevs)
next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
else {
next_mddev = (void*)2;
*pos = 0x10000;
}
spin_unlock(&all_mddevs_lock);
if (v != (void*)1)
mddev_put(mddev);
return next_mddev;
return sz;
} }
static int md_status_read_proc(char *page, char **start, off_t off, static void md_seq_stop(struct seq_file *seq, void *v)
int count, int *eof, void *data)
{ {
int sz = 0, j; mddev_t *mddev = v;
if (mddev && v != (void*)1 && v != (void*)2)
mddev_put(mddev);
}
static int md_seq_show(struct seq_file *seq, void *v)
{
mddev_t *mddev = v;
sector_t size; sector_t size;
struct list_head *tmp, *tmp2; struct list_head *tmp2;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
mddev_t *mddev; int i;
sz += sprintf(page + sz, "Personalities : "); if (v == (void*)1) {
for (j = 0; j < MAX_PERSONALITY; j++) seq_printf(seq, "Personalities : ");
if (pers[j]) for (i = 0; i < MAX_PERSONALITY; i++)
sz += sprintf(page+sz, "[%s] ", pers[j]->name); if (pers[i])
seq_printf(seq, "[%s] ", pers[i]->name);
sz += sprintf(page+sz, "\n"); seq_printf(seq, "\n");
return 0;
}
if (v == (void*)2) {
status_unused(seq);
return 0;
}
ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) { if (mddev_lock(mddev)!=0)
sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), return -EINTR;
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "md%d : %sactive", mdidx(mddev),
mddev->pers ? "" : "in"); mddev->pers ? "" : "in");
if (mddev->pers) { if (mddev->pers) {
if (mddev->ro) if (mddev->ro)
sz += sprintf(page + sz, " (read-only)"); seq_printf(seq, " (read-only)");
sz += sprintf(page + sz, " %s", mddev->pers->name); seq_printf(seq, " %s", mddev->pers->name);
} }
size = 0; size = 0;
ITERATE_RDEV(mddev,rdev,tmp2) { ITERATE_RDEV(mddev,rdev,tmp2) {
sz += sprintf(page + sz, " %s[%d]", seq_printf(seq, " %s[%d]",
bdev_partition_name(rdev->bdev), rdev->desc_nr); bdev_partition_name(rdev->bdev), rdev->desc_nr);
if (rdev->faulty) { if (rdev->faulty) {
sz += sprintf(page + sz, "(F)"); seq_printf(seq, "(F)");
continue; continue;
} }
size += rdev->size; size += rdev->size;
...@@ -2731,34 +3074,50 @@ static int md_status_read_proc(char *page, char **start, off_t off, ...@@ -2731,34 +3074,50 @@ static int md_status_read_proc(char *page, char **start, off_t off,
if (!list_empty(&mddev->disks)) { if (!list_empty(&mddev->disks)) {
if (mddev->pers) if (mddev->pers)
sz += sprintf(page + sz, "\n %llu blocks", seq_printf(seq, "\n %llu blocks",
(unsigned long long)md_size[mdidx(mddev)]); (unsigned long long)md_size[mdidx(mddev)]);
else else
sz += sprintf(page + sz, "\n %llu blocks", (unsigned long long)size); seq_printf(seq, "\n %llu blocks", (unsigned long long)size);
} }
if (!mddev->pers) { if (mddev->pers) {
sz += sprintf(page+sz, "\n"); mddev->pers->status (seq, mddev);
mddev_unlock(mddev); seq_printf(seq, "\n ");
continue;
}
sz += mddev->pers->status (page+sz, mddev);
sz += sprintf(page+sz, "\n ");
if (mddev->curr_resync > 2) if (mddev->curr_resync > 2)
sz += status_resync (page+sz, mddev); status_resync (seq, mddev);
else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
sz += sprintf(page + sz, " resync=DELAYED"); seq_printf(seq, " resync=DELAYED");
}
sz += sprintf(page + sz, "\n"); seq_printf(seq, "\n");
mddev_unlock(mddev);
} }
sz += status_unused(page + sz); mddev_unlock(mddev);
return 0;
}
/*
 * seq_file iterator callbacks for the md status listing; md_seq_open()
 * hands this table to seq_open().
 */
static struct seq_operations md_seq_ops = {
.start = md_seq_start,
.next = md_seq_next,
.stop = md_seq_stop,
.show = md_seq_show,
};
static int md_seq_open(struct inode *inode, struct file *file)
{
int error;
return sz; error = seq_open(file, &md_seq_ops);
return error;
} }
/*
 * File operations for the seq_file based status file: open goes through
 * md_seq_open(), everything else uses the generic seq_file helpers.
 */
static struct file_operations md_seq_fops = {
.open = md_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
int register_md_personality(int pnum, mdk_personality_t *p) int register_md_personality(int pnum, mdk_personality_t *p)
{ {
if (pnum >= MAX_PERSONALITY) { if (pnum >= MAX_PERSONALITY) {
...@@ -2820,9 +3179,8 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) ...@@ -2820,9 +3179,8 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
atomic_sub(blocks, &mddev->recovery_active); atomic_sub(blocks, &mddev->recovery_active);
wake_up(&mddev->recovery_wait); wake_up(&mddev->recovery_wait);
if (!ok) { if (!ok) {
mddev->recovery_error = -EIO; set_bit(MD_RECOVERY_ERR, &mddev->recovery);
mddev->recovery_running = -1; md_wakeup_thread(mddev->thread);
md_recover_arrays();
// stop recovery, signal do_sync .... // stop recovery, signal do_sync ....
} }
} }
...@@ -2830,40 +3188,49 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) ...@@ -2830,40 +3188,49 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
void md_write_start(mddev_t *mddev) void md_write_start(mddev_t *mddev)
{ {
if (mddev->safemode && !atomic_read(&mddev->writes_pending)) { if (!atomic_read(&mddev->writes_pending)) {
mddev_lock_uninterruptible(mddev); mddev_lock_uninterruptible(mddev);
atomic_inc(&mddev->writes_pending);
if (mddev->in_sync) { if (mddev->in_sync) {
mddev->in_sync = 0; mddev->in_sync = 0;
del_timer(&mddev->safemode_timer);
md_update_sb(mddev); md_update_sb(mddev);
} }
atomic_inc(&mddev->writes_pending);
mddev_unlock(mddev); mddev_unlock(mddev);
} else } else
atomic_inc(&mddev->writes_pending); atomic_inc(&mddev->writes_pending);
} }
void md_write_end(mddev_t *mddev, mdk_thread_t *thread) void md_write_end(mddev_t *mddev)
{ {
if (atomic_dec_and_test(&mddev->writes_pending) && mddev->safemode) if (atomic_dec_and_test(&mddev->writes_pending)) {
md_wakeup_thread(thread); if (mddev->safemode == 2)
md_wakeup_thread(mddev->thread);
else
mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
}
} }
static inline void md_enter_safemode(mddev_t *mddev) static inline void md_enter_safemode(mddev_t *mddev)
{ {
mddev_lock_uninterruptible(mddev); mddev_lock_uninterruptible(mddev);
if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && !mddev->recovery_running) { if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
!mddev->in_sync && mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1; mddev->in_sync = 1;
md_update_sb(mddev); md_update_sb(mddev);
} }
mddev_unlock(mddev); mddev_unlock(mddev);
if (mddev->safemode == 1)
mddev->safemode = 0;
} }
void md_handle_safemode(mddev_t *mddev) void md_handle_safemode(mddev_t *mddev)
{ {
if (signal_pending(current)) { if (signal_pending(current)) {
printk(KERN_INFO "md: md%d in safe mode\n",mdidx(mddev)); printk(KERN_INFO "md: md%d in immediate safe mode\n",mdidx(mddev));
mddev->safemode= 1; mddev->safemode = 2;
flush_curr_signals(); flush_signals(current);
} }
if (mddev->safemode) if (mddev->safemode)
md_enter_safemode(mddev); md_enter_safemode(mddev);
...@@ -2874,9 +3241,8 @@ DECLARE_WAIT_QUEUE_HEAD(resync_wait); ...@@ -2874,9 +3241,8 @@ DECLARE_WAIT_QUEUE_HEAD(resync_wait);
#define SYNC_MARKS 10 #define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ) #define SYNC_MARK_STEP (3*HZ)
static void md_do_sync(void *data) static void md_do_sync(mddev_t *mddev)
{ {
mddev_t *mddev = data;
mddev_t *mddev2; mddev_t *mddev2;
unsigned int max_sectors, currspeed = 0, unsigned int max_sectors, currspeed = 0,
j, window, err; j, window, err;
...@@ -2887,7 +3253,7 @@ static void md_do_sync(void *data) ...@@ -2887,7 +3253,7 @@ static void md_do_sync(void *data)
unsigned long last_check; unsigned long last_check;
/* just incase thread restarts... */ /* just incase thread restarts... */
if (mddev->recovery_running <= 0) if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
return; return;
/* we overload curr_resync somewhat here. /* we overload curr_resync somewhat here.
...@@ -2914,15 +3280,17 @@ static void md_do_sync(void *data) ...@@ -2914,15 +3280,17 @@ static void md_do_sync(void *data)
} }
if (wait_event_interruptible(resync_wait, if (wait_event_interruptible(resync_wait,
mddev2->curr_resync < mddev->curr_resync)) { mddev2->curr_resync < mddev->curr_resync)) {
flush_curr_signals(); flush_signals(current);
err = -EINTR; err = -EINTR;
mddev_put(mddev2); mddev_put(mddev2);
goto skip; goto skip;
} }
} }
if (mddev->curr_resync == 1) if (mddev->curr_resync == 1) {
mddev_put(mddev2);
break; break;
} }
}
} while (mddev->curr_resync < 2); } while (mddev->curr_resync < 2);
max_sectors = mddev->size << 1; max_sectors = mddev->size << 1;
...@@ -2934,9 +3302,13 @@ static void md_do_sync(void *data) ...@@ -2934,9 +3302,13 @@ static void md_do_sync(void *data)
sysctl_speed_limit_max); sysctl_speed_limit_max);
is_mddev_idle(mddev); /* this also initializes IO event counters */ is_mddev_idle(mddev); /* this also initializes IO event counters */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
j = mddev->recovery_cp;
else
j = 0;
for (m = 0; m < SYNC_MARKS; m++) { for (m = 0; m < SYNC_MARKS; m++) {
mark[m] = jiffies; mark[m] = jiffies;
mark_cnt[m] = mddev->recovery_cp; mark_cnt[m] = j;
} }
last_mark = 0; last_mark = 0;
mddev->resync_mark = mark[last_mark]; mddev->resync_mark = mark[last_mark];
...@@ -2953,12 +3325,10 @@ static void md_do_sync(void *data) ...@@ -2953,12 +3325,10 @@ static void md_do_sync(void *data)
init_waitqueue_head(&mddev->recovery_wait); init_waitqueue_head(&mddev->recovery_wait);
last_check = 0; last_check = 0;
mddev->recovery_error = 0; if (j)
if (mddev->recovery_cp)
printk(KERN_INFO "md: resuming recovery of md%d from checkpoint.\n", mdidx(mddev)); printk(KERN_INFO "md: resuming recovery of md%d from checkpoint.\n", mdidx(mddev));
for (j = mddev->recovery_cp; j < max_sectors;) { while (j < max_sectors) {
int sectors; int sectors;
sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
...@@ -2975,6 +3345,10 @@ static void md_do_sync(void *data) ...@@ -2975,6 +3345,10 @@ static void md_do_sync(void *data)
last_check = j; last_check = j;
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
test_bit(MD_RECOVERY_ERR, &mddev->recovery))
break;
blk_run_queues(); blk_run_queues();
repeat: repeat:
...@@ -2995,7 +3369,7 @@ static void md_do_sync(void *data) ...@@ -2995,7 +3369,7 @@ static void md_do_sync(void *data)
* got a signal, exit. * got a signal, exit.
*/ */
printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n"); printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
flush_curr_signals(); flush_signals(current);
err = -EINTR; err = -EINTR;
goto out; goto out;
} }
...@@ -3029,39 +3403,42 @@ static void md_do_sync(void *data) ...@@ -3029,39 +3403,42 @@ static void md_do_sync(void *data)
out: out:
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
if (mddev->recovery_running < 0 &&
!mddev->recovery_error && mddev->curr_resync > 2)
{
/* interrupted but no write errors */
printk(KERN_INFO "md: checkpointing recovery of md%d.\n", mdidx(mddev));
mddev->recovery_cp = mddev->curr_resync;
}
/* tell personality that we are finished */ /* tell personality that we are finished */
mddev->pers->sync_request(mddev, max_sectors, 1); mddev->pers->sync_request(mddev, max_sectors, 1);
skip:
mddev->curr_resync = 0;
if (err) if (err)
mddev->recovery_running = -1; set_bit(MD_RECOVERY_ERR, &mddev->recovery);
if (mddev->recovery_running > 0) if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
mddev->recovery_running = 0; mddev->curr_resync > 2 &&
if (mddev->recovery_running == 0) mddev->curr_resync > mddev->recovery_cp) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
printk(KERN_INFO "md: checkpointing recovery of md%d.\n", mdidx(mddev));
mddev->recovery_cp = mddev->curr_resync;
} else
mddev->recovery_cp = MaxSector; mddev->recovery_cp = MaxSector;
}
if (mddev->safemode) if (mddev->safemode)
md_enter_safemode(mddev); md_enter_safemode(mddev);
md_recover_arrays(); skip:
mddev->curr_resync = 0;
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
} }
/* /*
* This is the kernel thread that watches all md arrays for re-sync and other * This routine is regularly called by all per-raid-array threads to
* action that might be needed. * deal with generic issues like resync and super-block update.
* Raid personalities that don't have a thread (linear/raid0) do not
* need this as they never do any recovery or update the superblock.
*
* It does not do any resync itself, but rather "forks" off other threads * It does not do any resync itself, but rather "forks" off other threads
* to do that as needed. * to do that as needed.
* When it is determined that resync is needed, we set "->recovery_running" and * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
* create a thread at ->sync_thread. * "->recovery" and create a thread at ->sync_thread.
* When the thread finishes it clears recovery_running (or sets an error) * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
* and wakes up this thread which will reap the thread and finish up. * and wakes up this thread which will reap the thread and finish up.
* This thread also removes any faulty devices (with nr_pending == 0). * This thread also removes any faulty devices (with nr_pending == 0).
* *
* The overall approach is: * The overall approach is:
...@@ -3072,41 +3449,47 @@ static void md_do_sync(void *data) ...@@ -3072,41 +3449,47 @@ static void md_do_sync(void *data)
* 5/ If array is degraded, try to add spare devices * 5/ If array is degraded, try to add spare devices
* 6/ If array has spares or is not in-sync, start a resync thread. * 6/ If array has spares or is not in-sync, start a resync thread.
*/ */
void md_do_recovery(void *data) void md_check_recovery(mddev_t *mddev)
{ {
mddev_t *mddev;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
struct list_head *tmp, *rtmp; struct list_head *rtmp;
dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) { if (mddev->ro)
if (!mddev->raid_disks || !mddev->pers || mddev->ro) return;
goto unlock; if ( ! (
mddev->sb_dirty ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery)
))
return;
if (mddev_trylock(mddev)==0) {
int spares =0;
if (mddev->sb_dirty) if (mddev->sb_dirty)
md_update_sb(mddev); md_update_sb(mddev);
if (mddev->recovery_running > 0) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* resync/recovery still happening */ /* resync/recovery still happening */
goto unlock; goto unlock;
if (mddev->sync_thread) { if (mddev->sync_thread) {
/* resync has finished, collect result */ /* resync has finished, collect result */
md_unregister_thread(mddev->sync_thread); md_unregister_thread(mddev->sync_thread);
mddev->sync_thread = NULL; mddev->sync_thread = NULL;
if (mddev->recovery_running == 0) { if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) {
/* success...*/ /* success...*/
/* activate any spares */ /* activate any spares */
mddev->pers->spare_active(mddev); mddev->pers->spare_active(mddev);
mddev->spares = 0;
} }
md_update_sb(mddev); md_update_sb(mddev);
mddev->recovery_running = 0; mddev->recovery = 0;
wake_up(&resync_wait); wake_up(&resync_wait);
goto unlock; goto unlock;
} }
if (mddev->recovery_running) { if (mddev->recovery) {
/* that's odd.. */ /* that's odd.. */
mddev->recovery_running = 0; mddev->recovery = 0;
wake_up(&resync_wait); wake_up(&resync_wait);
} }
...@@ -3114,7 +3497,6 @@ void md_do_recovery(void *data) ...@@ -3114,7 +3497,6 @@ void md_do_recovery(void *data)
* remove any failed drives, then * remove any failed drives, then
* add spares if possible * add spares if possible
*/ */
mddev->spares = 0;
ITERATE_RDEV(mddev,rdev,rtmp) { ITERATE_RDEV(mddev,rdev,rtmp) {
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
rdev->faulty && rdev->faulty &&
...@@ -3123,43 +3505,41 @@ void md_do_recovery(void *data) ...@@ -3123,43 +3505,41 @@ void md_do_recovery(void *data)
rdev->raid_disk = -1; rdev->raid_disk = -1;
} }
if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync) if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
mddev->spares++; spares++;
} }
if (mddev->degraded) { if (mddev->degraded) {
ITERATE_RDEV(mddev,rdev,rtmp) ITERATE_RDEV(mddev,rdev,rtmp)
if (rdev->raid_disk < 0 if (rdev->raid_disk < 0
&& !rdev->faulty) { && !rdev->faulty) {
if (mddev->pers->hot_add_disk(mddev,rdev)) { if (mddev->pers->hot_add_disk(mddev,rdev))
mddev->spares++; spares++;
mddev->recovery_cp = 0;
}
else else
break; break;
} }
} }
if (!mddev->spares && (mddev->recovery_cp == MaxSector )) { if (!spares && (mddev->recovery_cp == MaxSector )) {
/* nothing we can do ... */ /* nothing we can do ... */
goto unlock; goto unlock;
} }
if (mddev->pers->sync_request) { if (mddev->pers->sync_request) {
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
if (!spares)
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev->sync_thread = md_register_thread(md_do_sync,
mddev, mddev,
"md_resync"); "md%d_resync");
if (!mddev->sync_thread) { if (!mddev->sync_thread) {
printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev)); printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev));
/* leave the spares where they are, it shouldn't hurt */ /* leave the spares where they are, it shouldn't hurt */
mddev->recovery_running = 0; mddev->recovery = 0;
} else { } else {
mddev->recovery_running = 1;
md_wakeup_thread(mddev->sync_thread); md_wakeup_thread(mddev->sync_thread);
} }
} }
unlock: unlock:
mddev_unlock(mddev); mddev_unlock(mddev);
} }
dprintk(KERN_INFO "md: recovery thread finished ...\n");
} }
int md_notify_reboot(struct notifier_block *this, int md_notify_reboot(struct notifier_block *this,
...@@ -3194,6 +3574,7 @@ struct notifier_block md_notifier = { ...@@ -3194,6 +3574,7 @@ struct notifier_block md_notifier = {
static void md_geninit(void) static void md_geninit(void)
{ {
struct proc_dir_entry *p;
int i; int i;
for(i = 0; i < MAX_MD_DEVS; i++) { for(i = 0; i < MAX_MD_DEVS; i++) {
...@@ -3203,13 +3584,14 @@ static void md_geninit(void) ...@@ -3203,13 +3584,14 @@ static void md_geninit(void)
dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL); p = create_proc_entry("mdstat", S_IRUGO, NULL);
if (p)
p->proc_fops = &md_seq_fops;
#endif #endif
} }
int __init md_init(void) int __init md_init(void)
{ {
static char * name = "mdrecoveryd";
int minor; int minor;
printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n", printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
...@@ -3229,11 +3611,6 @@ int __init md_init(void) ...@@ -3229,11 +3611,6 @@ int __init md_init(void)
S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
} }
md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
if (!md_recovery_thread)
printk(KERN_ALERT
"md: bug: couldn't allocate md_recovery_thread\n");
register_reboot_notifier(&md_notifier); register_reboot_notifier(&md_notifier);
raid_table_header = register_sysctl_table(raid_root_table, 1); raid_table_header = register_sysctl_table(raid_root_table, 1);
...@@ -3268,7 +3645,7 @@ static void autostart_arrays(void) ...@@ -3268,7 +3645,7 @@ static void autostart_arrays(void)
for (i = 0; i < dev_cnt; i++) { for (i = 0; i < dev_cnt; i++) {
dev_t dev = detected_devices[i]; dev_t dev = detected_devices[i];
rdev = md_import_device(dev,1); rdev = md_import_device(dev,0, 0);
if (IS_ERR(rdev)) { if (IS_ERR(rdev)) {
printk(KERN_ALERT "md: could not import %s!\n", printk(KERN_ALERT "md: could not import %s!\n",
partition_name(dev)); partition_name(dev));
...@@ -3291,7 +3668,6 @@ static __exit void md_exit(void) ...@@ -3291,7 +3668,6 @@ static __exit void md_exit(void)
{ {
int i; int i;
blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
md_unregister_thread(md_recovery_thread);
for (i=0; i < MAX_MD_DEVS; i++) for (i=0; i < MAX_MD_DEVS; i++)
devfs_remove("md/%d", i); devfs_remove("md/%d", i);
devfs_remove("md"); devfs_remove("md");
...@@ -3331,4 +3707,5 @@ EXPORT_SYMBOL(md_unregister_thread); ...@@ -3331,4 +3707,5 @@ EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_print_devices); EXPORT_SYMBOL(md_print_devices);
EXPORT_SYMBOL(md_interrupt_thread); EXPORT_SYMBOL(md_interrupt_thread);
EXPORT_SYMBOL(md_check_recovery);
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
...@@ -86,7 +86,6 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) ...@@ -86,7 +86,6 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{ {
unsigned long flags; unsigned long flags;
mddev_t *mddev = mp_bh->mddev; mddev_t *mddev = mp_bh->mddev;
multipath_conf_t *conf = mddev_to_conf(mddev);
spin_lock_irqsave(&retry_list_lock, flags); spin_lock_irqsave(&retry_list_lock, flags);
if (multipath_retry_list == NULL) if (multipath_retry_list == NULL)
...@@ -95,7 +94,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) ...@@ -95,7 +94,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
multipath_retry_tail = &mp_bh->next_mp; multipath_retry_tail = &mp_bh->next_mp;
mp_bh->next_mp = NULL; mp_bh->next_mp = NULL;
spin_unlock_irqrestore(&retry_list_lock, flags); spin_unlock_irqrestore(&retry_list_lock, flags);
md_wakeup_thread(conf->thread); md_wakeup_thread(mddev->thread);
} }
...@@ -185,19 +184,18 @@ static int multipath_make_request (request_queue_t *q, struct bio * bio) ...@@ -185,19 +184,18 @@ static int multipath_make_request (request_queue_t *q, struct bio * bio)
return 0; return 0;
} }
/*
 * multipath_status - emit the per-array status fragment for multipath.
 *
 * Output is " [<raid_disks>/<working_disks>] [" followed by one character
 * per slot in conf->multipaths[] ("U" when the slot has an rdev that is
 * in_sync, "_" otherwise) and a closing "]".
 *
 * Old column: formats into 'page' with sprintf and returns the number of
 * bytes written. New column: writes to the seq_file and returns nothing.
 */
static int multipath_status (char *page, mddev_t *mddev) static void multipath_status (struct seq_file *seq, mddev_t *mddev)
{ {
multipath_conf_t *conf = mddev_to_conf(mddev); multipath_conf_t *conf = mddev_to_conf(mddev);
int sz = 0, i; int i;
sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, seq_printf (seq, " [%d/%d] [", conf->raid_disks,
conf->working_disks); conf->working_disks);
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->raid_disks; i++)
sz += sprintf (page+sz, "%s", seq_printf (seq, "%s",
conf->multipaths[i].rdev && conf->multipaths[i].rdev &&
conf->multipaths[i].rdev->in_sync ? "U" : "_"); conf->multipaths[i].rdev->in_sync ? "U" : "_");
sz += sprintf (page+sz, "]"); seq_printf (seq, "]");
return sz;
} }
#define LAST_DISK KERN_ALERT \ #define LAST_DISK KERN_ALERT \
...@@ -334,14 +332,14 @@ static int multipath_remove_disk(mddev_t *mddev, int number) ...@@ -334,14 +332,14 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
* 3. Performs writes following reads for array synchronising. * 3. Performs writes following reads for array synchronising.
*/ */
static void multipathd (void *data) static void multipathd (mddev_t *mddev)
{ {
struct multipath_bh *mp_bh; struct multipath_bh *mp_bh;
struct bio *bio; struct bio *bio;
unsigned long flags; unsigned long flags;
mddev_t *mddev;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
md_check_recovery(mddev);
for (;;) { for (;;) {
spin_lock_irqsave(&retry_list_lock, flags); spin_lock_irqsave(&retry_list_lock, flags);
mp_bh = multipath_retry_list; mp_bh = multipath_retry_list;
...@@ -471,10 +469,10 @@ static int multipath_run (mddev_t *mddev) ...@@ -471,10 +469,10 @@ static int multipath_run (mddev_t *mddev)
} }
{ {
const char * name = "multipathd"; const char * name = "md%d_multipath";
conf->thread = md_register_thread(multipathd, conf, name); mddev->thread = md_register_thread(multipathd, mddev, name);
if (!conf->thread) { if (!mddev->thread) {
printk(THREAD_ERROR, mdidx(mddev)); printk(THREAD_ERROR, mdidx(mddev));
goto out_free_conf; goto out_free_conf;
} }
...@@ -513,7 +511,7 @@ static int multipath_stop (mddev_t *mddev) ...@@ -513,7 +511,7 @@ static int multipath_stop (mddev_t *mddev)
{ {
multipath_conf_t *conf = mddev_to_conf(mddev); multipath_conf_t *conf = mddev_to_conf(mddev);
md_unregister_thread(conf->thread); md_unregister_thread(mddev->thread);
mempool_destroy(conf->pool); mempool_destroy(conf->pool);
kfree(conf); kfree(conf);
mddev->private = NULL; mddev->private = NULL;
......
...@@ -349,7 +349,7 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio) ...@@ -349,7 +349,7 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
* is the only IO operation happening on this bh. * is the only IO operation happening on this bh.
*/ */
bio->bi_bdev = tmp_dev->bdev; bio->bi_bdev = tmp_dev->bdev;
bio->bi_sector = rsect; bio->bi_sector = rsect + tmp_dev->data_offset;
/* /*
* Let the main block layer submit the IO and resolve recursion: * Let the main block layer submit the IO and resolve recursion:
...@@ -372,41 +372,40 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio) ...@@ -372,41 +372,40 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
return 0; return 0;
} }
/*
 * raid0_status - emit the per-array status fragment for raid0.
 *
 * MD_DEBUG is #undef'ed immediately before the #ifdef, so the zone/strip
 * dump below is always compiled out; the only output is
 * " <chunk_size/1024>k chunks".
 *
 * Old column: formats into 'page' and returns the byte count (the debug
 * path used "sz--" to drop the trailing '/' after the device list).
 * New column: writes to the seq_file and returns nothing; the "sz--" trim
 * has no counterpart in the new column.
 */
static int raid0_status (char *page, mddev_t *mddev) static void raid0_status (struct seq_file *seq, mddev_t *mddev)
{ {
int sz = 0;
#undef MD_DEBUG #undef MD_DEBUG
#ifdef MD_DEBUG #ifdef MD_DEBUG
int j, k; int j, k;
raid0_conf_t *conf = mddev_to_conf(mddev); raid0_conf_t *conf = mddev_to_conf(mddev);
sz += sprintf(page + sz, " "); seq_printf(seq, " ");
for (j = 0; j < conf->nr_zones; j++) { for (j = 0; j < conf->nr_zones; j++) {
sz += sprintf(page + sz, "[z%d", seq_printf(seq, "[z%d",
conf->hash_table[j].zone0 - conf->strip_zone); conf->hash_table[j].zone0 - conf->strip_zone);
if (conf->hash_table[j].zone1) if (conf->hash_table[j].zone1)
sz += sprintf(page+sz, "/z%d] ", seq_printf(seq, "/z%d] ",
conf->hash_table[j].zone1 - conf->strip_zone); conf->hash_table[j].zone1 - conf->strip_zone);
else else
sz += sprintf(page+sz, "] "); seq_printf(seq, "] ");
} }
sz += sprintf(page + sz, "\n"); seq_printf(seq, "\n");
for (j = 0; j < conf->nr_strip_zones; j++) { for (j = 0; j < conf->nr_strip_zones; j++) {
sz += sprintf(page + sz, " z%d=[", j); seq_printf(seq, " z%d=[", j);
for (k = 0; k < conf->strip_zone[j].nb_dev; k++) for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
sz += sprintf (page+sz, "%s/", bdev_partition_name( seq_printf (seq, "%s/", bdev_partition_name(
conf->strip_zone[j].dev[k]->bdev)); conf->strip_zone[j].dev[k]->bdev));
sz--;
sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n", seq_printf (seq, "] zo=%d do=%d s=%d\n",
conf->strip_zone[j].zone_offset, conf->strip_zone[j].zone_offset,
conf->strip_zone[j].dev_offset, conf->strip_zone[j].dev_offset,
conf->strip_zone[j].size); conf->strip_zone[j].size);
} }
#endif #endif
sz += sprintf(page + sz, " %dk chunks", mddev->chunk_size/1024); seq_printf(seq, " %dk chunks", mddev->chunk_size/1024);
return sz; return;
} }
static mdk_personality_t raid0_personality= static mdk_personality_t raid0_personality=
......
...@@ -225,13 +225,12 @@ static void reschedule_retry(r1bio_t *r1_bio) ...@@ -225,13 +225,12 @@ static void reschedule_retry(r1bio_t *r1_bio)
{ {
unsigned long flags; unsigned long flags;
mddev_t *mddev = r1_bio->mddev; mddev_t *mddev = r1_bio->mddev;
conf_t *conf = mddev_to_conf(mddev);
spin_lock_irqsave(&retry_list_lock, flags); spin_lock_irqsave(&retry_list_lock, flags);
list_add(&r1_bio->retry_list, &retry_list_head); list_add(&r1_bio->retry_list, &retry_list_head);
spin_unlock_irqrestore(&retry_list_lock, flags); spin_unlock_irqrestore(&retry_list_lock, flags);
md_wakeup_thread(conf->thread); md_wakeup_thread(mddev->thread);
} }
/* /*
...@@ -320,7 +319,7 @@ static int end_request(struct bio *bio, unsigned int bytes_done, int error) ...@@ -320,7 +319,7 @@ static int end_request(struct bio *bio, unsigned int bytes_done, int error)
* already. * already.
*/ */
if (atomic_dec_and_test(&r1_bio->remaining)) { if (atomic_dec_and_test(&r1_bio->remaining)) {
md_write_end(r1_bio->mddev,conf->thread); md_write_end(r1_bio->mddev);
raid_end_bio_io(r1_bio, uptodate); raid_end_bio_io(r1_bio, uptodate);
} }
} }
...@@ -494,7 +493,7 @@ static int make_request(request_queue_t *q, struct bio * bio) ...@@ -494,7 +493,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
BUG(); BUG();
r1_bio->read_bio = read_bio; r1_bio->read_bio = read_bio;
read_bio->bi_sector = r1_bio->sector; read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_bdev = mirror->rdev->bdev;
read_bio->bi_end_io = end_request; read_bio->bi_end_io = end_request;
read_bio->bi_rw = r1_bio->cmd; read_bio->bi_rw = r1_bio->cmd;
...@@ -529,7 +528,7 @@ static int make_request(request_queue_t *q, struct bio * bio) ...@@ -529,7 +528,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
mbio = bio_clone(bio, GFP_NOIO); mbio = bio_clone(bio, GFP_NOIO);
r1_bio->write_bios[i] = mbio; r1_bio->write_bios[i] = mbio;
mbio->bi_sector = r1_bio->sector; mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = end_request; mbio->bi_end_io = end_request;
mbio->bi_rw = r1_bio->cmd; mbio->bi_rw = r1_bio->cmd;
...@@ -542,7 +541,7 @@ static int make_request(request_queue_t *q, struct bio * bio) ...@@ -542,7 +541,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
* If all mirrors are non-operational * If all mirrors are non-operational
* then return an IO error: * then return an IO error:
*/ */
md_write_end(mddev,conf->thread); md_write_end(mddev);
raid_end_bio_io(r1_bio, 0); raid_end_bio_io(r1_bio, 0);
return 0; return 0;
} }
...@@ -571,19 +570,18 @@ static int make_request(request_queue_t *q, struct bio * bio) ...@@ -571,19 +570,18 @@ static int make_request(request_queue_t *q, struct bio * bio)
return 0; return 0;
} }
/*
 * status - emit the per-array status fragment for raid1.
 *
 * Output is " [<raid_disks>/<working_disks>] [" followed by one character
 * per slot in conf->mirrors[] ("U" when the slot has an rdev that is
 * in_sync, "_" otherwise) and a closing "]".
 *
 * Old column: formats into 'page' with sprintf and returns the number of
 * bytes written. New column: writes to the seq_file and returns nothing.
 */
static int status(char *page, mddev_t *mddev) static void status(struct seq_file *seq, mddev_t *mddev)
{ {
conf_t *conf = mddev_to_conf(mddev); conf_t *conf = mddev_to_conf(mddev);
int sz = 0, i; int i;
sz += sprintf(page+sz, " [%d/%d] [", conf->raid_disks, seq_printf(seq, " [%d/%d] [", conf->raid_disks,
conf->working_disks); conf->working_disks);
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->raid_disks; i++)
sz += sprintf(page+sz, "%s", seq_printf(seq, "%s",
conf->mirrors[i].rdev && conf->mirrors[i].rdev &&
conf->mirrors[i].rdev->in_sync ? "U" : "_"); conf->mirrors[i].rdev->in_sync ? "U" : "_");
sz += sprintf (page+sz, "]"); seq_printf(seq, "]");
return sz;
} }
#define LAST_DISK KERN_ALERT \ #define LAST_DISK KERN_ALERT \
...@@ -624,10 +622,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -624,10 +622,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
mddev->degraded++; mddev->degraded++;
conf->working_disks--; conf->working_disks--;
/* /*
* if recovery was running, stop it now. * if recovery is running, make sure it aborts.
*/ */
if (mddev->recovery_running) set_bit(MD_RECOVERY_ERR, &mddev->recovery);
mddev->recovery_running = -EIO;
} }
rdev->in_sync = 0; rdev->in_sync = 0;
rdev->faulty = 1; rdev->faulty = 1;
...@@ -859,7 +856,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) ...@@ -859,7 +856,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
mbio = bio_clone(bio, GFP_NOIO); mbio = bio_clone(bio, GFP_NOIO);
r1_bio->write_bios[i] = mbio; r1_bio->write_bios[i] = mbio;
mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
/* Resync writes target the array sector plus this member's data_offset; the offset is added (never OR-ed), matching the normal read/write paths. */
mbio->bi_sector = r1_bio->sector; mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
mbio->bi_end_io = end_sync_write; mbio->bi_end_io = end_sync_write;
mbio->bi_rw = WRITE; mbio->bi_rw = WRITE;
mbio->bi_private = r1_bio; mbio->bi_private = r1_bio;
...@@ -900,17 +897,17 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) ...@@ -900,17 +897,17 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
* 3. Performs writes following reads for array synchronising. * 3. Performs writes following reads for array synchronising.
*/ */
static void raid1d(void *data) static void raid1d(mddev_t *mddev)
{ {
struct list_head *head = &retry_list_head; struct list_head *head = &retry_list_head;
r1bio_t *r1_bio; r1bio_t *r1_bio;
struct bio *bio; struct bio *bio;
unsigned long flags; unsigned long flags;
mddev_t *mddev; conf_t *conf = mddev_to_conf(mddev);
conf_t *conf = data;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
md_handle_safemode(conf->mddev); md_check_recovery(mddev);
md_handle_safemode(mddev);
for (;;) { for (;;) {
spin_lock_irqsave(&retry_list_lock, flags); spin_lock_irqsave(&retry_list_lock, flags);
...@@ -937,7 +934,7 @@ static void raid1d(void *data) ...@@ -937,7 +934,7 @@ static void raid1d(void *data)
printk(REDIRECT_SECTOR, printk(REDIRECT_SECTOR,
bdev_partition_name(rdev->bdev), (unsigned long long)r1_bio->sector); bdev_partition_name(rdev->bdev), (unsigned long long)r1_bio->sector);
bio->bi_bdev = rdev->bdev; bio->bi_bdev = rdev->bdev;
bio->bi_sector = r1_bio->sector; bio->bi_sector = r1_bio->sector + rdev->data_offset;
bio->bi_rw = r1_bio->cmd; bio->bi_rw = r1_bio->cmd;
generic_make_request(bio); generic_make_request(bio);
...@@ -1048,7 +1045,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) ...@@ -1048,7 +1045,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
read_bio = bio_clone(r1_bio->master_bio, GFP_NOIO); read_bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
read_bio->bi_sector = sector_nr; read_bio->bi_sector = sector_nr + mirror->rdev->data_offset;
read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_bdev = mirror->rdev->bdev;
read_bio->bi_end_io = end_sync_read; read_bio->bi_end_io = end_sync_read;
read_bio->bi_rw = READ; read_bio->bi_rw = READ;
...@@ -1190,10 +1187,8 @@ static int run(mddev_t *mddev) ...@@ -1190,10 +1187,8 @@ static int run(mddev_t *mddev)
{ {
snprintf(conf->thread_name,MD_THREAD_NAME_MAX,"raid1d_md%d",mdidx(mddev)); mddev->thread = md_register_thread(raid1d, mddev, "md%d_raid1");
if (!mddev->thread) {
conf->thread = md_register_thread(raid1d, conf, conf->thread_name);
if (!conf->thread) {
printk(THREAD_ERROR, mdidx(mddev)); printk(THREAD_ERROR, mdidx(mddev));
goto out_free_conf; goto out_free_conf;
} }
...@@ -1219,7 +1214,8 @@ static int stop(mddev_t *mddev) ...@@ -1219,7 +1214,8 @@ static int stop(mddev_t *mddev)
{ {
conf_t *conf = mddev_to_conf(mddev); conf_t *conf = mddev_to_conf(mddev);
md_unregister_thread(conf->thread); md_unregister_thread(mddev->thread);
mddev->thread = NULL;
if (conf->r1bio_pool) if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool); mempool_destroy(conf->r1bio_pool);
kfree(conf); kfree(conf);
......
...@@ -71,12 +71,12 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) ...@@ -71,12 +71,12 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
list_add_tail(&sh->lru, &conf->delayed_list); list_add_tail(&sh->lru, &conf->delayed_list);
else else
list_add_tail(&sh->lru, &conf->handle_list); list_add_tail(&sh->lru, &conf->handle_list);
md_wakeup_thread(conf->thread); md_wakeup_thread(conf->mddev->thread);
} else { } else {
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
atomic_dec(&conf->preread_active_stripes); atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
md_wakeup_thread(conf->thread); md_wakeup_thread(conf->mddev->thread);
} }
list_add_tail(&sh->lru, &conf->inactive_list); list_add_tail(&sh->lru, &conf->inactive_list);
atomic_dec(&conf->active_stripes); atomic_dec(&conf->active_stripes);
...@@ -463,10 +463,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -463,10 +463,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
conf->failed_disks++; conf->failed_disks++;
rdev->in_sync = 0; rdev->in_sync = 0;
/* /*
* if recovery was running, stop it now. * if recovery was running, make sure it aborts.
*/ */
if (mddev->recovery_running) set_bit(MD_RECOVERY_ERR, &mddev->recovery);
mddev->recovery_running = -EIO;
} }
rdev->faulty = 1; rdev->faulty = 1;
printk (KERN_ALERT printk (KERN_ALERT
...@@ -913,7 +912,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -913,7 +912,7 @@ static void handle_stripe(struct stripe_head *sh)
struct bio *nextbi = bi->bi_next; struct bio *nextbi = bi->bi_next;
clear_bit(BIO_UPTODATE, &bi->bi_flags); clear_bit(BIO_UPTODATE, &bi->bi_flags);
if (--bi->bi_phys_segments == 0) { if (--bi->bi_phys_segments == 0) {
md_write_end(conf->mddev, conf->thread); md_write_end(conf->mddev);
bi->bi_next = return_bi; bi->bi_next = return_bi;
return_bi = bi; return_bi = bi;
} }
...@@ -970,7 +969,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -970,7 +969,7 @@ static void handle_stripe(struct stripe_head *sh)
while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
wbi2 = wbi->bi_next; wbi2 = wbi->bi_next;
if (--wbi->bi_phys_segments == 0) { if (--wbi->bi_phys_segments == 0) {
md_write_end(conf->mddev, conf->thread); md_write_end(conf->mddev);
wbi->bi_next = return_bi; wbi->bi_next = return_bi;
return_bi = wbi; return_bi = wbi;
} }
...@@ -1113,7 +1112,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1113,7 +1112,7 @@ static void handle_stripe(struct stripe_head *sh)
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
atomic_dec(&conf->preread_active_stripes); atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
md_wakeup_thread(conf->thread); md_wakeup_thread(conf->mddev->thread);
} }
} }
} }
...@@ -1207,7 +1206,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1207,7 +1206,7 @@ static void handle_stripe(struct stripe_head *sh)
bi->bi_bdev = rdev->bdev; bi->bi_bdev = rdev->bdev;
PRINTK("for %llu schedule op %ld on disc %d\n", (unsigned long long)sh->sector, bi->bi_rw, i); PRINTK("for %llu schedule op %ld on disc %d\n", (unsigned long long)sh->sector, bi->bi_rw, i);
atomic_inc(&sh->count); atomic_inc(&sh->count);
bi->bi_sector = sh->sector; bi->bi_sector = sh->sector + rdev->data_offset;
bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_vcnt = 1; bi->bi_vcnt = 1;
bi->bi_idx = 0; bi->bi_idx = 0;
...@@ -1251,7 +1250,7 @@ static void raid5_unplug_device(void *data) ...@@ -1251,7 +1250,7 @@ static void raid5_unplug_device(void *data)
if (blk_remove_plug(q)) if (blk_remove_plug(q))
raid5_activate_delayed(conf); raid5_activate_delayed(conf);
md_wakeup_thread(conf->thread); md_wakeup_thread(mddev->thread);
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
} }
...@@ -1304,7 +1303,7 @@ static int make_request (request_queue_t *q, struct bio * bi) ...@@ -1304,7 +1303,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
int bytes = bi->bi_size; int bytes = bi->bi_size;
if ( bio_data_dir(bi) == WRITE ) if ( bio_data_dir(bi) == WRITE )
md_write_end(mddev,conf->thread); md_write_end(mddev);
bi->bi_size = 0; bi->bi_size = 0;
bi->bi_end_io(bi, bytes, 0); bi->bi_end_io(bi, bytes, 0);
} }
...@@ -1356,16 +1355,17 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) ...@@ -1356,16 +1355,17 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
* During the scan, completed stripes are saved for us by the interrupt * During the scan, completed stripes are saved for us by the interrupt
* handler, so that they will not have to wait for our next wakeup. * handler, so that they will not have to wait for our next wakeup.
*/ */
static void raid5d (void *data) static void raid5d (mddev_t *mddev)
{ {
struct stripe_head *sh; struct stripe_head *sh;
raid5_conf_t *conf = data; raid5_conf_t *conf = mddev_to_conf(mddev);
mddev_t *mddev = conf->mddev;
int handled; int handled;
PRINTK("+++ raid5d active\n"); PRINTK("+++ raid5d active\n");
md_check_recovery(mddev);
md_handle_safemode(mddev); md_handle_safemode(mddev);
handled = 0; handled = 0;
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
while (1) { while (1) {
...@@ -1486,10 +1486,8 @@ static int run (mddev_t *mddev) ...@@ -1486,10 +1486,8 @@ static int run (mddev_t *mddev)
} }
{ {
snprintf(conf->thread_name,MD_THREAD_NAME_MAX,"raid5d_md%d",mdidx(mddev)); mddev->thread = md_register_thread(raid5d, mddev, "md%d_raid5");
if (!mddev->thread) {
conf->thread = md_register_thread(raid5d, conf, conf->thread_name);
if (!conf->thread) {
printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
goto abort; goto abort;
} }
...@@ -1500,7 +1498,7 @@ static int run (mddev_t *mddev) ...@@ -1500,7 +1498,7 @@ static int run (mddev_t *mddev)
if (grow_stripes(conf, conf->max_nr_stripes)) { if (grow_stripes(conf, conf->max_nr_stripes)) {
printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory); printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
shrink_stripes(conf); shrink_stripes(conf);
md_unregister_thread(conf->thread); md_unregister_thread(mddev->thread);
goto abort; goto abort;
} else } else
printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev)); printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
...@@ -1536,7 +1534,8 @@ static int stop (mddev_t *mddev) ...@@ -1536,7 +1534,8 @@ static int stop (mddev_t *mddev)
{ {
raid5_conf_t *conf = (raid5_conf_t *) mddev->private; raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
md_unregister_thread(conf->thread); md_unregister_thread(mddev->thread);
mddev->thread = NULL;
shrink_stripes(conf); shrink_stripes(conf);
free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
kfree(conf); kfree(conf);
...@@ -1574,29 +1573,26 @@ static void printall (raid5_conf_t *conf) ...@@ -1574,29 +1573,26 @@ static void printall (raid5_conf_t *conf)
} }
} }
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
PRINTK("--- raid5d inactive\n");
} }
#endif #endif
static int status (char *page, mddev_t *mddev) static void status (struct seq_file *seq, mddev_t *mddev)
{ {
raid5_conf_t *conf = (raid5_conf_t *) mddev->private; raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
int sz = 0, i; int i;
sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->raid_disks; i++)
sz += sprintf (page+sz, "%s", seq_printf (seq, "%s",
conf->disks[i].rdev && conf->disks[i].rdev &&
conf->disks[i].rdev->in_sync ? "U" : "_"); conf->disks[i].rdev->in_sync ? "U" : "_");
sz += sprintf (page+sz, "]"); seq_printf (seq, "]");
#if RAID5_DEBUG #if RAID5_DEBUG
#define D(x) \ #define D(x) \
sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x)) seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
printall(conf); printall(conf);
#endif #endif
return sz;
} }
static void print_raid5_conf (raid5_conf_t *conf) static void print_raid5_conf (raid5_conf_t *conf)
......
...@@ -305,8 +305,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ...@@ -305,8 +305,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end, (long long)lock->fl.fl_end,
wait); wait);
/* Lock file against concurrent access */
down(&file->f_sema);
/* Get existing block (in case client is busy-waiting) */ /* Get existing block (in case client is busy-waiting) */
block = nlmsvc_lookup_block(file, lock, 0); block = nlmsvc_lookup_block(file, lock, 0);
...@@ -314,6 +312,9 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ...@@ -314,6 +312,9 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
lock->fl.fl_flags |= FL_LOCKD; lock->fl.fl_flags |= FL_LOCKD;
again: again:
/* Lock file against concurrent access */
down(&file->f_sema);
if (!(conflock = posix_test_lock(&file->f_file, &lock->fl))) { if (!(conflock = posix_test_lock(&file->f_file, &lock->fl))) {
error = posix_lock_file(&file->f_file, &lock->fl); error = posix_lock_file(&file->f_file, &lock->fl);
...@@ -346,7 +347,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ...@@ -346,7 +347,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
/* If we don't have a block, create and initialize it. Then /* If we don't have a block, create and initialize it. Then
* retry because we may have slept in kmalloc. */ * retry because we may have slept in kmalloc. */
/* We have to release f_sema as nlmsvc_create_block may try to
* claim it while doing host garbage collection */
if (block == NULL) { if (block == NULL) {
up(&file->f_sema);
dprintk("lockd: blocking on this lock (allocating).\n"); dprintk("lockd: blocking on this lock (allocating).\n");
if (!(block = nlmsvc_create_block(rqstp, file, lock, cookie))) if (!(block = nlmsvc_create_block(rqstp, file, lock, cookie)))
return nlm_lck_denied_nolocks; return nlm_lck_denied_nolocks;
......
...@@ -294,7 +294,9 @@ int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) ...@@ -294,7 +294,9 @@ int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
/* client */ /* client */
len = qword_get(&mesg, buf, PAGE_SIZE); len = qword_get(&mesg, buf, PAGE_SIZE);
if (len <= 0) return -EINVAL; err = -EINVAL;
if (len <= 0) goto out;
err = -ENOENT; err = -ENOENT;
dom = auth_domain_find(buf); dom = auth_domain_find(buf);
if (!dom) if (!dom)
...@@ -473,8 +475,14 @@ exp_get_by_name(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry, ...@@ -473,8 +475,14 @@ exp_get_by_name(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry,
exp = svc_export_lookup(&key, 0); exp = svc_export_lookup(&key, 0);
if (exp != NULL) if (exp != NULL)
if (cache_check(&svc_export_cache, &exp->h, reqp)) switch (cache_check(&svc_export_cache, &exp->h, reqp)) {
case 0: break;
case -EAGAIN:
exp = ERR_PTR(-EAGAIN);
break;
default:
exp = NULL; exp = NULL;
}
return exp; return exp;
} }
...@@ -915,7 +923,8 @@ struct flags { ...@@ -915,7 +923,8 @@ struct flags {
{ NFSEXP_UIDMAP, {"uidmap", ""}}, { NFSEXP_UIDMAP, {"uidmap", ""}},
{ NFSEXP_KERBEROS, { "kerberos", ""}}, { NFSEXP_KERBEROS, { "kerberos", ""}},
{ NFSEXP_SUNSECURE, { "sunsecure", ""}}, { NFSEXP_SUNSECURE, { "sunsecure", ""}},
{ NFSEXP_CROSSMNT, {"nohide", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}},
{ NFSEXP_CROSSMNT, {"crossmnt", ""}},
{ NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
{ NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
#ifdef MSNFS #ifdef MSNFS
......
...@@ -79,7 +79,7 @@ static struct raparms * raparm_cache; ...@@ -79,7 +79,7 @@ static struct raparms * raparm_cache;
* N.B. After this call _both_ fhp and resfh need an fh_put * N.B. After this call _both_ fhp and resfh need an fh_put
* *
* If the lookup would cross a mountpoint, and the mounted filesystem * If the lookup would cross a mountpoint, and the mounted filesystem
* is exported to the client with NFSEXP_CROSSMNT, then the lookup is * is exported to the client with NFSEXP_NOHIDE, then the lookup is
* accepted as it stands and the mounted directory is * accepted as it stands and the mounted directory is
* returned. Otherwise the covered directory is returned. * returned. Otherwise the covered directory is returned.
* NOTE: this mountpoint crossing is not supported properly by all * NOTE: this mountpoint crossing is not supported properly by all
...@@ -115,7 +115,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, ...@@ -115,7 +115,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
read_lock(&dparent_lock); read_lock(&dparent_lock);
dentry = dget(dparent->d_parent); dentry = dget(dparent->d_parent);
read_unlock(&dparent_lock); read_unlock(&dparent_lock);
} else if (!EX_CROSSMNT(exp)) } else if (!EX_NOHIDE(exp))
dentry = dget(dparent); /* .. == . just like at / */ dentry = dget(dparent); /* .. == . just like at / */
else { else {
/* checking mountpoint crossing is very different when stepping up */ /* checking mountpoint crossing is very different when stepping up */
...@@ -133,6 +133,12 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, ...@@ -133,6 +133,12 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
exp2 = exp_parent(exp->ex_client, mnt, dentry, exp2 = exp_parent(exp->ex_client, mnt, dentry,
&rqstp->rq_chandle); &rqstp->rq_chandle);
if (IS_ERR(exp2)) {
err = PTR_ERR(exp2);
dput(dentry);
mntput(mnt);
goto out;
}
if (!exp2) { if (!exp2) {
dput(dentry); dput(dentry);
dentry = dget(dparent); dentry = dget(dparent);
...@@ -157,9 +163,19 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, ...@@ -157,9 +163,19 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
struct dentry *mounts = dget(dentry); struct dentry *mounts = dget(dentry);
while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)) while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts))
; ;
exp2 = exp_get_by_name(exp->ex_client, mnt, exp2 = exp_get_by_name(exp->ex_client, mnt,
mounts, &rqstp->rq_chandle); mounts, &rqstp->rq_chandle);
if (exp2 && EX_CROSSMNT(exp2)) { if (IS_ERR(exp2)) {
err = PTR_ERR(exp2);
dput(mounts);
dput(dentry);
mntput(mnt);
goto out;
}
if (exp2 &&
((exp->ex_flags & NFSEXP_CROSSMNT)
|| EX_NOHIDE(exp2))) {
/* successfully crossed mount point */ /* successfully crossed mount point */
exp_put(exp); exp_put(exp);
exp = exp2; exp = exp2;
......
...@@ -1310,6 +1310,10 @@ static void free_journal_ram(struct super_block *p_s_sb) { ...@@ -1310,6 +1310,10 @@ static void free_journal_ram(struct super_block *p_s_sb) {
if (SB_JOURNAL(p_s_sb)->j_header_bh) { if (SB_JOURNAL(p_s_sb)->j_header_bh) {
brelse(SB_JOURNAL(p_s_sb)->j_header_bh) ; brelse(SB_JOURNAL(p_s_sb)->j_header_bh) ;
} }
/* j_header_bh is on the journal dev, make sure not to release the journal
* dev until we brelse j_header_bh
*/
release_journal_dev(p_s_sb, SB_JOURNAL(p_s_sb));
vfree(SB_JOURNAL(p_s_sb)) ; vfree(SB_JOURNAL(p_s_sb)) ;
} }
...@@ -1341,7 +1345,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, struct sup ...@@ -1341,7 +1345,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, struct sup
commit_wq = NULL; commit_wq = NULL;
} }
release_journal_dev( p_s_sb, SB_JOURNAL( p_s_sb ) );
free_journal_ram(p_s_sb) ; free_journal_ram(p_s_sb) ;
return 0 ; return 0 ;
...@@ -1868,23 +1871,17 @@ static int release_journal_dev( struct super_block *super, ...@@ -1868,23 +1871,17 @@ static int release_journal_dev( struct super_block *super,
result = 0; result = 0;
if( journal -> j_dev_file != NULL ) { if( journal -> j_dev_file != NULL ) {
/*
* journal block device was taken via filp_open
*/
result = filp_close( journal -> j_dev_file, NULL ); result = filp_close( journal -> j_dev_file, NULL );
journal -> j_dev_file = NULL; journal -> j_dev_file = NULL;
journal -> j_dev_bd = NULL; journal -> j_dev_bd = NULL;
} else if( journal -> j_dev_bd != NULL ) { } else if( journal -> j_dev_bd != NULL ) {
/*
* journal block device was taken via bdget and blkdev_get
*/
result = blkdev_put( journal -> j_dev_bd, BDEV_FS ); result = blkdev_put( journal -> j_dev_bd, BDEV_FS );
journal -> j_dev_bd = NULL; journal -> j_dev_bd = NULL;
} }
if( result != 0 ) { if( result != 0 ) {
reiserfs_warning("sh-457: release_journal_dev: Cannot release journal device: %i", result ); reiserfs_warning("sh-457: release_journal_dev: Cannot release journal device: %i\n", result );
} }
return result; return result;
} }
...@@ -1895,6 +1892,7 @@ static int journal_init_dev( struct super_block *super, ...@@ -1895,6 +1892,7 @@ static int journal_init_dev( struct super_block *super,
{ {
int result; int result;
dev_t jdev; dev_t jdev;
int blkdev_mode = FMODE_READ | FMODE_WRITE;
result = 0; result = 0;
...@@ -1902,12 +1900,16 @@ static int journal_init_dev( struct super_block *super, ...@@ -1902,12 +1900,16 @@ static int journal_init_dev( struct super_block *super,
journal -> j_dev_file = NULL; journal -> j_dev_file = NULL;
jdev = SB_ONDISK_JOURNAL_DEVICE( super ) ? jdev = SB_ONDISK_JOURNAL_DEVICE( super ) ?
SB_ONDISK_JOURNAL_DEVICE( super ) : super->s_dev; SB_ONDISK_JOURNAL_DEVICE( super ) : super->s_dev;
if (bdev_read_only(super->s_bdev))
blkdev_mode = FMODE_READ;
/* there is no "jdev" option and journal is on separate device */ /* there is no "jdev" option and journal is on separate device */
if( ( !jdev_name || !jdev_name[ 0 ] ) ) { if( ( !jdev_name || !jdev_name[ 0 ] ) ) {
journal -> j_dev_bd = bdget(jdev); journal -> j_dev_bd = bdget(jdev);
if( journal -> j_dev_bd ) if( journal -> j_dev_bd )
result = blkdev_get( journal -> j_dev_bd, result = blkdev_get( journal -> j_dev_bd,
FMODE_READ | FMODE_WRITE, 0, blkdev_mode, 0,
BDEV_FS ); BDEV_FS );
else else
result = -ENOMEM; result = -ENOMEM;
...@@ -1928,10 +1930,10 @@ static int journal_init_dev( struct super_block *super, ...@@ -1928,10 +1930,10 @@ static int journal_init_dev( struct super_block *super,
jdev_inode = journal -> j_dev_file -> f_dentry -> d_inode; jdev_inode = journal -> j_dev_file -> f_dentry -> d_inode;
journal -> j_dev_bd = jdev_inode -> i_bdev; journal -> j_dev_bd = jdev_inode -> i_bdev;
if( !S_ISBLK( jdev_inode -> i_mode ) ) { if( !S_ISBLK( jdev_inode -> i_mode ) ) {
printk( "journal_init_dev: '%s' is not a block device", jdev_name ); printk( "journal_init_dev: '%s' is not a block device\n", jdev_name );
result = -ENOTBLK; result = -ENOTBLK;
} else if( jdev_inode -> i_bdev == NULL ) { } else if( jdev_inode -> i_bdev == NULL ) {
printk( "journal_init_dev: bdev uninitialized for '%s'", jdev_name ); printk( "journal_init_dev: bdev uninitialized for '%s'\n", jdev_name );
result = -ENOMEM; result = -ENOMEM;
} else { } else {
/* ok */ /* ok */
...@@ -1941,12 +1943,12 @@ static int journal_init_dev( struct super_block *super, ...@@ -1941,12 +1943,12 @@ static int journal_init_dev( struct super_block *super,
} else { } else {
result = PTR_ERR( journal -> j_dev_file ); result = PTR_ERR( journal -> j_dev_file );
journal -> j_dev_file = NULL; journal -> j_dev_file = NULL;
printk( "journal_init_dev: Cannot open '%s': %i", jdev_name, result ); printk( "journal_init_dev: Cannot open '%s': %i\n", jdev_name, result );
} }
if( result != 0 ) { if( result != 0 ) {
release_journal_dev( super, journal ); release_journal_dev( super, journal );
} }
printk( "journal_init_dev: journal device: %s", bdevname(journal->j_dev_bd)); printk( "journal_init_dev: journal device: %s\n", bdevname(journal->j_dev_bd));
return result; return result;
} }
...@@ -1961,8 +1963,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo ...@@ -1961,8 +1963,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
struct reiserfs_journal *journal; struct reiserfs_journal *journal;
if (sizeof(struct reiserfs_journal_commit) != 4096 || if (sizeof(struct reiserfs_journal_commit) != 4096 ||
sizeof(struct reiserfs_journal_desc) != 4096 sizeof(struct reiserfs_journal_desc) != 4096) {
) {
printk("journal-1249: commit or desc struct not 4096 %Zd %Zd\n", sizeof(struct reiserfs_journal_commit), printk("journal-1249: commit or desc struct not 4096 %Zd %Zd\n", sizeof(struct reiserfs_journal_commit),
sizeof(struct reiserfs_journal_desc)) ; sizeof(struct reiserfs_journal_desc)) ;
return 1 ; return 1 ;
...@@ -1974,6 +1975,11 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo ...@@ -1974,6 +1975,11 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
return 1 ; return 1 ;
} }
memset(journal, 0, sizeof(struct reiserfs_journal)) ; memset(journal, 0, sizeof(struct reiserfs_journal)) ;
INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ;
INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_prealloc_list);
reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap,
SB_BMAP_NR(p_s_sb)) ;
allocate_bitmap_nodes(p_s_sb) ;
/* reserved for journal area support */ /* reserved for journal area support */
SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ?
...@@ -1983,7 +1989,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo ...@@ -1983,7 +1989,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
if( journal_init_dev( p_s_sb, journal, j_dev_name ) != 0 ) { if( journal_init_dev( p_s_sb, journal, j_dev_name ) != 0 ) {
printk( "sh-462: unable to initialize jornal device\n"); printk( "sh-462: unable to initialize jornal device\n");
return 1; goto free_and_return;
} }
rs = SB_DISK_SUPER_BLOCK(p_s_sb); rs = SB_DISK_SUPER_BLOCK(p_s_sb);
...@@ -1993,8 +1999,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo ...@@ -1993,8 +1999,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb));
if (!bhjh) { if (!bhjh) {
printk("sh-459: unable to read journal header\n") ; printk("sh-459: unable to read journal header\n") ;
release_journal_dev(p_s_sb, journal); goto free_and_return;
return 1 ;
} }
jh = (struct reiserfs_journal_header *)(bhjh->b_data); jh = (struct reiserfs_journal_header *)(bhjh->b_data);
...@@ -2005,8 +2010,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo ...@@ -2005,8 +2010,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
jh->jh_journal.jp_journal_magic, bdevname( SB_JOURNAL(p_s_sb)->j_dev_bd ), jh->jh_journal.jp_journal_magic, bdevname( SB_JOURNAL(p_s_sb)->j_dev_bd ),
sb_jp_journal_magic(rs), reiserfs_bdevname (p_s_sb)); sb_jp_journal_magic(rs), reiserfs_bdevname (p_s_sb));
brelse (bhjh); brelse (bhjh);
release_journal_dev(p_s_sb, journal); goto free_and_return;
return 1 ;
} }
SB_JOURNAL_TRANS_MAX(p_s_sb) = le32_to_cpu (jh->jh_journal.jp_journal_trans_max); SB_JOURNAL_TRANS_MAX(p_s_sb) = le32_to_cpu (jh->jh_journal.jp_journal_trans_max);
...@@ -2064,7 +2068,6 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo ...@@ -2064,7 +2068,6 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
brelse (bhjh); brelse (bhjh);
SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ; SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ;
SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */ SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */
...@@ -2075,12 +2078,8 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo ...@@ -2075,12 +2078,8 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */ memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */
INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ;
INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_dirty_buffers) ; INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_dirty_buffers) ;
spin_lock_init(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock) ; spin_lock_init(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock) ;
reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap,
SB_BMAP_NR(p_s_sb)) ;
allocate_bitmap_nodes(p_s_sb) ;
SB_JOURNAL(p_s_sb)->j_start = 0 ; SB_JOURNAL(p_s_sb)->j_start = 0 ;
SB_JOURNAL(p_s_sb)->j_len = 0 ; SB_JOURNAL(p_s_sb)->j_len = 0 ;
...@@ -2107,20 +2106,15 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo ...@@ -2107,20 +2106,15 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ; SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ;
if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) { if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) {
reiserfs_warning("journal-2005, get_list_bitmap failed for journal list 0\n") ; reiserfs_warning("journal-2005, get_list_bitmap failed for journal list 0\n") ;
release_journal_dev(p_s_sb, journal); goto free_and_return;
return 1 ;
} }
if (journal_read(p_s_sb) < 0) { if (journal_read(p_s_sb) < 0) {
reiserfs_warning("Replay Failure, unable to mount\n") ; reiserfs_warning("Replay Failure, unable to mount\n") ;
free_journal_ram(p_s_sb) ; goto free_and_return;
release_journal_dev(p_s_sb, journal);
return 1 ;
} }
SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this
where it belongs */ where it belongs */
INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_prealloc_list);
if (reiserfs_dont_log (p_s_sb)) if (reiserfs_dont_log (p_s_sb))
return 0; return 0;
...@@ -2129,7 +2123,9 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo ...@@ -2129,7 +2123,9 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
commit_wq = create_workqueue("reiserfs"); commit_wq = create_workqueue("reiserfs");
return 0 ; return 0 ;
free_and_return:
free_journal_ram(p_s_sb);
return 1;
} }
/* /*
......
...@@ -678,35 +678,35 @@ xor_32regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, ...@@ -678,35 +678,35 @@ xor_32regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
} }
static struct xor_block_template xor_block_8regs = { static struct xor_block_template xor_block_8regs = {
name: "8regs", .name = "8regs",
do_2: xor_8regs_2, .do_2 = xor_8regs_2,
do_3: xor_8regs_3, .do_3 = xor_8regs_3,
do_4: xor_8regs_4, .do_4 = xor_8regs_4,
do_5: xor_8regs_5, .do_5 = xor_8regs_5,
}; };
static struct xor_block_template xor_block_32regs = { static struct xor_block_template xor_block_32regs = {
name: "32regs", .name = "32regs",
do_2: xor_32regs_2, .do_2 = xor_32regs_2,
do_3: xor_32regs_3, .do_3 = xor_32regs_3,
do_4: xor_32regs_4, .do_4 = xor_32regs_4,
do_5: xor_32regs_5, .do_5 = xor_32regs_5,
}; };
static struct xor_block_template xor_block_8regs_p = { static struct xor_block_template xor_block_8regs_p = {
name: "8regs_prefetch", .name = "8regs_prefetch",
do_2: xor_8regs_p_2, .do_2 = xor_8regs_p_2,
do_3: xor_8regs_p_3, .do_3 = xor_8regs_p_3,
do_4: xor_8regs_p_4, .do_4 = xor_8regs_p_4,
do_5: xor_8regs_p_5, .do_5 = xor_8regs_p_5,
}; };
static struct xor_block_template xor_block_32regs_p = { static struct xor_block_template xor_block_32regs_p = {
name: "32regs_prefetch", .name = "32regs_prefetch",
do_2: xor_32regs_p_2, .do_2 = xor_32regs_p_2,
do_3: xor_32regs_p_3, .do_3 = xor_32regs_p_3,
do_4: xor_32regs_p_4, .do_4 = xor_32regs_p_4,
do_5: xor_32regs_p_5, .do_5 = xor_32regs_p_5,
}; };
#define XOR_TRY_TEMPLATES \ #define XOR_TRY_TEMPLATES \
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" #define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" #define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
#include <asm/i387.h>
static void static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
......
...@@ -239,7 +239,6 @@ extern inline char *bio_kmap_irq(struct bio *bio, unsigned long *flags) ...@@ -239,7 +239,6 @@ extern inline char *bio_kmap_irq(struct bio *bio, unsigned long *flags)
* balancing is a lot nicer this way * balancing is a lot nicer this way
*/ */
local_save_flags(*flags); local_save_flags(*flags);
local_irq_disable();
addr = (unsigned long) kmap_atomic(bio_page(bio), KM_BIO_SRC_IRQ); addr = (unsigned long) kmap_atomic(bio_page(bio), KM_BIO_SRC_IRQ);
if (addr & ~PAGE_MASK) if (addr & ~PAGE_MASK)
......
...@@ -35,12 +35,13 @@ ...@@ -35,12 +35,13 @@
#define NFSEXP_UIDMAP 0x0040 #define NFSEXP_UIDMAP 0x0040
#define NFSEXP_KERBEROS 0x0080 /* not available */ #define NFSEXP_KERBEROS 0x0080 /* not available */
#define NFSEXP_SUNSECURE 0x0100 #define NFSEXP_SUNSECURE 0x0100
#define NFSEXP_CROSSMNT 0x0200 #define NFSEXP_NOHIDE 0x0200
#define NFSEXP_NOSUBTREECHECK 0x0400 #define NFSEXP_NOSUBTREECHECK 0x0400
#define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ #define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */
#define NFSEXP_MSNFS 0x1000 /* do silly things that MS clients expect */ #define NFSEXP_MSNFS 0x1000 /* do silly things that MS clients expect */
#define NFSEXP_FSID 0x2000 #define NFSEXP_FSID 0x2000
#define NFSEXP_ALLFLAGS 0x3FFF #define NFSEXP_CROSSMNT 0x4000
#define NFSEXP_ALLFLAGS 0x7FFF
#ifdef __KERNEL__ #ifdef __KERNEL__
...@@ -73,7 +74,7 @@ struct svc_expkey { ...@@ -73,7 +74,7 @@ struct svc_expkey {
#define EX_SECURE(exp) (!((exp)->ex_flags & NFSEXP_INSECURE_PORT)) #define EX_SECURE(exp) (!((exp)->ex_flags & NFSEXP_INSECURE_PORT))
#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) #define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC))
#define EX_RDONLY(exp) ((exp)->ex_flags & NFSEXP_READONLY) #define EX_RDONLY(exp) ((exp)->ex_flags & NFSEXP_READONLY)
#define EX_CROSSMNT(exp) ((exp)->ex_flags & NFSEXP_CROSSMNT) #define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE)
#define EX_SUNSECURE(exp) ((exp)->ex_flags & NFSEXP_SUNSECURE) #define EX_SUNSECURE(exp) ((exp)->ex_flags & NFSEXP_SUNSECURE)
#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES) #define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/hdreg.h> #include <linux/hdreg.h>
#include <linux/proc_fs.h> #include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/smp_lock.h> #include <linux/smp_lock.h>
#include <linux/delay.h> #include <linux/delay.h>
#include <net/checksum.h> #include <net/checksum.h>
...@@ -68,13 +69,14 @@ extern inline char * bdev_partition_name (struct block_device *bdev) ...@@ -68,13 +69,14 @@ extern inline char * bdev_partition_name (struct block_device *bdev)
} }
extern int register_md_personality (int p_num, mdk_personality_t *p); extern int register_md_personality (int p_num, mdk_personality_t *p);
extern int unregister_md_personality (int p_num); extern int unregister_md_personality (int p_num);
extern mdk_thread_t * md_register_thread (void (*run) (void *data), extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev),
void *data, const char *name); mddev_t *mddev, const char *name);
extern void md_unregister_thread (mdk_thread_t *thread); extern void md_unregister_thread (mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread); extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_check_recovery(mddev_t *mddev);
extern void md_interrupt_thread (mdk_thread_t *thread); extern void md_interrupt_thread (mdk_thread_t *thread);
extern void md_write_start(mddev_t *mddev); extern void md_write_start(mddev_t *mddev);
extern void md_write_end(mddev_t *mddev, mdk_thread_t *thread); extern void md_write_end(mddev_t *mddev);
extern void md_handle_safemode(mddev_t *mddev); extern void md_handle_safemode(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors); extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
......
...@@ -155,6 +155,7 @@ struct mdk_rdev_s ...@@ -155,6 +155,7 @@ struct mdk_rdev_s
struct page *sb_page; struct page *sb_page;
int sb_loaded; int sb_loaded;
sector_t data_offset; /* start of data in array */
sector_t sb_offset; sector_t sb_offset;
int preferred_minor; /* autorun support */ int preferred_minor; /* autorun support */
...@@ -206,22 +207,31 @@ struct mddev_s ...@@ -206,22 +207,31 @@ struct mddev_s
char uuid[16]; char uuid[16];
struct mdk_thread_s *thread; /* management thread */
struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
unsigned long curr_resync; /* blocks scheduled */ unsigned long curr_resync; /* blocks scheduled */
unsigned long resync_mark; /* a recent timestamp */ unsigned long resync_mark; /* a recent timestamp */
unsigned long resync_mark_cnt;/* blocks written at resync_mark */ unsigned long resync_mark_cnt;/* blocks written at resync_mark */
/* recovery_running is 0 for no recovery/resync,
* 1 for active recovery /* recovery/resync flags
* 2 for active resync * NEEDED: we might need to start a resync/recover
* -error for an error (e.g. -EINTR) * RUNNING: a thread is running, or about to be started
* it can only be set > 0 under reconfig_sem * SYNC: actually doing a resync, not a recovery
* ERR: an IO error was detected - abort the resync/recovery
* INTR: someone requested a (clean) early abort.
* DONE: thread is done and is waiting to be reaped
*/ */
int recovery_running; #define MD_RECOVERY_RUNNING 0
int recovery_error; /* error from recovery write */ #define MD_RECOVERY_SYNC 1
#define MD_RECOVERY_ERR 2
#define MD_RECOVERY_INTR 3
#define MD_RECOVERY_DONE 4
#define MD_RECOVERY_NEEDED 5
unsigned long recovery;
int in_sync; /* know to not need resync */ int in_sync; /* know to not need resync */
struct semaphore reconfig_sem; struct semaphore reconfig_sem;
atomic_t active; atomic_t active;
int spares;
int degraded; /* whether md should consider int degraded; /* whether md should consider
* adding a spare * adding a spare
...@@ -230,9 +240,11 @@ struct mddev_s ...@@ -230,9 +240,11 @@ struct mddev_s
atomic_t recovery_active; /* blocks scheduled, but not written */ atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait; wait_queue_head_t recovery_wait;
sector_t recovery_cp; sector_t recovery_cp;
int safemode; /* if set, update "clean" superblock unsigned int safemode; /* if set, update "clean" superblock
* when no writes pending. * when no writes pending.
*/ */
unsigned int safemode_delay;
struct timer_list safemode_timer;
atomic_t writes_pending; atomic_t writes_pending;
request_queue_t queue; /* for plugging ... */ request_queue_t queue; /* for plugging ... */
...@@ -245,7 +257,7 @@ struct mdk_personality_s ...@@ -245,7 +257,7 @@ struct mdk_personality_s
int (*make_request)(request_queue_t *q, struct bio *bio); int (*make_request)(request_queue_t *q, struct bio *bio);
int (*run)(mddev_t *mddev); int (*run)(mddev_t *mddev);
int (*stop)(mddev_t *mddev); int (*stop)(mddev_t *mddev);
int (*status)(char *page, mddev_t *mddev); void (*status)(struct seq_file *seq, mddev_t *mddev);
/* error_handler must set ->faulty and clear ->in_sync /* error_handler must set ->faulty and clear ->in_sync
* if appropriate, and should abort recovery if needed * if appropriate, and should abort recovery if needed
*/ */
...@@ -292,8 +304,8 @@ extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr); ...@@ -292,8 +304,8 @@ extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
ITERATE_RDEV_GENERIC(pending_raid_disks,rdev,tmp) ITERATE_RDEV_GENERIC(pending_raid_disks,rdev,tmp)
typedef struct mdk_thread_s { typedef struct mdk_thread_s {
void (*run) (void *data); void (*run) (mddev_t *mddev);
void *data; mddev_t *mddev;
wait_queue_head_t wqueue; wait_queue_head_t wqueue;
unsigned long flags; unsigned long flags;
struct completion *event; struct completion *event;
......
...@@ -173,5 +173,58 @@ static inline __u64 md_event(mdp_super_t *sb) { ...@@ -173,5 +173,58 @@ static inline __u64 md_event(mdp_super_t *sb) {
return (ev<<32)| sb->events_lo; return (ev<<32)| sb->events_lo;
} }
/*
 * The version-1 superblock :
 * All numeric fields are little-endian.
 *
 * total size: 256 bytes plus 2 per device.
 *  1K allows 384 devices.
 *
 * The fixed 256 bytes are split into three sections of 128, 64 and 64
 * bytes.  Each padN[] array is sized as
 *     (section size) - (bytes occupied by the named fields of that section)
 * so that the following section starts exactly at its documented offset
 * (128, 192) and dev_roles[] starts at offset 256.
 */
struct mdp_superblock_1 {
	/* constant array information - 128 bytes (offsets 0..127) */
	__u32	magic;		/* MD_SB_MAGIC: 0xa92b4efc - little endian */
	__u32	major_version;	/* 1 */
	__u32	feature_map;	/* 0 for now */
	__u32	pad0;		/* always set to 0 when writing */

	__u8	set_uuid[16];	/* user-space generated. */
	char	set_name[32];	/* set and interpreted by user-space */

	__u64	ctime;		/* lo 40 bits are seconds, top 24 are microseconds or 0 */
	/* NOTE(review): negative levels are listed but the field is unsigned;
	 * presumably readers convert after byte-swapping - confirm.
	 */
	__u32	level;		/* -4 (multipath), -1 (linear), 0,1,4,5 */
	__u32	layout;		/* only for raid5 currently */
	__u64	size;		/* used size of component devices, in 512byte sectors */

	__u32	chunksize;	/* in 512byte sectors */
	__u32	raid_disks;
	/* The named fields above occupy 96 bytes (raid_disks ends at offset 96),
	 * so 32 bytes of padding complete the 128-byte section.
	 */
	__u8	pad1[128-96];	/* set to 0 when written */

	/* constant this-device information - 64 bytes (offsets 128..191) */
	__u64	data_offset;	/* sector start of data, often 0 */
	__u64	data_size;	/* sectors in this device that can be used for data */
	__u64	super_offset;	/* sector start of this superblock */
	__u64	recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
	__u32	dev_number;	/* permanent identifier of this device - not role in raid */
	__u32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */
	__u8	device_uuid[16]; /* user-space settable, ignored by kernel */
	__u8	pad2[64-56];	/* set to 0 when writing */

	/* array state information - 64 bytes (offsets 192..255) */
	__u64	utime;		/* 40 bits seconds, 24 bits microseconds */
	__u64	events;		/* incremented when superblock updated */
	__u64	resync_offset;	/* data before this offset (from data_offset) known to be in sync */
	__u32	sb_csum;	/* checksum up to devs[max_dev] */
	__u32	max_dev;	/* size of devs[] array to consider */
	/* The named fields above occupy 32 bytes, so 32 bytes of padding
	 * complete the 64-byte section.
	 */
	__u8	pad3[64-32];	/* set to 0 when writing */

	/* device state information. Indexed by dev_number.
	 * 2 bytes per device
	 * Note there are no per-device state flags. State information is rolled
	 * into the 'roles' value.  If a device is spare or faulty, then it doesn't
	 * have a meaningful role.
	 */
	__u16	dev_roles[0];	/* role in array, or 0xffff for a spare, or 0xfffe for faulty */
};
#endif #endif
...@@ -13,7 +13,6 @@ struct multipath_private_data { ...@@ -13,7 +13,6 @@ struct multipath_private_data {
struct multipath_info multipaths[MD_SB_DISKS]; struct multipath_info multipaths[MD_SB_DISKS];
int raid_disks; int raid_disks;
int working_disks; int working_disks;
mdk_thread_t *thread;
spinlock_t device_lock; spinlock_t device_lock;
mempool_t *pool; mempool_t *pool;
......
...@@ -19,7 +19,6 @@ struct r1_private_data_s { ...@@ -19,7 +19,6 @@ struct r1_private_data_s {
int working_disks; int working_disks;
int last_used; int last_used;
sector_t next_seq_sect; sector_t next_seq_sect;
mdk_thread_t *thread;
spinlock_t device_lock; spinlock_t device_lock;
/* for use when syncing mirrors: */ /* for use when syncing mirrors: */
...@@ -34,7 +33,6 @@ struct r1_private_data_s { ...@@ -34,7 +33,6 @@ struct r1_private_data_s {
mempool_t *r1bio_pool; mempool_t *r1bio_pool;
mempool_t *r1buf_pool; mempool_t *r1buf_pool;
char thread_name[MD_THREAD_NAME_MAX];
}; };
typedef struct r1_private_data_s conf_t; typedef struct r1_private_data_s conf_t;
......
...@@ -203,7 +203,6 @@ struct disk_info { ...@@ -203,7 +203,6 @@ struct disk_info {
struct raid5_private_data { struct raid5_private_data {
struct stripe_head **stripe_hashtbl; struct stripe_head **stripe_hashtbl;
mddev_t *mddev; mddev_t *mddev;
mdk_thread_t *thread;
struct disk_info disks[MD_SB_DISKS]; struct disk_info disks[MD_SB_DISKS];
struct disk_info *spare; struct disk_info *spare;
int chunk_size, level, algorithm; int chunk_size, level, algorithm;
...@@ -226,7 +225,6 @@ struct raid5_private_data { ...@@ -226,7 +225,6 @@ struct raid5_private_data {
* waiting for 25% to be free * waiting for 25% to be free
*/ */
spinlock_t device_lock; spinlock_t device_lock;
char thread_name[MD_THREAD_NAME_MAX];
}; };
typedef struct raid5_private_data raid5_conf_t; typedef struct raid5_private_data raid5_conf_t;
......
...@@ -190,6 +190,7 @@ RTN *FNAME ARGS \ ...@@ -190,6 +190,7 @@ RTN *FNAME ARGS \
else read_unlock(&(DETAIL)->hash_lock); \ else read_unlock(&(DETAIL)->hash_lock); \
if (set) \ if (set) \
cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \ cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \
if (set==1 && new) cache_fresh(DETAIL, &new->MEMBER, 0); \
if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \ if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \
return tmp; \ return tmp; \
} \ } \
......
...@@ -441,9 +441,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, u32 *authp) ...@@ -441,9 +441,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, u32 *authp)
return SVC_DENIED; return SVC_DENIED;
} }
/* Put NULL verifier */
svc_putu32(resv, RPC_AUTH_NULL);
svc_putu32(resv, 0);
key.m_class = rqstp->rq_server->sv_program->pg_class; key.m_class = rqstp->rq_server->sv_program->pg_class;
key.m_addr = rqstp->rq_addr.sin_addr; key.m_addr = rqstp->rq_addr.sin_addr;
...@@ -470,8 +467,13 @@ svcauth_unix_accept(struct svc_rqst *rqstp, u32 *authp) ...@@ -470,8 +467,13 @@ svcauth_unix_accept(struct svc_rqst *rqstp, u32 *authp)
} }
else rv = SVC_DROP; else rv = SVC_DROP;
if (rqstp->rq_client == NULL && rqstp->rq_proc != 0) if (rv == SVC_OK && rqstp->rq_client == NULL && rqstp->rq_proc != 0)
goto badcred; goto badcred;
/* Put NULL verifier */
svc_putu32(resv, RPC_AUTH_NULL);
svc_putu32(resv, 0);
return rv; return rv;
badcred: badcred:
......
...@@ -577,12 +577,15 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) ...@@ -577,12 +577,15 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
if (skb_is_nonlinear(skb)) { if (skb_is_nonlinear(skb)) {
/* we have to copy */ /* we have to copy */
local_bh_disable();
if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
local_bh_enable();
/* checksum error */ /* checksum error */
skb_free_datagram(svsk->sk_sk, skb); skb_free_datagram(svsk->sk_sk, skb);
svc_sock_received(svsk); svc_sock_received(svsk);
return 0; return 0;
} }
local_bh_enable();
skb_free_datagram(svsk->sk_sk, skb); skb_free_datagram(svsk->sk_sk, skb);
} else { } else {
/* we can use it in-place */ /* we can use it in-place */
...@@ -1435,7 +1438,7 @@ static struct cache_deferred_req * ...@@ -1435,7 +1438,7 @@ static struct cache_deferred_req *
svc_defer(struct cache_req *req) svc_defer(struct cache_req *req)
{ {
struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.head[0].iov_len); int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
struct svc_deferred_req *dr; struct svc_deferred_req *dr;
if (rqstp->rq_arg.page_len) if (rqstp->rq_arg.page_len)
...@@ -1444,6 +1447,7 @@ svc_defer(struct cache_req *req) ...@@ -1444,6 +1447,7 @@ svc_defer(struct cache_req *req)
dr = rqstp->rq_deferred; dr = rqstp->rq_deferred;
rqstp->rq_deferred = NULL; rqstp->rq_deferred = NULL;
} else { } else {
int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
/* FIXME maybe discard if size too large */ /* FIXME maybe discard if size too large */
dr = kmalloc(size, GFP_KERNEL); dr = kmalloc(size, GFP_KERNEL);
if (dr == NULL) if (dr == NULL)
...@@ -1452,8 +1456,8 @@ svc_defer(struct cache_req *req) ...@@ -1452,8 +1456,8 @@ svc_defer(struct cache_req *req)
dr->serv = rqstp->rq_server; dr->serv = rqstp->rq_server;
dr->prot = rqstp->rq_prot; dr->prot = rqstp->rq_prot;
dr->addr = rqstp->rq_addr; dr->addr = rqstp->rq_addr;
dr->argslen = rqstp->rq_arg.head[0].iov_len >> 2; dr->argslen = rqstp->rq_arg.len >> 2;
memcpy(dr->args, rqstp->rq_arg.head[0].iov_base, dr->argslen<<2); memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
} }
spin_lock(&rqstp->rq_server->sv_lock); spin_lock(&rqstp->rq_server->sv_lock);
rqstp->rq_sock->sk_inuse++; rqstp->rq_sock->sk_inuse++;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment