Commit 951f4be9 authored by Alexander Viro's avatar Alexander Viro Committed by Linus Torvalds

[PATCH] fix check_disk_change() deadlocks

Small, but tricky: fix for check_disk_change() deadlocks.
What we do is
	a) opening block device shifted from check_partition() to
	   grok_partitions(); check_partitions() takes opened
	   struct block_device.
	b) all callers of check_disk_change() fall in two groups -
	   ones that are called only from some ->open() and ones
	   that are _never_ called from ->open().  There is no
	   middle ground.  We split the thing in two functions -
	   check_disk_change() for the first class and full_check_....
	   for the second.  The former (ones inside ->open()) doesn't
	   touch partition tables but marks the bdev as "had been
	   invalidated".  In the end of do_open() we check if
	   bdev is marked and call wipe_partitions()/check_partition()
	   if it is - at that point bdev is fully set up and ready.
	c) ->bd_part_sem kludge is gone - we use ->bd_sem instead.
	   That is, do_open() on a partition grabs ->bd_sem on entire
	   disk and picks partition data while under it; do_open() on
	   entire disk rereads partition if needed before dropping
	   ->bd_sem (right before dropping it); BLKRRPART does
	   trylock on ->bd_sem and then checks ->bd_part_count -
	   same logics as before, except that we use ->bd_sem instead
	   of ->bd_part_sem.

That kills recursive open(), gives us the same exclusion rules as
we had and makes sure that actual IO (including rereading partition
tables) is done only when we are ready to do it.

It actually sounds a lot nastier than it is.  do_open() is a one sick
puppy right now, but we have everything in one place and _out_ of drivers
(and 20-odd equally sick puppies are gone from them, along with about
the same number of races).

Now we are almost ready to clean it up for good - all that remains to
do before that is to get the rest of drivers (cciss, DAC960, i2o and
a couple of ancients - xd and acsi) using per-disk gendisks.  Then
most of that crap will disappear.

BTW, the only generic ioctl remaining in the drivers is HDIO_GETGEO -
a lot of foo_ioctl() starts with if (cmd != HDIO_GETGEO) return -EINVAL; ;-)
parent 9804df6c
......@@ -310,7 +310,7 @@ struct block_device *bdget(dev_t dev)
new_bdev->bd_contains = NULL;
new_bdev->bd_inode = inode;
new_bdev->bd_part_count = 0;
sema_init(&new_bdev->bd_part_sem, 1);
new_bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = kdev;
inode->i_bdev = new_bdev;
......@@ -518,24 +518,34 @@ int check_disk_change(struct block_device *bdev)
disk = get_gendisk(dev);
part = disk->part + minor(dev) - disk->first_minor;
if (disk && disk->minor_shift) {
if (!down_trylock(&bdev->bd_part_sem)) {
if (!bdev->bd_part_count) {
if (wipe_partitions(dev) == 0) {
if (bdops->revalidate)
bdops->revalidate(dev);
grok_partitions(dev, part[0].nr_sects);
}
}
up(&bdev->bd_part_sem);
}
} else {
if (bdops->revalidate)
bdops->revalidate(dev);
}
if (disk && disk->minor_shift)
bdev->bd_invalidated = 1;
return 1;
}
int full_check_disk_change(struct block_device *bdev)
{
int res;
down(&bdev->bd_sem);
res = check_disk_change(bdev);
if (bdev->bd_invalidated && !bdev->bd_part_count) {
struct gendisk *g = get_gendisk(to_kdev_t(bdev->bd_dev));
struct hd_struct *part;
part = g->part + MINOR(bdev->bd_dev) - g->first_minor;
bdev->bd_invalidated = 0;
wipe_partitions(to_kdev_t(bdev->bd_dev));
if (part[0].nr_sects)
check_partition(g, bdev);
}
up(&bdev->bd_sem);
return res;
}
/*
* Will die as soon as two remaining callers get converted.
*/
int __check_disk_change(dev_t dev)
{
struct block_device *bdev = bdget(dev);
......@@ -544,11 +554,24 @@ int __check_disk_change(dev_t dev)
return 0;
if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW) < 0)
return 0;
res = check_disk_change(bdev);
res = full_check_disk_change(bdev);
blkdev_put(bdev, BDEV_RAW);
return res;
}
static void bd_set_size(struct block_device *bdev, loff_t size)
{
unsigned bsize = bdev_hardsect_size(bdev);
bdev->bd_inode->i_size = size;
while (bsize < PAGE_CACHE_SIZE) {
if (size & bsize)
break;
bsize <<= 1;
}
bdev->bd_block_size = bsize;
bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
static int do_open(struct block_device *bdev, struct inode *inode, struct file *file)
{
int ret = -ENXIO;
......@@ -595,53 +618,64 @@ static int do_open(struct block_device *bdev, struct inode *inode, struct file *
}
}
if (bdev->bd_contains == bdev) {
struct gendisk *g = get_gendisk(dev);
if (bdev->bd_op->open) {
ret = bdev->bd_op->open(inode, file);
if (ret)
goto out2;
}
} else {
down(&bdev->bd_contains->bd_part_sem);
bdev->bd_contains->bd_part_count++;
up(&bdev->bd_contains->bd_part_sem);
}
if (!bdev->bd_openers) {
struct blk_dev_struct *p = blk_dev + major(dev);
struct gendisk *g = get_gendisk(dev);
unsigned bsize = bdev_hardsect_size(bdev);
struct backing_dev_info *bdi;
sector_t sect = 0;
bdev->bd_offset = 0;
if (g) {
struct hd_struct *p;
p = g->part + minor(dev) - g->first_minor;
bdev->bd_inode->i_size = (loff_t) p->nr_sects << 9;
bdev->bd_offset = p->start_sect;
sect = p->nr_sects;
} else if (blk_size[major(dev)])
bdev->bd_inode->i_size =
(loff_t) blk_size[major(dev)][minor(dev)] << 10;
else
bdev->bd_inode->i_size = 0;
while (bsize < PAGE_CACHE_SIZE) {
if (bdev->bd_inode->i_size & bsize)
break;
bsize <<= 1;
}
bdev->bd_block_size = bsize;
bdev->bd_inode->i_blkbits = blksize_bits(bsize);
sect = blk_size[major(dev)][minor(dev)] << 1;
if (p->queue)
bdev->bd_queue = p->queue(dev);
else
bdev->bd_queue = &p->request_queue;
if (bdev->bd_inode->i_data.backing_dev_info ==
&default_backing_dev_info) {
struct backing_dev_info *bdi;
bd_set_size(bdev, (loff_t)sect << 9);
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
bdi = &default_backing_dev_info;
inode->i_data.backing_dev_info = bdi;
bdev->bd_inode->i_data.backing_dev_info = bdi;
}
if (bdev->bd_invalidated && !bdev->bd_part_count) {
struct hd_struct *part;
part = g->part + minor(dev) - g->first_minor;
bdev->bd_invalidated = 0;
wipe_partitions(dev);
if (part[0].nr_sects)
check_partition(g, bdev);
}
} else {
down(&bdev->bd_contains->bd_sem);
bdev->bd_contains->bd_part_count++;
if (!bdev->bd_openers) {
struct gendisk *g = get_gendisk(dev);
struct hd_struct *p;
p = g->part + minor(dev) - g->first_minor;
inode->i_data.backing_dev_info =
bdev->bd_inode->i_data.backing_dev_info =
bdev->bd_contains->bd_inode->i_data.backing_dev_info;
if (!p->nr_sects) {
bdev->bd_contains->bd_part_count--;
up(&bdev->bd_contains->bd_sem);
ret = -ENXIO;
goto out2;
}
bdev->bd_queue = bdev->bd_contains->bd_queue;
bdev->bd_offset = p->start_sect;
bd_set_size(bdev, (loff_t) p->nr_sects << 9);
}
up(&bdev->bd_contains->bd_sem);
}
bdev->bd_openers++;
up(&bdev->bd_sem);
......@@ -725,9 +759,9 @@ int blkdev_put(struct block_device *bdev, int kind)
if (bdev->bd_op->release)
ret = bdev->bd_op->release(bd_inode, NULL);
} else {
down(&bdev->bd_contains->bd_part_sem);
down(&bdev->bd_contains->bd_sem);
bdev->bd_contains->bd_part_count--;
up(&bdev->bd_contains->bd_part_sem);
up(&bdev->bd_contains->bd_sem);
}
if (!bdev->bd_openers) {
if (bdev->bd_op->owner)
......@@ -758,24 +792,25 @@ static int blkdev_reread_part(struct block_device *bdev)
struct hd_struct *part;
int res;
if (!disk)
if (!disk || !disk->minor_shift)
return -EINVAL;
part = disk->part + minor(dev) - disk->first_minor;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (down_trylock(&bdev->bd_part_sem));
if (down_trylock(&bdev->bd_sem));
return -EBUSY;
if (bdev->bd_part_count) {
up(&bdev->bd_part_sem);
up(&bdev->bd_sem);
return -EBUSY;
}
res = wipe_partitions(dev);
if (!res) {
if (bdev->bd_op->revalidate)
bdev->bd_op->revalidate(dev);
grok_partitions(dev, part[0].nr_sects);
if (part[0].nr_sects)
check_partition(disk, bdev);
}
up(&bdev->bd_part_sem);
up(&bdev->bd_sem);
return res;
}
......
......@@ -2390,7 +2390,7 @@ static int check_disc_changed (struct devfs_entry *de)
/* Ugly hack to disable messages about unable to read partition table */
tmp = warn_no_part;
warn_no_part = 0;
retval = check_disk_change(bdev);
retval = full_check_disk_change(bdev);
warn_no_part = tmp;
out:
devfs_put_ops (de);
......
......@@ -289,10 +289,13 @@ void driverfs_remove_partitions(struct gendisk *hd, int minor)
return;
}
static void check_partition(struct gendisk *hd, kdev_t dev)
/*
* DON'T EXPORT
*/
void check_partition(struct gendisk *hd, struct block_device *bdev)
{
devfs_handle_t de = NULL;
struct block_device *bdev;
kdev_t dev = to_kdev_t(bdev->bd_dev);
char buf[64];
struct parsed_partitions *state;
int i;
......@@ -314,9 +317,6 @@ static void check_partition(struct gendisk *hd, kdev_t dev)
if (n - COMPAQ_SMART2_MAJOR <= 7 || n - COMPAQ_CISS_MAJOR <= 7)
sprintf(state->name, "p");
}
bdev = bdget(kdev_t_to_nr(dev));
if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW))
goto out;
state->limit = 1<<hd->minor_shift;
for (i = 0; check_part[i]; i++) {
int res, j;
......@@ -328,7 +328,7 @@ static void check_partition(struct gendisk *hd, kdev_t dev)
if (res < 0) {
if (warn_no_part)
printk(" unable to read partition table\n");
goto setup_devfs;
goto out;
}
p = hd->part + minor(dev) - hd->first_minor;
for (j = 1; j < state->limit; j++) {
......@@ -340,12 +340,10 @@ static void check_partition(struct gendisk *hd, kdev_t dev)
md_autodetect_dev(mk_kdev(major(dev),minor(dev)+j));
#endif
}
goto setup_devfs;
goto out;
}
printk(" unknown partition table\n");
setup_devfs:
blkdev_put(bdev, BDEV_RAW);
out:
driverfs_create_partitions(hd, minor(dev));
devfs_register_partitions (hd, minor(dev), 0);
......@@ -463,34 +461,29 @@ void register_disk(struct gendisk *gdev, kdev_t dev, unsigned minors,
void grok_partitions(kdev_t dev, long size)
{
int minors, first_minor, end_minor;
struct block_device *bdev;
struct gendisk *g = get_gendisk(dev);
struct hd_struct *p;
if (!g)
return;
minors = 1 << g->minor_shift;
first_minor = minor(dev);
if (first_minor & (minors-1)) {
printk("grok_partitions: bad device 0x%02x:%02x\n",
major(dev), first_minor);
first_minor &= ~(minors-1);
}
end_minor = first_minor + minors;
p = g->part + first_minor - g->first_minor;
p = g->part + minor(dev) - g->first_minor;
p[0].nr_sects = size;
/* No minors to use for partitions */
if (minors == 1)
if (!g->minor_shift)
return;
/* No such device (e.g., media were just removed) */
if (!size)
return;
check_partition(g, mk_kdev(g->major, first_minor));
bdev = bdget(kdev_t_to_nr(dev));
if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW) < 0)
return;
check_partition(g, bdev);
blkdev_put(bdev, BDEV_RAW);
}
unsigned char *read_dev_sector(struct block_device *bdev, unsigned long n, Sector *p)
......
......@@ -502,7 +502,6 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
devfs_put_ops (de); /* Decrement module use count now we're safe */
if (error)
goto out;
check_disk_change(bdev);
error = -EACCES;
if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
goto out1;
......
......@@ -279,6 +279,7 @@ extern struct blk_dev_struct blk_dev[MAX_BLKDEV];
extern void grok_partitions(kdev_t dev, long size);
extern int wipe_partitions(kdev_t dev);
extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size);
extern void check_partition(struct gendisk *disk, struct block_device *bdev);
extern void generic_make_request(struct bio *bio);
extern inline request_queue_t *bdev_get_queue(struct block_device *bdev);
extern void blk_put_request(struct request *);
......
......@@ -356,8 +356,8 @@ struct block_device {
struct block_device * bd_contains;
unsigned bd_block_size;
unsigned long bd_offset;
struct semaphore bd_part_sem;
unsigned bd_part_count;
int bd_invalidated;
};
struct inode {
......@@ -1132,6 +1132,7 @@ extern int fs_may_remount_ro(struct super_block *);
#define bio_data_dir(bio) ((bio)->bi_rw & 1)
extern int check_disk_change(struct block_device *);
extern int full_check_disk_change(struct block_device *);
extern int __check_disk_change(dev_t);
extern int invalidate_inodes(struct super_block *);
extern int invalidate_device(kdev_t, int);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment