Commit a277f856 authored by Mauricio Faria de Oliveira's avatar Mauricio Faria de Oliveira Committed by Kelsey Skunberg

bcache: check and adjust logical block size for backing devices

BugLink: https://bugs.launchpad.net/bugs/1867916

It's possible for a block driver to set logical block size to
a value greater than page size incorrectly; e.g. bcache takes
the value from the superblock, set by the user w/ make-bcache.

This causes a BUG/NULL pointer dereference in the path:

  __blkdev_get()
  -> set_init_blocksize() // set i_blkbits based on ...
     -> bdev_logical_block_size()
        -> queue_logical_block_size() // ... this value
  -> bdev_disk_changed()
     ...
     -> blkdev_readpage()
        -> block_read_full_page()
           -> create_page_buffers() // size = 1 << i_blkbits
              -> create_empty_buffers() // give size/take pointer
                 -> alloc_page_buffers() // return NULL
                 .. BUG!

Because alloc_page_buffers() is called with size > PAGE_SIZE,
thus it initializes head = NULL, skips the loop, return head;
then create_empty_buffers() gets (and uses) the NULL pointer.

This has been around longer than commit ad6bf88a ("block:
fix an integer overflow in logical block size"); however, it
increased the range of values that can trigger the issue.

Previously only 8k/16k/32k (on x86/4k page size) would do it,
as greater values overflow unsigned short to zero, and queue_
logical_block_size() would then use the default of 512.

Now the range with unsigned int is much larger, and users w/
the 512k value, which happened to be zero'ed previously and
work fine, started to hit this issue -- as the zero is gone,
and queue_logical_block_size() does return 512k (>PAGE_SIZE.)

Fix this by checking the bcache device's logical block size,
and if it's greater than page size, fallback to the backing/
cached device's logical page size.

This doesn't affect cache devices as those are still checked
for block/page size in read_super(); only the backing/cached
devices are not.

Apparently it's a regression from commit 2903381f ("bcache:
Take data offset from the bdev superblock."), moving the check
into BCACHE_SB_VERSION_CDEV only. Now that we have superblocks
of backing devices out there with this larger value, we cannot
refuse to load them (i.e., have a similar check in _BDEV.)

Ideally perhaps bcache should use all values from the backing
device (physical/logical/io_min block size)? But for now just
fix the problematic case.

Test-case:

    # IMG=/root/disk.img
    # dd if=/dev/zero of=$IMG bs=1 count=0 seek=1G
    # DEV=$(losetup --find --show $IMG)
    # make-bcache --bdev $DEV --block 8k
      < see dmesg >

Before:

    # uname -r
    5.7.0-rc7

    [   55.944046] BUG: kernel NULL pointer dereference, address: 0000000000000000
    ...
    [   55.949742] CPU: 3 PID: 610 Comm: bcache-register Not tainted 5.7.0-rc7 #4
    ...
    [   55.952281] RIP: 0010:create_empty_buffers+0x1a/0x100
    ...
    [   55.966434] Call Trace:
    [   55.967021]  create_page_buffers+0x48/0x50
    [   55.967834]  block_read_full_page+0x49/0x380
    [   55.972181]  do_read_cache_page+0x494/0x610
    [   55.974780]  read_part_sector+0x2d/0xaa
    [   55.975558]  read_lba+0x10e/0x1e0
    [   55.977904]  efi_partition+0x120/0x5a6
    [   55.980227]  blk_add_partitions+0x161/0x390
    [   55.982177]  bdev_disk_changed+0x61/0xd0
    [   55.982961]  __blkdev_get+0x350/0x490
    [   55.983715]  __device_add_disk+0x318/0x480
    [   55.984539]  bch_cached_dev_run+0xc5/0x270
    [   55.986010]  register_bcache.cold+0x122/0x179
    [   55.987628]  kernfs_fop_write+0xbc/0x1a0
    [   55.988416]  vfs_write+0xb1/0x1a0
    [   55.989134]  ksys_write+0x5a/0xd0
    [   55.989825]  do_syscall_64+0x43/0x140
    [   55.990563]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
    [   55.991519] RIP: 0033:0x7f7d60ba3154
    ...

After:

    # uname -r
    5.7.0.bcachelbspgsz

    [   31.672460] bcache: bcache_device_init() bcache0: sb/logical block size (8192) greater than page size (4096) falling back to device logical block size (512)
    [   31.675133] bcache: register_bdev() registered backing device loop0

    # grep ^ /sys/block/bcache0/queue/*_block_size
    /sys/block/bcache0/queue/logical_block_size:512
    /sys/block/bcache0/queue/physical_block_size:8192
Reported-by: default avatarRyan Finnie <ryan@finnie.org>
Reported-by: default avatarSebastian Marsching <sebastian@marsching.com>
Signed-off-by: default avatarMauricio Faria de Oliveira <mfo@canonical.com>
Signed-off-by: default avatarColy Li <colyli@suse.de>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
(backported from commit dcacbc12)
[mfo: backport: hunks 1/3/4: adjust bcache_device_init() signature
 and calls for the lack of parameter make_request_fn from upstream.
 hunk 1: adjust bcache_device_init() signature for 'unsigned (int)',
         and refresh one context line.
 hunk 2: change from blk_queue_flag_set/clear() to set/clear_bit().]
Signed-off-by: default avatarMauricio Faria de Oliveira <mfo@canonical.com>
Acked-by: default avatarAndrea Righi <andrea.righi@canonical.com>
Acked-by: default avatarStefan Bader <stefan.bader@canonical.com>
Signed-off-by: default avatarKelsey Skunberg <kelsey.skunberg@canonical.com>
parent 8de9922b
...@@ -760,7 +760,7 @@ static void bcache_device_free(struct bcache_device *d) ...@@ -760,7 +760,7 @@ static void bcache_device_free(struct bcache_device *d)
} }
static int bcache_device_init(struct bcache_device *d, unsigned block_size, static int bcache_device_init(struct bcache_device *d, unsigned block_size,
sector_t sectors) sector_t sectors, struct block_device *cached_bdev)
{ {
struct request_queue *q; struct request_queue *q;
size_t n; size_t n;
...@@ -827,6 +827,21 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, ...@@ -827,6 +827,21 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
q->limits.io_min = block_size; q->limits.io_min = block_size;
q->limits.logical_block_size = block_size; q->limits.logical_block_size = block_size;
q->limits.physical_block_size = block_size; q->limits.physical_block_size = block_size;
if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
/*
* This should only happen with BCACHE_SB_VERSION_BDEV.
* Block/page size is checked for BCACHE_SB_VERSION_CDEV.
*/
pr_info("%s: sb/logical block size (%u) greater than page size "
"(%lu) falling back to device logical block size (%u)",
d->disk->disk_name, q->limits.logical_block_size,
PAGE_SIZE, bdev_logical_block_size(cached_bdev));
/* This also adjusts physical block size/min io size if needed */
blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
}
set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags); clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
...@@ -1164,7 +1179,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) ...@@ -1164,7 +1179,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
q->limits.raid_partial_stripes_expensive; q->limits.raid_partial_stripes_expensive;
ret = bcache_device_init(&dc->disk, block_size, ret = bcache_device_init(&dc->disk, block_size,
dc->bdev->bd_part->nr_sects - dc->sb.data_offset); dc->bdev->bd_part->nr_sects - dc->sb.data_offset, dc->bdev);
if (ret) if (ret)
return ret; return ret;
...@@ -1267,7 +1282,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) ...@@ -1267,7 +1282,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
kobject_init(&d->kobj, &bch_flash_dev_ktype); kobject_init(&d->kobj, &bch_flash_dev_ktype);
if (bcache_device_init(d, block_bytes(c), u->sectors)) if (bcache_device_init(d, block_bytes(c), u->sectors, NULL))
goto err; goto err;
bcache_device_attach(d, c, u - c->uuids); bcache_device_attach(d, c, u - c->uuids);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment