Commit d080827f authored by Linus Torvalds

Merge tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm updates from Dan Williams:
 "The bulk of this has appeared in -next and independently received a
  build success notification from the kbuild robot.  The 'for-4.5/block-
  dax' topic branch was rebased over the weekend to drop the "block
  device end-of-life" rework that Al would like to see re-implemented
  with a notifier, and to address bug reports against the badblocks
  integration.

  There is pending feedback against "libnvdimm: Add a poison list and
  export badblocks" received last week.  Linda identified some localized
  fixups that we will handle incrementally.

  Summary:

   - Media error handling: The 'badblocks' implementation that
     originated in md-raid is up-levelled to a generic capability of a
     block device.  This initial implementation is limited to being
     consulted in the pmem block-i/o path.  Later, 'badblocks' will be
     consulted when creating dax mappings.

   - Raw block device dax: For virtualization and other cases that want
     large contiguous mappings of persistent memory, add the capability
     to dax-mmap a block device directly.

   - Increased /dev/mem restrictions: Add an option to treat all
     io-memory as IORESOURCE_EXCLUSIVE, i.e. disable /dev/mem access
     while a driver is actively using an address range.  This behavior
     is controlled via the new CONFIG_IO_STRICT_DEVMEM option and can be
     overridden by the existing "iomem=relaxed" kernel command line
     option.

   - Miscellaneous fixes include a 'pfn'-device huge page alignment fix,
     block device shutdown crash fix, and other small libnvdimm fixes"

* tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (32 commits)
  block: kill disk_{check|set|clear|alloc}_badblocks
  libnvdimm, pmem: nvdimm_read_bytes() badblocks support
  pmem, dax: disable dax in the presence of bad blocks
  pmem: fail io-requests to known bad blocks
  libnvdimm: convert to statically allocated badblocks
  libnvdimm: don't fail init for full badblocks list
  block, badblocks: introduce devm_init_badblocks
  block: clarify badblocks lifetime
  badblocks: rename badblocks_free to badblocks_exit
  libnvdimm, pmem: move definition of nvdimm_namespace_add_poison to nd.h
  libnvdimm: Add a poison list and export badblocks
  nfit_test: Enable DSMs for all test NFITs
  md: convert to use the generic badblocks code
  block: Add badblock management for gendisks
  badblocks: Add core badblock management code
  block: fix del_gendisk() vs blkdev_ioctl crash
  block: enable dax for raw block devices
  block: introduce bdev_file_inode()
  restrict /dev/mem to idle io memory ranges
  arch: consolidate CONFIG_STRICT_DEVM in lib/Kconfig.debug
  ...
parents cbd88cd4 8b63b6bf
......@@ -2,6 +2,7 @@ config ARM
bool
default y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H
......
......@@ -15,20 +15,6 @@ config ARM_PTDUMP
kernel.
If in doubt, say "N"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
If this option is switched on, the /dev/mem file only allows
userspace access to memory mapped peripherals.
If in doubt, say Y.
# RMK wants arm kernels compiled with frame pointers or stack unwinding.
# If you know what you are doing and are willing to live without stack
# traces, you can get a slightly smaller kernel by setting this option to
......
......@@ -3,6 +3,7 @@ config ARM64
select ACPI_CCA_REQUIRED if ACPI
select ACPI_GENERIC_GSI if ACPI
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
......
......@@ -14,20 +14,6 @@ config ARM64_PTDUMP
kernel.
If in doubt, say "N"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
help
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
If this option is switched on, the /dev/mem file only allows
userspace access to memory mapped peripherals.
If in doubt, say Y.
config PID_IN_CONTEXTIDR
bool "Write the current PID to the CONTEXTIDR register"
help
......
......@@ -10,6 +10,7 @@ config FRV
select HAVE_DEBUG_BUGVERBOSE
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_CPU_DEVICES
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_WANT_IPC_PARSE_VERSION
select OLD_SIGSUSPEND3
select OLD_SIGACTION
......
......@@ -13,6 +13,7 @@ config M32R
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
select GENERIC_ATOMIC64
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_USES_GETTIMEOFFSET
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
......
......@@ -159,6 +159,7 @@ config PPC
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
select ARCH_HAS_DMA_SET_COHERENT_MASK
select ARCH_HAS_DEVMEM_IS_ALLOWED
select HAVE_ARCH_SECCOMP_FILTER
config GENERIC_CSUM
......
......@@ -335,18 +335,6 @@ config PPC_EARLY_DEBUG_CPM_ADDR
platform probing is done, all platforms selected must
share the same address.
config STRICT_DEVMEM
def_bool y
prompt "Filter access to /dev/mem"
help
This option restricts access to /dev/mem. If this option is
disabled, you allow userspace access to all memory, including
kernel and userspace memory. Accidental memory access is likely
to be disastrous.
Memory access is required for experts who want to debug the kernel.
If you are unsure, say Y.
config FAIL_IOMMU
bool "Fault-injection capability for IOMMU"
depends on FAULT_INJECTION
......
......@@ -66,6 +66,7 @@ config S390
def_bool y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_SG_CHAIN
......
......@@ -5,18 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
config STRICT_DEVMEM
def_bool y
prompt "Filter access to /dev/mem"
---help---
This option restricts access to /dev/mem. If this option is
disabled, you allow userspace access to all memory, including
kernel and userspace memory. Accidental memory access is likely
to be disastrous.
Memory access is required for experts who want to debug the kernel.
If you are unsure, say Y.
config S390_PTDUMP
bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
......
......@@ -19,6 +19,7 @@ config TILE
select VIRT_TO_BUS
select SYS_HYPERVISOR
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_CLOCKEVENTS
select MODULES_USE_ELF_RELA
......@@ -116,9 +117,6 @@ config ARCH_DISCONTIGMEM_DEFAULT
config TRACE_IRQFLAGS_SUPPORT
def_bool y
config STRICT_DEVMEM
def_bool y
# SMP is required for Tilera Linux.
config SMP
def_bool y
......
config UNICORE32
def_bool y
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_MEMBLOCK
......
......@@ -2,20 +2,6 @@ menu "Kernel hacking"
source "lib/Kconfig.debug"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
If this option is switched on, the /dev/mem file only allows
userspace access to memory mapped peripherals.
If in doubt, say Y.
config EARLY_PRINTK
def_bool DEBUG_OCD
help
......
......@@ -24,6 +24,7 @@ config X86
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
......
......@@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel. Note that with PAT support
enabled, even in this case there are restrictions on /dev/mem
use due to the cache aliasing requirements.
If this option is switched on, the /dev/mem file only allows
userspace access to PCI space and the BIOS code and data regions.
This is sufficient for dosemu and X and all common users of
/dev/mem.
If in doubt, say Y.
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
......
......@@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
partitions/
badblocks.o partitions/
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
......
/*
* Bad block management
*
* - Heavily based on MD badblocks code from Neil Brown
*
* Copyright (c) 2015, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/badblocks.h>
#include <linux/seqlock.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/slab.h>
/**
* badblocks_check() - check a given range for bad sectors
* @bb: the badblocks structure that holds all badblock information
* @s: sector (start) at which to check for badblocks
* @sectors: number of sectors to check for badblocks
* @first_bad: pointer to store location of the first badblock
* @bad_sectors: pointer to store number of badblocks after @first_bad
*
* We can record which blocks on each device are 'bad' and so just
* fail those blocks, or that stripe, rather than the whole device.
* Entries in the bad-block table are 64bits wide. This comprises:
* Length of bad-range, in sectors: 0-511 for lengths 1-512
* Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
* A 'shift' can be set so that larger blocks are tracked and
* consequently larger devices can be covered.
* 'Acknowledged' flag - 1 bit. - the most significant bit.
*
* Locking of the bad-block table uses a seqlock so badblocks_check
* might need to retry if it is very unlucky.
* We will sometimes want to check for bad blocks in a bi_end_io function,
* so we use the write_seqlock_irq variant.
*
* When looking for a bad block we specify a range and want to
* know if any block in the range is bad. So we binary-search
* to the last range that starts at-or-before the given endpoint,
* (or "before the sector after the target range")
* then see if it ends after the given start.
*
* Return:
* 0: there are no known bad blocks in the range
* 1: there are known bad blocks which are all acknowledged
* -1: there are bad blocks which have not yet been acknowledged in metadata.
* plus the start/length of the first bad section we overlap.
*/
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors)
{
int hi;
int lo;
u64 *p = bb->page;
int rv;
sector_t target = s + sectors;
unsigned seq;
if (bb->shift > 0) {
/* round the start down, and the end up */
s >>= bb->shift;
target += (1<<bb->shift) - 1;
target >>= bb->shift;
sectors = target - s;
}
/* 'target' is now the first block after the bad range */
retry:
seq = read_seqbegin(&bb->lock);
lo = 0;
rv = 0;
hi = bb->count;
/* Binary search between lo and hi for 'target'
* i.e. for the last range that starts before 'target'
*/
/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
* are known not to be the last range before target.
* VARIANT: hi-lo is the number of possible
* ranges, and decreases until it reaches 1
*/
while (hi - lo > 1) {
int mid = (lo + hi) / 2;
sector_t a = BB_OFFSET(p[mid]);
if (a < target)
/* This could still be the one, earlier ranges
* could not.
*/
lo = mid;
else
/* This and later ranges are definitely out. */
hi = mid;
}
/* 'lo' might be the last that started before target, but 'hi' isn't */
if (hi > lo) {
/* need to check all range that end after 's' to see if
* any are unacknowledged.
*/
while (lo >= 0 &&
BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
if (BB_OFFSET(p[lo]) < target) {
/* starts before the end, and finishes after
* the start, so they must overlap
*/
if (rv != -1 && BB_ACK(p[lo]))
rv = 1;
else
rv = -1;
*first_bad = BB_OFFSET(p[lo]);
*bad_sectors = BB_LEN(p[lo]);
}
lo--;
}
}
if (read_seqretry(&bb->lock, seq))
goto retry;
return rv;
}
EXPORT_SYMBOL_GPL(badblocks_check);
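/*
 * Illustrative sketch, not part of the patch: how a single table entry
 * packs a bad range into one u64 using the BB_* helpers used above
 * (9 bits of length for 1-512 sectors, 54 bits of start sector, and an
 * 'acknowledged' bit in the most significant bit).
 */
static __maybe_unused void badblocks_entry_encoding_example(void)
{
	/* sector 1024, 512 sectors long, acknowledged */
	u64 entry = BB_MAKE(1024, 512, 1);

	WARN_ON(BB_OFFSET(entry) != 1024);	/* start sector round-trips */
	WARN_ON(BB_LEN(entry) != 512);		/* length round-trips */
	WARN_ON(!BB_ACK(entry));		/* ack bit is set */
}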
/**
* badblocks_set() - Add a range of bad blocks to the table.
* @bb: the badblocks structure that holds all badblock information
* @s: first sector to mark as bad
* @sectors: number of sectors to mark as bad
* @acknowledged: whether to mark the bad sectors as acknowledged
*
* This might extend the table, or might contract it if two adjacent ranges
* can be merged. We binary-search to find the 'insertion' point, then
* decide how best to handle it.
*
* Return:
* 0: success
* 1: failed to set badblocks (out of space)
*/
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
int acknowledged)
{
u64 *p;
int lo, hi;
int rv = 0;
unsigned long flags;
if (bb->shift < 0)
/* badblocks are disabled */
return 0;
if (bb->shift) {
/* round the start down, and the end up */
sector_t next = s + sectors;
s >>= bb->shift;
next += (1<<bb->shift) - 1;
next >>= bb->shift;
sectors = next - s;
}
write_seqlock_irqsave(&bb->lock, flags);
p = bb->page;
lo = 0;
hi = bb->count;
/* Find the last range that starts at-or-before 's' */
while (hi - lo > 1) {
int mid = (lo + hi) / 2;
sector_t a = BB_OFFSET(p[mid]);
if (a <= s)
lo = mid;
else
hi = mid;
}
if (hi > lo && BB_OFFSET(p[lo]) > s)
hi = lo;
if (hi > lo) {
/* we found a range that might merge with the start
* of our new range
*/
sector_t a = BB_OFFSET(p[lo]);
sector_t e = a + BB_LEN(p[lo]);
int ack = BB_ACK(p[lo]);
if (e >= s) {
/* Yes, we can merge with a previous range */
if (s == a && s + sectors >= e)
/* new range covers old */
ack = acknowledged;
else
ack = ack && acknowledged;
if (e < s + sectors)
e = s + sectors;
if (e - a <= BB_MAX_LEN) {
p[lo] = BB_MAKE(a, e-a, ack);
s = e;
} else {
/* does not all fit in one range,
* make p[lo] maximal
*/
if (BB_LEN(p[lo]) != BB_MAX_LEN)
p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
s = a + BB_MAX_LEN;
}
sectors = e - s;
}
}
if (sectors && hi < bb->count) {
/* 'hi' points to the first range that starts after 's'.
* Maybe we can merge with the start of that range
*/
sector_t a = BB_OFFSET(p[hi]);
sector_t e = a + BB_LEN(p[hi]);
int ack = BB_ACK(p[hi]);
if (a <= s + sectors) {
/* merging is possible */
if (e <= s + sectors) {
/* full overlap */
e = s + sectors;
ack = acknowledged;
} else
ack = ack && acknowledged;
a = s;
if (e - a <= BB_MAX_LEN) {
p[hi] = BB_MAKE(a, e-a, ack);
s = e;
} else {
p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
s = a + BB_MAX_LEN;
}
sectors = e - s;
lo = hi;
hi++;
}
}
if (sectors == 0 && hi < bb->count) {
/* we might be able to combine lo and hi */
/* Note: 's' is at the end of 'lo' */
sector_t a = BB_OFFSET(p[hi]);
int lolen = BB_LEN(p[lo]);
int hilen = BB_LEN(p[hi]);
int newlen = lolen + hilen - (s - a);
if (s >= a && newlen < BB_MAX_LEN) {
/* yes, we can combine them */
int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
memmove(p + hi, p + hi + 1,
(bb->count - hi - 1) * 8);
bb->count--;
}
}
while (sectors) {
/* didn't merge (it all).
* Need to add a range just before 'hi'
*/
if (bb->count >= MAX_BADBLOCKS) {
/* No room for more */
rv = 1;
break;
} else {
int this_sectors = sectors;
memmove(p + hi + 1, p + hi,
(bb->count - hi) * 8);
bb->count++;
if (this_sectors > BB_MAX_LEN)
this_sectors = BB_MAX_LEN;
p[hi] = BB_MAKE(s, this_sectors, acknowledged);
sectors -= this_sectors;
s += this_sectors;
}
}
bb->changed = 1;
if (!acknowledged)
bb->unacked_exist = 1;
write_sequnlock_irqrestore(&bb->lock, flags);
return rv;
}
EXPORT_SYMBOL_GPL(badblocks_set);
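/*
 * Illustrative usage sketch, not part of the patch (and not the actual
 * pmem code): a block driver records a discovered media error with
 * badblocks_set() and consults badblocks_check() before touching the
 * media.  The function name below is hypothetical.
 */
static __maybe_unused int example_driver_rw(struct badblocks *bb,
		sector_t sector, int nr_sectors)
{
	sector_t first_bad;
	int num_bad;

	/* sectors 100-107 went bad; record them as acknowledged */
	if (badblocks_set(bb, 100, 8, 1))
		return -ENOSPC;		/* table is full */

	/* fail any request that overlaps a known-bad range */
	if (badblocks_check(bb, sector, nr_sectors, &first_bad, &num_bad))
		return -EIO;

	return 0;			/* range is clean, do the I/O */
}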
/**
* badblocks_clear() - Remove a range of bad blocks from the table.
* @bb: the badblocks structure that holds all badblock information
* @s: first sector to clear
* @sectors: number of sectors to clear
*
* This may involve extending the table if we split a region,
* but it must not fail. So if the table becomes full, we just
* drop the remove request.
*
* Return:
* 0: success
* -ENOSPC: failed to clear badblocks (no room to split an existing range)
*/
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
u64 *p;
int lo, hi;
sector_t target = s + sectors;
int rv = 0;
if (bb->shift > 0) {
/* When clearing we round the start up and the end down.
* This should not matter as the shift should align with
* the block size and no rounding should ever be needed.
* However it is better to think a block is bad when it
* isn't than to think a block is not bad when it is.
*/
s += (1<<bb->shift) - 1;
s >>= bb->shift;
target >>= bb->shift;
sectors = target - s;
}
write_seqlock_irq(&bb->lock);
p = bb->page;
lo = 0;
hi = bb->count;
/* Find the last range that starts before 'target' */
while (hi - lo > 1) {
int mid = (lo + hi) / 2;
sector_t a = BB_OFFSET(p[mid]);
if (a < target)
lo = mid;
else
hi = mid;
}
if (hi > lo) {
/* p[lo] is the last range that could overlap the
* current range. Earlier ranges could also overlap,
* but only this one can overlap the end of the range.
*/
if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
/* Partial overlap, leave the tail of this range */
int ack = BB_ACK(p[lo]);
sector_t a = BB_OFFSET(p[lo]);
sector_t end = a + BB_LEN(p[lo]);
if (a < s) {
/* we need to split this range */
if (bb->count >= MAX_BADBLOCKS) {
rv = -ENOSPC;
goto out;
}
memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
bb->count++;
p[lo] = BB_MAKE(a, s-a, ack);
lo++;
}
p[lo] = BB_MAKE(target, end - target, ack);
/* there is no longer an overlap */
hi = lo;
lo--;
}
while (lo >= 0 &&
BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
/* This range does overlap */
if (BB_OFFSET(p[lo]) < s) {
/* Keep the early parts of this range. */
int ack = BB_ACK(p[lo]);
sector_t start = BB_OFFSET(p[lo]);
p[lo] = BB_MAKE(start, s - start, ack);
/* now low doesn't overlap, so.. */
break;
}
lo--;
}
/* 'lo' is strictly before, 'hi' is strictly after,
* anything between needs to be discarded
*/
if (hi - lo > 1) {
memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
bb->count -= (hi - lo - 1);
}
}
bb->changed = 1;
out:
write_sequnlock_irq(&bb->lock);
return rv;
}
EXPORT_SYMBOL_GPL(badblocks_clear);
/**
* ack_all_badblocks() - Acknowledge all bad blocks in a list.
* @bb: the badblocks structure that holds all badblock information
*
* This only succeeds if ->changed is clear. It is used by
* in-kernel metadata updates
*/
void ack_all_badblocks(struct badblocks *bb)
{
if (bb->page == NULL || bb->changed)
/* no point even trying */
return;
write_seqlock_irq(&bb->lock);
if (bb->changed == 0 && bb->unacked_exist) {
u64 *p = bb->page;
int i;
for (i = 0; i < bb->count ; i++) {
if (!BB_ACK(p[i])) {
sector_t start = BB_OFFSET(p[i]);
int len = BB_LEN(p[i]);
p[i] = BB_MAKE(start, len, 1);
}
}
bb->unacked_exist = 0;
}
write_sequnlock_irq(&bb->lock);
}
EXPORT_SYMBOL_GPL(ack_all_badblocks);
/**
* badblocks_show() - sysfs access to bad-blocks list
* @bb: the badblocks structure that holds all badblock information
* @page: buffer received from sysfs
* @unack: whether to show only unacknowledged badblocks
*
* Return:
* Length of returned data
*/
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
{
size_t len;
int i;
u64 *p = bb->page;
unsigned seq;
if (bb->shift < 0)
return 0;
retry:
seq = read_seqbegin(&bb->lock);
len = 0;
i = 0;
while (len < PAGE_SIZE && i < bb->count) {
sector_t s = BB_OFFSET(p[i]);
unsigned int length = BB_LEN(p[i]);
int ack = BB_ACK(p[i]);
i++;
if (unack && ack)
continue;
len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
(unsigned long long)s << bb->shift,
length << bb->shift);
}
if (unack && len == 0)
bb->unacked_exist = 0;
if (read_seqretry(&bb->lock, seq))
goto retry;
return len;
}
EXPORT_SYMBOL_GPL(badblocks_show);
/**
* badblocks_store() - sysfs access to bad-blocks list
* @bb: the badblocks structure that holds all badblock information
* @page: buffer received from sysfs
* @len: length of data received from sysfs
* @unack: whether to add the new blocks as unacknowledged
*
* Return:
* Length of the buffer processed or -ve error.
*/
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
int unack)
{
unsigned long long sector;
int length;
char newline;
switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
case 3:
if (newline != '\n')
return -EINVAL;
case 2:
if (length <= 0)
return -EINVAL;
break;
default:
return -EINVAL;
}
if (badblocks_set(bb, sector, length, !unack))
return -ENOSPC;
else
return len;
}
EXPORT_SYMBOL_GPL(badblocks_store);
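/*
 * Illustrative sysfs interaction (device name hypothetical): once a
 * gendisk exposes these helpers through its 'badblocks' attribute, the
 * list can be read and extended from userspace, e.g.:
 *
 *   # echo "1536 8" > /sys/block/pmem0/badblocks
 *   # cat /sys/block/pmem0/badblocks
 *   1536 8
 *
 * The write feeds badblocks_store(), which adds the range as
 * acknowledged; the read is badblocks_show(), one "sector length" pair
 * per line.
 */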
static int __badblocks_init(struct device *dev, struct badblocks *bb,
int enable)
{
bb->dev = dev;
bb->count = 0;
if (enable)
bb->shift = 0;
else
bb->shift = -1;
if (dev)
bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
else
bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!bb->page) {
bb->shift = -1;
return -ENOMEM;
}
seqlock_init(&bb->lock);
return 0;
}
/**
* badblocks_init() - initialize the badblocks structure
* @bb: the badblocks structure that holds all badblock information
* @enable: whether to enable badblocks accounting
*
* Return:
* 0: success
* -ve errno: on error
*/
int badblocks_init(struct badblocks *bb, int enable)
{
return __badblocks_init(NULL, bb, enable);
}
EXPORT_SYMBOL_GPL(badblocks_init);
int devm_init_badblocks(struct device *dev, struct badblocks *bb)
{
if (!bb)
return -EINVAL;
return __badblocks_init(dev, bb, 1);
}
EXPORT_SYMBOL_GPL(devm_init_badblocks);
/**
* badblocks_exit() - free the badblocks structure
* @bb: the badblocks structure that holds all badblock information
*/
void badblocks_exit(struct badblocks *bb)
{
if (!bb)
return;
if (bb->dev)
devm_kfree(bb->dev, bb->page);
else
kfree(bb->page);
bb->page = NULL;
}
EXPORT_SYMBOL_GPL(badblocks_exit);
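/*
 * Illustrative lifecycle sketch, not part of the patch: a driver with a
 * struct device can let devm manage the page backing the table,
 * otherwise it pairs badblocks_init() with badblocks_exit().
 */
static __maybe_unused int example_bb_setup(struct device *dev,
		struct badblocks *bb)
{
	if (dev)
		/* the page is devm-allocated and freed with the device */
		return devm_init_badblocks(dev, bb);

	/* caller must call badblocks_exit(bb) when done */
	return badblocks_init(bb, 1);
}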
......@@ -20,6 +20,7 @@
#include <linux/idr.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>
#include "blk.h"
......@@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir);
disk->driverfs_dev = NULL;
if (!sysfs_deprecated)
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
......@@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk)
}
EXPORT_SYMBOL(del_gendisk);
/* sysfs access to bad-blocks list. */
static ssize_t disk_badblocks_show(struct device *dev,
struct device_attribute *attr,
char *page)
{
struct gendisk *disk = dev_to_disk(dev);
if (!disk->bb)
return sprintf(page, "\n");
return badblocks_show(disk->bb, page, 0);
}
static ssize_t disk_badblocks_store(struct device *dev,
struct device_attribute *attr,
const char *page, size_t len)
{
struct gendisk *disk = dev_to_disk(dev);
if (!disk->bb)
return -ENXIO;
return badblocks_store(disk->bb, page, len, 0);
}
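/*
 * Illustrative wiring sketch, not part of the patch: the handlers above
 * do nothing until a driver points the gendisk at its badblocks
 * instance, typically during probe before add_disk().
 */
static __maybe_unused void example_attach_badblocks(struct gendisk *disk,
		struct badblocks *bb)
{
	disk->bb = bb;	/* consulted by disk_badblocks_show()/_store() */
}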
/**
* get_gendisk - get partitioning information for a given device
* @devt: device to get partitioning information for
......@@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
disk_badblocks_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
......@@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
&dev_attr_badblocks.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
......
......@@ -4,6 +4,7 @@
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/badblocks.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
......@@ -406,6 +407,71 @@ static inline int is_unrecognized_ioctl(int ret)
ret == -ENOIOCTLCMD;
}
#ifdef CONFIG_FS_DAX
bool blkdev_dax_capable(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
if (!disk->fops->direct_access)
return false;
/*
* If the partition is not aligned on a page boundary, we can't
* do dax I/O to it.
*/
if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
return false;
/*
* If the device has known bad blocks, force all I/O through the
* driver / page cache.
*
* TODO: support finer grained dax error handling
*/
if (disk->bb && disk->bb->count)
return false;
return true;
}
static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
{
unsigned long arg;
int rc = 0;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (get_user(arg, (int __user *)(argp)))
return -EFAULT;
arg = !!arg;
if (arg == !!(bdev->bd_inode->i_flags & S_DAX))
return 0;
if (arg)
arg = S_DAX;
if (arg && !blkdev_dax_capable(bdev))
return -ENOTTY;
mutex_lock(&bdev->bd_inode->i_mutex);
if (bdev->bd_map_count == 0)
inode_set_flags(bdev->bd_inode, arg, S_DAX);
else
rc = -EBUSY;
mutex_unlock(&bdev->bd_inode->i_mutex);
return rc;
}
#else
static int blkdev_daxset(struct block_device *bdev, int arg)
{
if (arg)
return -ENOTTY;
return 0;
}
#endif
static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
......@@ -568,6 +634,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
case BLKDAXSET:
return blkdev_daxset(bdev, arg);
case BLKDAXGET:
return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
break;
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, argp);
case IOC_PR_RESERVE:
......
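An illustrative userspace sketch of the BLKDAXSET/BLKDAXGET ioctls handled above; the device path and the availability of these ioctl numbers in the uapi headers are assumptions, not shown by this diff:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* assumed to provide BLKDAXSET / BLKDAXGET */

int main(void)
{
	int fd = open("/dev/pmem0", O_RDWR);	/* hypothetical pmem device */
	int enable = 1, state = 0;

	if (fd < 0)
		return 1;
	/* request raw block-device DAX; fails with ENOTTY if not DAX-capable */
	if (ioctl(fd, BLKDAXSET, &enable))
		perror("BLKDAXSET");
	/* read back whether S_DAX is currently set on the block inode */
	if (ioctl(fd, BLKDAXGET, &state) == 0)
		printf("dax: %d\n", state);
	close(fd);
	return 0;
}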
......@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ndctl.h>
#include <linux/delay.h>
#include <linux/list.h>
#include <linux/acpi.h>
#include <linux/sort.h>
......@@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
/* devm will free nfit_blk */
}
static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc,
struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
{
cmd->address = addr;
cmd->length = length;
return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
sizeof(*cmd));
}
static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
{
int rc;
cmd->address = addr;
cmd->length = length;
cmd->type = ND_ARS_PERSISTENT;
while (1) {
rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
sizeof(*cmd));
if (rc)
return rc;
switch (cmd->status) {
case 0:
return 0;
case 1:
/* ARS unsupported, but we should never get here */
return 0;
case 2:
return -EINVAL;
case 3:
/* ARS is in progress */
msleep(1000);
break;
default:
return -ENXIO;
}
}
}
static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
struct nd_cmd_ars_status *cmd)
{
int rc;
while (1) {
rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
sizeof(*cmd));
if (rc || cmd->status & 0xffff)
return -ENXIO;
/* Check extended status (Upper two bytes) */
switch (cmd->status >> 16) {
case 0:
return 0;
case 1:
/* ARS is in progress */
msleep(1000);
break;
case 2:
/* No ARS performed for the current boot */
return 0;
default:
return -ENXIO;
}
}
}
static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
struct nd_cmd_ars_status *ars_status, u64 start)
{
int rc;
u32 i;
/*
* The address field returned by ars_status should be either
* less than or equal to the address we last started ARS for.
* The (start, length) returned by ars_status should also have
* non-zero overlap with the range we started ARS for.
* If this is not the case, bail.
*/
if (ars_status->address > start ||
(ars_status->address + ars_status->length < start))
return -ENXIO;
for (i = 0; i < ars_status->num_records; i++) {
rc = nvdimm_bus_add_poison(nvdimm_bus,
ars_status->records[i].err_address,
ars_status->records[i].length);
if (rc)
return rc;
}
return 0;
}
static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
struct nd_region_desc *ndr_desc)
{
struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
struct nd_cmd_ars_status *ars_status = NULL;
struct nd_cmd_ars_start *ars_start = NULL;
struct nd_cmd_ars_cap *ars_cap = NULL;
u64 start, len, cur, remaining;
int rc;
ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
if (!ars_cap)
return -ENOMEM;
start = ndr_desc->res->start;
len = ndr_desc->res->end - ndr_desc->res->start + 1;
rc = ars_get_cap(nd_desc, ars_cap, start, len);
if (rc)
goto out;
/*
* If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in
* extended status is not set, skip this but continue initialization
*/
if ((ars_cap->status & 0xffff) ||
!(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
dev_warn(acpi_desc->dev,
"ARS unsupported (status: 0x%x), won't create an error list\n",
ars_cap->status);
goto out;
}
/*
* Check if a full-range ARS has been run. If so, use those results
* without having to start a new ARS.
*/
ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status),
GFP_KERNEL);
if (!ars_status) {
rc = -ENOMEM;
goto out;
}
rc = ars_get_status(nd_desc, ars_status);
if (rc)
goto out;
if (ars_status->address <= start &&
(ars_status->address + ars_status->length >= start + len)) {
rc = ars_status_process_records(nvdimm_bus, ars_status, start);
goto out;
}
/*
* ARS_STATUS can overflow if the number of poison entries found is
* greater than the maximum buffer size (ars_cap->max_ars_out)
* To detect overflow, check if the length field of ars_status
* is less than the length we supplied. If so, process the
* error entries we got, adjust the start point, and start again
*/
ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL);
if (!ars_start)
return -ENOMEM;
cur = start;
remaining = len;
do {
u64 done, end;
rc = ars_do_start(nd_desc, ars_start, cur, remaining);
if (rc)
goto out;
rc = ars_get_status(nd_desc, ars_status);
if (rc)
goto out;
rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
if (rc)
goto out;
end = min(cur + remaining,
ars_status->address + ars_status->length);
done = end - cur;
cur += done;
remaining -= done;
} while (remaining);
out:
kfree(ars_cap);
kfree(ars_start);
kfree(ars_status);
return rc;
}
static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
struct acpi_nfit_memory_map *memdev,
......@@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
nvdimm_bus = acpi_desc->nvdimm_bus;
if (nfit_spa_type(spa) == NFIT_SPA_PM) {
rc = acpi_nfit_find_poison(acpi_desc, ndr_desc);
if (rc) {
dev_err(acpi_desc->dev,
"error while performing ARS to find poison: %d\n",
rc);
return rc;
}
if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
return -ENOMEM;
} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
......
......@@ -34,6 +34,7 @@
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
......@@ -710,8 +711,7 @@ void md_rdev_clear(struct md_rdev *rdev)
put_page(rdev->bb_page);
rdev->bb_page = NULL;
}
kfree(rdev->badblocks.page);
rdev->badblocks.page = NULL;
badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);
......@@ -1361,8 +1361,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
return cpu_to_le32(csum);
}
static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
int acknowledged);
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
struct mdp_superblock_1 *sb;
......@@ -1487,8 +1485,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
count <<= sb->bblog_shift;
if (bb + 1 == 0)
break;
if (md_set_badblocks(&rdev->badblocks,
sector, count, 1) == 0)
if (badblocks_set(&rdev->badblocks, sector, count, 1))
return -EINVAL;
}
} else if (sb->bblog_offset != 0)
......@@ -2320,7 +2317,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
rdev_for_each(rdev, mddev) {
if (rdev->badblocks.changed) {
rdev->badblocks.changed = 0;
md_ack_all_badblocks(&rdev->badblocks);
ack_all_badblocks(&rdev->badblocks);
md_error(mddev, rdev);
}
clear_bit(Blocked, &rdev->flags);
......@@ -2446,7 +2443,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
clear_bit(Blocked, &rdev->flags);
if (any_badblocks_changed)
md_ack_all_badblocks(&rdev->badblocks);
ack_all_badblocks(&rdev->badblocks);
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
......@@ -3054,11 +3051,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
static ssize_t
badblocks_show(struct badblocks *bb, char *page, int unack);
static ssize_t
badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
/* sysfs access to bad-blocks list.
* We present two files.
* 'bad-blocks' lists sector numbers and lengths of ranges that
* are recorded as bad. The list is truncated to fit within
* the one-page limit of sysfs.
* Writing "sector length" to this file adds an acknowledged
* bad block list.
* 'unacknowledged-bad-blocks' lists bad blocks that have not yet
* been acknowledged. Writing to this file adds bad blocks
* without acknowledging them. This is largely for testing.
*/
static ssize_t bb_show(struct md_rdev *rdev, char *page)
{
return badblocks_show(&rdev->badblocks, page, 0);
......@@ -3173,14 +3176,7 @@ int md_rdev_init(struct md_rdev *rdev)
* This reserves the space even on arrays where it cannot
* be used - I wonder if that matters
*/
rdev->badblocks.count = 0;
rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
seqlock_init(&rdev->badblocks.lock);
if (rdev->badblocks.page == NULL)
return -ENOMEM;
return 0;
return badblocks_init(&rdev->badblocks, 0);
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/*
......@@ -8489,254 +8485,9 @@ void md_finish_reshape(struct mddev *mddev)
}
EXPORT_SYMBOL(md_finish_reshape);
/* Bad block management.
* We can record which blocks on each device are 'bad' and so just
* fail those blocks, or that stripe, rather than the whole device.
* Entries in the bad-block table are 64bits wide. This comprises:
* Length of bad-range, in sectors: 0-511 for lengths 1-512
* Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
* A 'shift' can be set so that larger blocks are tracked and
* consequently larger devices can be covered.
* 'Acknowledged' flag - 1 bit. - the most significant bit.
*
* Locking of the bad-block table uses a seqlock so md_is_badblock
* might need to retry if it is very unlucky.
* We will sometimes want to check for bad blocks in a bi_end_io function,
* so we use the write_seqlock_irq variant.
*
* When looking for a bad block we specify a range and want to
* know if any block in the range is bad. So we binary-search
* to the last range that starts at-or-before the given endpoint,
* (or "before the sector after the target range")
* then see if it ends after the given start.
* We return
* 0 if there are no known bad blocks in the range
* 1 if there are known bad block which are all acknowledged
* -1 if there are bad blocks which have not yet been acknowledged in metadata.
* plus the start/length of the first bad section we overlap.
*/
int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors)
{
int hi;
int lo;
u64 *p = bb->page;
int rv;
sector_t target = s + sectors;
unsigned seq;
if (bb->shift > 0) {
/* round the start down, and the end up */
s >>= bb->shift;
target += (1<<bb->shift) - 1;
target >>= bb->shift;
sectors = target - s;
}
/* 'target' is now the first block after the bad range */
retry:
seq = read_seqbegin(&bb->lock);
lo = 0;
rv = 0;
hi = bb->count;
/* Binary search between lo and hi for 'target'
* i.e. for the last range that starts before 'target'
*/
/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
* are known not to be the last range before target.
* VARIANT: hi-lo is the number of possible
* ranges, and decreases until it reaches 1
*/
while (hi - lo > 1) {
int mid = (lo + hi) / 2;
sector_t a = BB_OFFSET(p[mid]);
if (a < target)
/* This could still be the one, earlier ranges
* could not. */
lo = mid;
else
/* This and later ranges are definitely out. */
hi = mid;
}
/* 'lo' might be the last that started before target, but 'hi' isn't */
if (hi > lo) {
/* need to check all range that end after 's' to see if
* any are unacknowledged.
*/
while (lo >= 0 &&
BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
if (BB_OFFSET(p[lo]) < target) {
/* starts before the end, and finishes after
* the start, so they must overlap
*/
if (rv != -1 && BB_ACK(p[lo]))
rv = 1;
else
rv = -1;
*first_bad = BB_OFFSET(p[lo]);
*bad_sectors = BB_LEN(p[lo]);
}
lo--;
}
}
if (read_seqretry(&bb->lock, seq))
goto retry;
return rv;
}
EXPORT_SYMBOL_GPL(md_is_badblock);
/*
* Add a range of bad blocks to the table.
* This might extend the table, or might contract it
* if two adjacent ranges can be merged.
* We binary-search to find the 'insertion' point, then
* decide how best to handle it.
*/
static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
int acknowledged)
{
u64 *p;
int lo, hi;
int rv = 1;
unsigned long flags;
if (bb->shift < 0)
/* badblocks are disabled */
return 0;
if (bb->shift) {
/* round the start down, and the end up */
sector_t next = s + sectors;
s >>= bb->shift;
next += (1<<bb->shift) - 1;
next >>= bb->shift;
sectors = next - s;
}
write_seqlock_irqsave(&bb->lock, flags);
p = bb->page;
lo = 0;
hi = bb->count;
/* Find the last range that starts at-or-before 's' */
while (hi - lo > 1) {
int mid = (lo + hi) / 2;
sector_t a = BB_OFFSET(p[mid]);
if (a <= s)
lo = mid;
else
hi = mid;
}
if (hi > lo && BB_OFFSET(p[lo]) > s)
hi = lo;
if (hi > lo) {
/* we found a range that might merge with the start
* of our new range
*/
sector_t a = BB_OFFSET(p[lo]);
sector_t e = a + BB_LEN(p[lo]);
int ack = BB_ACK(p[lo]);
if (e >= s) {
/* Yes, we can merge with a previous range */
if (s == a && s + sectors >= e)
/* new range covers old */
ack = acknowledged;
else
ack = ack && acknowledged;
if (e < s + sectors)
e = s + sectors;
if (e - a <= BB_MAX_LEN) {
p[lo] = BB_MAKE(a, e-a, ack);
s = e;
} else {
/* does not all fit in one range,
* make p[lo] maximal
*/
if (BB_LEN(p[lo]) != BB_MAX_LEN)
p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
s = a + BB_MAX_LEN;
}
sectors = e - s;
}
}
if (sectors && hi < bb->count) {
/* 'hi' points to the first range that starts after 's'.
* Maybe we can merge with the start of that range */
sector_t a = BB_OFFSET(p[hi]);
sector_t e = a + BB_LEN(p[hi]);
int ack = BB_ACK(p[hi]);
if (a <= s + sectors) {
/* merging is possible */
if (e <= s + sectors) {
/* full overlap */
e = s + sectors;
ack = acknowledged;
} else
ack = ack && acknowledged;
a = s;
if (e - a <= BB_MAX_LEN) {
p[hi] = BB_MAKE(a, e-a, ack);
s = e;
} else {
p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
s = a + BB_MAX_LEN;
}
sectors = e - s;
lo = hi;
hi++;
}
}
if (sectors == 0 && hi < bb->count) {
/* we might be able to combine lo and hi */
/* Note: 's' is at the end of 'lo' */
sector_t a = BB_OFFSET(p[hi]);
int lolen = BB_LEN(p[lo]);
int hilen = BB_LEN(p[hi]);
int newlen = lolen + hilen - (s - a);
if (s >= a && newlen < BB_MAX_LEN) {
/* yes, we can combine them */
int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
memmove(p + hi, p + hi + 1,
(bb->count - hi - 1) * 8);
bb->count--;
}
}
while (sectors) {
/* didn't merge (it all).
* Need to add a range just before 'hi' */
if (bb->count >= MD_MAX_BADBLOCKS) {
/* No room for more */
rv = 0;
break;
} else {
int this_sectors = sectors;
memmove(p + hi + 1, p + hi,
(bb->count - hi) * 8);
bb->count++;
if (this_sectors > BB_MAX_LEN)
this_sectors = BB_MAX_LEN;
p[hi] = BB_MAKE(s, this_sectors, acknowledged);
sectors -= this_sectors;
s += this_sectors;
}
}
bb->changed = 1;
if (!acknowledged)
bb->unacked_exist = 1;
write_sequnlock_irqrestore(&bb->lock, flags);
return rv;
}
/* Bad block management */
/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new)
{
......@@ -8745,114 +8496,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->new_data_offset;
else
s += rdev->data_offset;
rv = md_set_badblocks(&rdev->badblocks,
s, sectors, 0);
if (rv) {
rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
if (rv == 0) {
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
}
return rv;
return 1;
} else
return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
/*
* Remove a range of bad blocks from the table.
* This may involve extending the table if we spilt a region,
* but it must not fail. So if the table becomes full, we just
* drop the remove request.
*/
static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
{
u64 *p;
int lo, hi;
sector_t target = s + sectors;
int rv = 0;
if (bb->shift > 0) {
/* When clearing we round the start up and the end down.
* This should not matter as the shift should align with
* the block size and no rounding should ever be needed.
* However it is better the think a block is bad when it
* isn't than to think a block is not bad when it is.
*/
s += (1<<bb->shift) - 1;
s >>= bb->shift;
target >>= bb->shift;
sectors = target - s;
}
write_seqlock_irq(&bb->lock);
p = bb->page;
lo = 0;
hi = bb->count;
/* Find the last range that starts before 'target' */
while (hi - lo > 1) {
int mid = (lo + hi) / 2;
sector_t a = BB_OFFSET(p[mid]);
if (a < target)
lo = mid;
else
hi = mid;
}
if (hi > lo) {
/* p[lo] is the last range that could overlap the
* current range. Earlier ranges could also overlap,
* but only this one can overlap the end of the range.
*/
if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
/* Partial overlap, leave the tail of this range */
int ack = BB_ACK(p[lo]);
sector_t a = BB_OFFSET(p[lo]);
sector_t end = a + BB_LEN(p[lo]);
if (a < s) {
/* we need to split this range */
if (bb->count >= MD_MAX_BADBLOCKS) {
rv = -ENOSPC;
goto out;
}
memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
bb->count++;
p[lo] = BB_MAKE(a, s-a, ack);
lo++;
}
p[lo] = BB_MAKE(target, end - target, ack);
/* there is no longer an overlap */
hi = lo;
lo--;
}
while (lo >= 0 &&
BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
/* This range does overlap */
if (BB_OFFSET(p[lo]) < s) {
/* Keep the early parts of this range. */
int ack = BB_ACK(p[lo]);
sector_t start = BB_OFFSET(p[lo]);
p[lo] = BB_MAKE(start, s - start, ack);
/* now low doesn't overlap, so.. */
break;
}
lo--;
}
/* 'lo' is strictly before, 'hi' is strictly after,
* anything between needs to be discarded
*/
if (hi - lo > 1) {
memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
bb->count -= (hi - lo - 1);
}
}
bb->changed = 1;
out:
write_sequnlock_irq(&bb->lock);
return rv;
}
int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new)
{
......@@ -8860,133 +8516,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->new_data_offset;
else
s += rdev->data_offset;
return md_clear_badblocks(&rdev->badblocks,
return badblocks_clear(&rdev->badblocks,
s, sectors);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
/*
* Acknowledge all bad blocks in a list.
* This only succeeds if ->changed is clear. It is used by
* in-kernel metadata updates
*/
void md_ack_all_badblocks(struct badblocks *bb)
{
if (bb->page == NULL || bb->changed)
/* no point even trying */
return;
write_seqlock_irq(&bb->lock);
if (bb->changed == 0 && bb->unacked_exist) {
u64 *p = bb->page;
int i;
for (i = 0; i < bb->count ; i++) {
if (!BB_ACK(p[i])) {
sector_t start = BB_OFFSET(p[i]);
int len = BB_LEN(p[i]);
p[i] = BB_MAKE(start, len, 1);
}
}
bb->unacked_exist = 0;
}
write_sequnlock_irq(&bb->lock);
}
EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
/* sysfs access to bad-blocks list.
* We present two files.
* 'bad-blocks' lists sector numbers and lengths of ranges that
* are recorded as bad. The list is truncated to fit within
* the one-page limit of sysfs.
* Writing "sector length" to this file adds an acknowledged
* bad block list.
* 'unacknowledged-bad-blocks' lists bad blocks that have not yet
* been acknowledged. Writing to this file adds bad blocks
* without acknowledging them. This is largely for testing.
*/
static ssize_t
badblocks_show(struct badblocks *bb, char *page, int unack)
{
size_t len;
int i;
u64 *p = bb->page;
unsigned seq;
if (bb->shift < 0)
return 0;
retry:
seq = read_seqbegin(&bb->lock);
len = 0;
i = 0;
while (len < PAGE_SIZE && i < bb->count) {
sector_t s = BB_OFFSET(p[i]);
unsigned int length = BB_LEN(p[i]);
int ack = BB_ACK(p[i]);
i++;
if (unack && ack)
continue;
len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
(unsigned long long)s << bb->shift,
length << bb->shift);
}
if (unack && len == 0)
bb->unacked_exist = 0;
if (read_seqretry(&bb->lock, seq))
goto retry;
return len;
}
#define DO_DEBUG 1
static ssize_t
badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
{
unsigned long long sector;
int length;
char newline;
#ifdef DO_DEBUG
/* Allow clearing via sysfs *only* for testing/debugging.
* Normally only a successful write may clear a badblock
*/
int clear = 0;
if (page[0] == '-') {
clear = 1;
page++;
}
#endif /* DO_DEBUG */
switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
case 3:
if (newline != '\n')
return -EINVAL;
case 2:
if (length <= 0)
return -EINVAL;
break;
default:
return -EINVAL;
}
#ifdef DO_DEBUG
if (clear) {
md_clear_badblocks(bb, sector, length);
return len;
}
#endif /* DO_DEBUG */
if (md_set_badblocks(bb, sector, length, !unack))
return len;
else
return -ENOSPC;
}
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
......
......@@ -17,6 +17,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/badblocks.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mm.h>
......@@ -28,13 +29,6 @@
#define MaxSector (~(sector_t)0)
/* Bad block numbers are stored sorted in a single page.
* 64bits is used for each block or extent.
* 54 bits are sector number, 9 bits are extent size,
* 1 bit is an 'acknowledged' flag.
*/
#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
/*
* MD's 'extended' device
*/
......@@ -117,22 +111,7 @@ struct md_rdev {
struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */
struct badblocks {
int count; /* count of bad blocks */
int unacked_exist; /* there probably are unacknowledged
* bad blocks. This is only cleared
* when a read discovers none
*/
int shift; /* shift from sectors to block size
* a -ve shift means badblocks are
* disabled.*/
u64 *page; /* badblock list */
int changed;
seqlock_t lock;
sector_t sector;
sector_t size; /* in sectors */
} badblocks;
struct badblocks badblocks;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
......@@ -185,22 +164,11 @@ enum flag_bits {
*/
};
#define BB_LEN_MASK (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK (0x8000000000000000ULL)
#define BB_MAX_LEN 512
#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors);
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors)
{
if (unlikely(rdev->badblocks.count)) {
int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
sectors,
first_bad, bad_sectors);
if (rv)
......@@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern void md_ack_all_badblocks(struct badblocks *bb);
struct md_cluster_info;
struct mddev {
......
......@@ -11,6 +11,7 @@
* General Public License for more details.
*/
#include <linux/libnvdimm.h>
#include <linux/badblocks.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/blkdev.h>
......@@ -325,6 +326,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
if (!nvdimm_bus)
return NULL;
INIT_LIST_HEAD(&nvdimm_bus->list);
INIT_LIST_HEAD(&nvdimm_bus->poison_list);
init_waitqueue_head(&nvdimm_bus->probe_wait);
nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
mutex_init(&nvdimm_bus->reconfig_mutex);
......@@ -359,6 +361,172 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
}
EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
static void set_badblock(struct badblocks *bb, sector_t s, int num)
{
dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
(u64) s * 512, (u64) num * 512);
/* this isn't an error as the hardware will still throw an exception */
if (badblocks_set(bb, s, num, 1))
dev_info_once(bb->dev, "%s: failed for sector %llx\n",
__func__, (u64) s);
}
/**
* __add_badblock_range() - Convert a physical address range to bad sectors
* @bb: badblocks instance to populate
* @ns_offset: namespace offset where the error range begins (in bytes)
* @len: number of bytes of poison to be added
*
* This assumes that the range provided with (ns_offset, len) is within
* the bounds of physical addresses for this namespace, i.e. lies in the
* interval [ns_start, ns_start + ns_size)
*/
static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
{
const unsigned int sector_size = 512;
sector_t start_sector;
u64 num_sectors;
u32 rem;
start_sector = div_u64(ns_offset, sector_size);
num_sectors = div_u64_rem(len, sector_size, &rem);
if (rem)
num_sectors++;
if (unlikely(num_sectors > (u64)INT_MAX)) {
u64 remaining = num_sectors;
sector_t s = start_sector;
while (remaining) {
int done = min_t(u64, remaining, INT_MAX);
set_badblock(bb, s, done);
remaining -= done;
s += done;
}
} else
set_badblock(bb, start_sector, num_sectors);
}
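/*
 * Worked example (illustrative): a poison range starting 12288 bytes
 * into the namespace with a length of 600 bytes yields
 * start_sector = 24 and num_sectors = 2 (600 / 512 rounds up), so
 * sectors 24 and 25 are recorded as bad.
 */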
/**
* nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks
* @ndns: the namespace containing poison ranges
* @bb: badblocks instance to populate
* @offset: offset at the start of the namespace before 'sector 0'
*
* The poison list generated during NFIT initialization may contain multiple,
* possibly overlapping ranges in the SPA (System Physical Address) space.
* Compare each of these ranges to the namespace currently being initialized,
* and add badblocks to the gendisk for all matching sub-ranges
*/
void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
struct badblocks *bb, resource_size_t offset)
{
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
struct nvdimm_bus *nvdimm_bus;
struct list_head *poison_list;
u64 ns_start, ns_end, ns_size;
struct nd_poison *pl;
ns_size = nvdimm_namespace_capacity(ndns) - offset;
ns_start = nsio->res.start + offset;
ns_end = nsio->res.end;
nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent);
poison_list = &nvdimm_bus->poison_list;
if (list_empty(poison_list))
return;
list_for_each_entry(pl, poison_list, list) {
u64 pl_end = pl->start + pl->length - 1;
/* Discard intervals with no intersection */
if (pl_end < ns_start)
continue;
if (pl->start > ns_end)
continue;
/* Deal with any overlap after start of the namespace */
if (pl->start >= ns_start) {
u64 start = pl->start;
u64 len;
if (pl_end <= ns_end)
len = pl->length;
else
len = ns_start + ns_size - pl->start;
__add_badblock_range(bb, start - ns_start, len);
continue;
}
/* Deal with overlap for poison starting before the namespace */
if (pl->start < ns_start) {
u64 len;
if (pl_end < ns_end)
len = pl->start + pl->length - ns_start;
else
len = ns_size;
__add_badblock_range(bb, 0, len);
}
}
}
EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison);
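/*
 * Worked example (illustrative): for a namespace whose SPA range is
 * [0x1000, 0x1fff] with offset 0, a poison entry starting at SPA 0xf00
 * with length 0x200 overlaps the first 0x100 bytes of the namespace, so
 * the loop above calls __add_badblock_range(bb, 0, 0x100).
 */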
static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
struct nd_poison *pl;
pl = kzalloc(sizeof(*pl), GFP_KERNEL);
if (!pl)
return -ENOMEM;
pl->start = addr;
pl->length = length;
list_add_tail(&pl->list, &nvdimm_bus->poison_list);
return 0;
}
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
struct nd_poison *pl;
if (list_empty(&nvdimm_bus->poison_list))
return __add_poison(nvdimm_bus, addr, length);
/*
* There is a chance this is a duplicate, check for those first.
* This will be the common case as ARS_STATUS returns all known
* errors in the SPA space, and we can't query it per region
*/
list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
if (pl->start == addr) {
/* If length has changed, update this list entry */
if (pl->length != length)
pl->length = length;
return 0;
}
/*
* If not a duplicate or a simple length update, add the entry as is,
* as any overlapping ranges will get resolved when the list is consumed
* and converted to badblocks
*/
return __add_poison(nvdimm_bus, addr, length);
}
EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
static void free_poison_list(struct list_head *poison_list)
{
struct nd_poison *pl, *next;
list_for_each_entry_safe(pl, next, poison_list, list) {
list_del(&pl->list);
kfree(pl);
}
list_del_init(poison_list);
}
static int child_unregister(struct device *dev, void *data)
{
/*
......@@ -385,6 +553,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
nd_synchronize();
device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
free_poison_list(&nvdimm_bus->poison_list);
nvdimm_bus_destroy_ndctl(nvdimm_bus);
device_unregister(&nvdimm_bus->dev);
......
......@@ -77,6 +77,59 @@ static bool is_namespace_io(struct device *dev)
return dev ? dev->type == &namespace_io_device_type : false;
}
static int is_uuid_busy(struct device *dev, void *data)
{
u8 *uuid1 = data, *uuid2 = NULL;
if (is_namespace_pmem(dev)) {
struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
uuid2 = nspm->uuid;
} else if (is_namespace_blk(dev)) {
struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
uuid2 = nsblk->uuid;
} else if (is_nd_btt(dev)) {
struct nd_btt *nd_btt = to_nd_btt(dev);
uuid2 = nd_btt->uuid;
} else if (is_nd_pfn(dev)) {
struct nd_pfn *nd_pfn = to_nd_pfn(dev);
uuid2 = nd_pfn->uuid;
}
if (uuid2 && memcmp(uuid1, uuid2, NSLABEL_UUID_LEN) == 0)
return -EBUSY;
return 0;
}
static int is_namespace_uuid_busy(struct device *dev, void *data)
{
if (is_nd_pmem(dev) || is_nd_blk(dev))
return device_for_each_child(dev, data, is_uuid_busy);
return 0;
}
/**
* nd_is_uuid_unique - verify that no other namespace has @uuid
* @dev: any device on a nvdimm_bus
* @uuid: uuid to check
*/
bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
{
struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
if (!nvdimm_bus)
return false;
WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
if (device_for_each_child(&nvdimm_bus->dev, uuid,
is_namespace_uuid_busy) != 0)
return false;
return true;
}
bool pmem_should_map_pages(struct device *dev)
{
struct nd_region *nd_region = to_nd_region(dev->parent);
......@@ -104,20 +157,10 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
const char *suffix = NULL;
if (ndns->claim) {
if (is_nd_btt(ndns->claim))
if (ndns->claim && is_nd_btt(ndns->claim))
suffix = "s";
else if (is_nd_pfn(ndns->claim))
suffix = "m";
else
dev_WARN_ONCE(&ndns->dev, 1,
"unknown claim type by %s\n",
dev_name(ndns->claim));
}
if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) {
if (!suffix && pmem_should_map_pages(&ndns->dev))
suffix = "m";
sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : "");
} else if (is_namespace_blk(&ndns->dev)) {
struct nd_namespace_blk *nsblk;
......@@ -791,6 +834,15 @@ static void nd_namespace_pmem_set_size(struct nd_region *nd_region,
res->end = nd_region->ndr_start + size - 1;
}
static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where)
{
if (!uuid) {
dev_dbg(dev, "%s: uuid not set\n", where);
return true;
}
return false;
}
static ssize_t __size_store(struct device *dev, unsigned long long val)
{
resource_size_t allocated = 0, available = 0;
......@@ -820,8 +872,12 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
* We need a uuid for the allocation-label and dimm(s) on which
* to store the label.
*/
if (!uuid || nd_region->ndr_mappings == 0)
if (uuid_not_set(uuid, dev, __func__))
return -ENXIO;
if (nd_region->ndr_mappings == 0) {
dev_dbg(dev, "%s: not associated with dimm(s)\n", __func__);
return -ENXIO;
}
div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
if (remainder) {
......@@ -1211,6 +1267,29 @@ static ssize_t holder_show(struct device *dev,
}
static DEVICE_ATTR_RO(holder);
static ssize_t mode_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_namespace_common *ndns = to_ndns(dev);
struct device *claim;
char *mode;
ssize_t rc;
device_lock(dev);
claim = ndns->claim;
if (pmem_should_map_pages(dev) || (claim && is_nd_pfn(claim)))
mode = "memory";
else if (claim && is_nd_btt(claim))
mode = "safe";
else
mode = "raw";
rc = sprintf(buf, "%s\n", mode);
device_unlock(dev);
return rc;
}
static DEVICE_ATTR_RO(mode);
static ssize_t force_raw_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
......@@ -1234,6 +1313,7 @@ static DEVICE_ATTR_RW(force_raw);
static struct attribute *nd_namespace_attributes[] = {
&dev_attr_nstype.attr,
&dev_attr_size.attr,
&dev_attr_mode.attr,
&dev_attr_uuid.attr,
&dev_attr_holder.attr,
&dev_attr_resource.attr,
......@@ -1267,7 +1347,8 @@ static umode_t namespace_visible(struct kobject *kobj,
if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr
|| a == &dev_attr_holder.attr
|| a == &dev_attr_force_raw.attr)
|| a == &dev_attr_force_raw.attr
|| a == &dev_attr_mode.attr)
return a->mode;
return 0;
......@@ -1343,14 +1424,19 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
struct nd_namespace_pmem *nspm;
nspm = to_nd_namespace_pmem(&ndns->dev);
if (!nspm->uuid) {
dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__);
if (uuid_not_set(nspm->uuid, &ndns->dev, __func__))
return ERR_PTR(-ENODEV);
}
} else if (is_namespace_blk(&ndns->dev)) {
struct nd_namespace_blk *nsblk;
nsblk = to_nd_namespace_blk(&ndns->dev);
if (uuid_not_set(nsblk->uuid, &ndns->dev, __func__))
return ERR_PTR(-ENODEV);
if (!nsblk->lbasize) {
dev_dbg(&ndns->dev, "%s: sector size not set\n",
__func__);
return ERR_PTR(-ENODEV);
}
if (!nd_namespace_blk_validate(nsblk))
return ERR_PTR(-ENODEV);
}
......@@ -1689,6 +1775,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region)
nd_device_register(nd_region->ns_seed);
}
void nd_region_create_pfn_seed(struct nd_region *nd_region)
{
WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
nd_region->pfn_seed = nd_pfn_create(nd_region);
/*
* Seed creation failures are not fatal, provisioning is simply
* disabled until memory becomes available
*/
if (!nd_region->pfn_seed)
dev_err(&nd_region->dev, "failed to create pfn namespace\n");
}
void nd_region_create_btt_seed(struct nd_region *nd_region)
{
WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
......
......@@ -30,6 +30,7 @@ struct nvdimm_bus {
struct list_head list;
struct device dev;
int id, probe_active;
struct list_head poison_list;
struct mutex reconfig_mutex;
};
......@@ -52,6 +53,7 @@ void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
struct nd_region;
void nd_region_create_blk_seed(struct nd_region *nd_region);
void nd_region_create_btt_seed(struct nd_region *nd_region);
void nd_region_create_pfn_seed(struct nd_region *nd_region);
void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
......
......@@ -29,13 +29,12 @@ enum {
ND_MAX_LANES = 256,
SECTOR_SHIFT = 9,
INT_LBASIZE_ALIGNMENT = 64,
#if IS_ENABLED(CONFIG_NVDIMM_PFN)
ND_PFN_ALIGN = PAGES_PER_SECTION * PAGE_SIZE,
ND_PFN_MASK = ND_PFN_ALIGN - 1,
#else
ND_PFN_ALIGN = 0,
ND_PFN_MASK = 0,
#endif
};
struct nd_poison {
u64 start;
u64 length;
struct list_head list;
};
struct nvdimm_drvdata {
......@@ -153,6 +152,7 @@ struct nd_pfn {
int id;
u8 *uuid;
struct device dev;
unsigned long align;
unsigned long npfns;
enum nd_pfn_mode mode;
struct nd_pfn_sb *pfn_sb;
......@@ -262,6 +262,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns);
const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
char *name);
void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
struct badblocks *bb, resource_size_t offset);
int nd_blk_region_init(struct nd_region *nd_region);
void __nd_iostat_start(struct bio *bio, unsigned long *start);
static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
......
......@@ -103,6 +103,52 @@ static ssize_t mode_store(struct device *dev,
}
static DEVICE_ATTR_RW(mode);
static ssize_t align_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_pfn *nd_pfn = to_nd_pfn(dev);
return sprintf(buf, "%lx\n", nd_pfn->align);
}
static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf)
{
unsigned long val;
int rc;
rc = kstrtoul(buf, 0, &val);
if (rc)
return rc;
if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G)
return -EINVAL;
if (nd_pfn->dev.driver)
return -EBUSY;
else
nd_pfn->align = val;
return 0;
}
static ssize_t align_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct nd_pfn *nd_pfn = to_nd_pfn(dev);
ssize_t rc;
device_lock(dev);
nvdimm_bus_lock(dev);
rc = __align_store(nd_pfn, buf);
dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
rc, buf, buf[len - 1] == '\n' ? "" : "\n");
nvdimm_bus_unlock(dev);
device_unlock(dev);
return rc ? rc : len;
}
static DEVICE_ATTR_RW(align);
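__align_store() above accepts only powers of two between PAGE_SIZE and 1G, and only while the pfn device has no driver bound. A standalone sketch of just the value check; the 4K PAGE_SIZE is an assumption:
/* Mirror of the align_store() value check; PAGE_SIZE of 4K is assumed. */
#include <stdbool.h>
#include <stdio.h>
#define PAGE_SIZE	4096UL
#define SZ_1G		(1UL << 30)
static bool is_power_of_2(unsigned long v)
{
	return v && !(v & (v - 1));
}
static bool align_ok(unsigned long val)
{
	return is_power_of_2(val) && val >= PAGE_SIZE && val <= SZ_1G;
}
int main(void)
{
	/* 4K and 2M pass, 3M is rejected (not a power of two) */
	printf("%d %d %d\n", align_ok(4096), align_ok(2UL << 20),
	       align_ok(3UL << 20));
	return 0;
}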
static ssize_t uuid_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
......@@ -164,6 +210,7 @@ static struct attribute *nd_pfn_attributes[] = {
&dev_attr_mode.attr,
&dev_attr_namespace.attr,
&dev_attr_uuid.attr,
&dev_attr_align.attr,
NULL,
};
......@@ -179,7 +226,6 @@ static const struct attribute_group *nd_pfn_attribute_groups[] = {
};
static struct device *__nd_pfn_create(struct nd_region *nd_region,
u8 *uuid, enum nd_pfn_mode mode,
struct nd_namespace_common *ndns)
{
struct nd_pfn *nd_pfn;
......@@ -199,10 +245,8 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region,
return NULL;
}
nd_pfn->mode = mode;
if (uuid)
uuid = kmemdup(uuid, 16, GFP_KERNEL);
nd_pfn->uuid = uuid;
nd_pfn->mode = PFN_MODE_NONE;
nd_pfn->align = HPAGE_SIZE;
dev = &nd_pfn->dev;
dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id);
dev->parent = &nd_region->dev;
......@@ -220,8 +264,7 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region,
struct device *nd_pfn_create(struct nd_region *nd_region)
{
struct device *dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE,
NULL);
struct device *dev = __nd_pfn_create(nd_region, NULL);
if (dev)
__nd_device_register(dev);
......@@ -230,10 +273,11 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
int nd_pfn_validate(struct nd_pfn *nd_pfn)
{
struct nd_namespace_common *ndns = nd_pfn->ndns;
struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
struct nd_namespace_io *nsio;
u64 checksum, offset;
struct nd_namespace_io *nsio;
struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
struct nd_namespace_common *ndns = nd_pfn->ndns;
const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev);
if (!pfn_sb || !ndns)
return -ENODEV;
......@@ -241,10 +285,6 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
if (!is_nd_pmem(nd_pfn->dev.parent))
return -ENODEV;
/* section alignment for simple hotplug */
if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN)
return -ENODEV;
if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)))
return -ENXIO;
......@@ -257,6 +297,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
return -ENODEV;
pfn_sb->checksum = cpu_to_le64(checksum);
if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0)
return -ENODEV;
switch (le32_to_cpu(pfn_sb->mode)) {
case PFN_MODE_RAM:
break;
......@@ -278,6 +321,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
return -EINVAL;
}
if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) {
dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
nd_pfn->align, nvdimm_namespace_capacity(ndns));
return -EINVAL;
}
/*
* These warnings are verbose because they can only trigger in
* the case where the physical address alignment of the
......@@ -286,17 +335,19 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
*/
offset = le64_to_cpu(pfn_sb->dataoff);
nsio = to_nd_namespace_io(&ndns->dev);
if (nsio->res.start & ND_PFN_MASK) {
dev_err(&nd_pfn->dev,
"init failed: %s not section aligned\n",
dev_name(&ndns->dev));
return -EBUSY;
} else if (offset >= resource_size(&nsio->res)) {
if (offset >= resource_size(&nsio->res)) {
dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n",
dev_name(&ndns->dev));
return -EBUSY;
}
nd_pfn->align = 1UL << ilog2(offset);
if (!is_power_of_2(offset) || offset < PAGE_SIZE) {
dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n",
offset);
return -ENXIO;
}
return 0;
}
EXPORT_SYMBOL(nd_pfn_validate);
......@@ -313,7 +364,7 @@ int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
return -ENODEV;
nvdimm_bus_lock(&ndns->dev);
dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, ndns);
dev = __nd_pfn_create(nd_region, ndns);
nvdimm_bus_unlock(&ndns->dev);
if (!dev)
return -ENOMEM;
......
......@@ -23,6 +23,7 @@
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/pmem.h>
......@@ -41,11 +42,25 @@ struct pmem_device {
phys_addr_t data_offset;
void __pmem *virt_addr;
size_t size;
struct badblocks bb;
};
static int pmem_major;
static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
{
if (bb->count) {
sector_t first_bad;
int num_bad;
return !!badblocks_check(bb, sector, len / 512, &first_bad,
&num_bad);
}
return false;
}
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
unsigned int len, unsigned int off, int rw,
sector_t sector)
{
......@@ -54,6 +69,8 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
if (rw == READ) {
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
return -EIO;
memcpy_from_pmem(mem + off, pmem_addr, len);
flush_dcache_page(page);
} else {
......@@ -62,10 +79,12 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
}
kunmap_atomic(mem);
return 0;
}
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
int rc = 0;
bool do_acct;
unsigned long start;
struct bio_vec bvec;
......@@ -74,9 +93,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
struct pmem_device *pmem = bdev->bd_disk->private_data;
do_acct = nd_iostat_start(bio, &start);
bio_for_each_segment(bvec, bio, iter)
pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
bio_data_dir(bio), iter.bi_sector);
bio_for_each_segment(bvec, bio, iter) {
rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
bvec.bv_offset, bio_data_dir(bio),
iter.bi_sector);
if (rc) {
bio->bi_error = rc;
break;
}
}
if (do_acct)
nd_iostat_end(bio, start);
......@@ -91,13 +116,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
struct page *page, int rw)
{
struct pmem_device *pmem = bdev->bd_disk->private_data;
int rc;
pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
if (rw & WRITE)
wmb_pmem();
/*
* The ->rw_page interface is subtle and tricky. The core
* retries on any error, so we can only invoke page_endio() in
* the successful completion case. Otherwise, we'll see crashes
* caused by double completion.
*/
if (rc == 0)
page_endio(page, rw & WRITE, 0);
return 0;
return rc;
}
static long pmem_direct_access(struct block_device *bdev, sector_t sector,
......@@ -195,7 +229,12 @@ static int pmem_attach_disk(struct device *dev,
disk->driverfs_dev = dev;
set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
pmem->pmem_disk = disk;
devm_exit_badblocks(dev, &pmem->bb);
if (devm_init_badblocks(dev, &pmem->bb))
return -ENOMEM;
nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
disk->bb = &pmem->bb;
add_disk(disk);
revalidate_disk(disk);
......@@ -212,9 +251,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
return -EFAULT;
}
if (rw == READ)
if (rw == READ) {
unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
return -EIO;
memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
else {
} else {
memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
wmb_pmem();
}
......@@ -238,14 +281,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
nd_pfn->pfn_sb = pfn_sb;
rc = nd_pfn_validate(nd_pfn);
if (rc == 0 || rc == -EBUSY)
if (rc == -ENODEV)
/* no info block, do init */;
else
return rc;
/* section alignment for simple hotplug */
if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN
|| pmem->phys_addr & ND_PFN_MASK)
return -ENODEV;
nd_region = to_nd_region(nd_pfn->dev.parent);
if (nd_region->ro) {
dev_info(&nd_pfn->dev,
......@@ -263,9 +303,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
* ->direct_access() to those that are included in the memmap.
*/
if (nd_pfn->mode == PFN_MODE_PMEM)
offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE);
offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
else if (nd_pfn->mode == PFN_MODE_RAM)
offset = SZ_8K;
offset = ALIGN(SZ_8K, nd_pfn->align);
else
goto err;
......@@ -275,6 +315,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
pfn_sb->npfns = cpu_to_le64(npfns);
memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
pfn_sb->version_major = cpu_to_le16(1);
checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
pfn_sb->checksum = cpu_to_le64(checksum);
......@@ -326,21 +367,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
if (rc)
return rc;
if (PAGE_SIZE != SZ_4K) {
dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n");
return -ENXIO;
}
if (nsio->res.start & ND_PFN_MASK) {
dev_err(dev, "%s not memory hotplug section aligned\n",
dev_name(&ndns->dev));
return -ENXIO;
}
pfn_sb = nd_pfn->pfn_sb;
offset = le64_to_cpu(pfn_sb->dataoff);
nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset != SZ_8K)
if (offset < SZ_8K)
return -EINVAL;
nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
altmap = NULL;
......@@ -389,6 +420,9 @@ static int nd_pmem_probe(struct device *dev)
pmem->ndns = ndns;
dev_set_drvdata(dev, pmem);
ndns->rw_bytes = pmem_rw_bytes;
if (devm_init_badblocks(dev, &pmem->bb))
return -ENOMEM;
nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
if (is_nd_btt(dev))
return nvdimm_namespace_attach_btt(ndns);
......
......@@ -134,62 +134,6 @@ int nd_region_to_nstype(struct nd_region *nd_region)
}
EXPORT_SYMBOL(nd_region_to_nstype);
static int is_uuid_busy(struct device *dev, void *data)
{
struct nd_region *nd_region = to_nd_region(dev->parent);
u8 *uuid = data;
switch (nd_region_to_nstype(nd_region)) {
case ND_DEVICE_NAMESPACE_PMEM: {
struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
if (!nspm->uuid)
break;
if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0)
return -EBUSY;
break;
}
case ND_DEVICE_NAMESPACE_BLK: {
struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
if (!nsblk->uuid)
break;
if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0)
return -EBUSY;
break;
}
default:
break;
}
return 0;
}
static int is_namespace_uuid_busy(struct device *dev, void *data)
{
if (is_nd_pmem(dev) || is_nd_blk(dev))
return device_for_each_child(dev, data, is_uuid_busy);
return 0;
}
/**
* nd_is_uuid_unique - verify that no other namespace has @uuid
* @dev: any device on a nvdimm_bus
* @uuid: uuid to check
*/
bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
{
struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
if (!nvdimm_bus)
return false;
WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
if (device_for_each_child(&nvdimm_bus->dev, uuid,
is_namespace_uuid_busy) != 0)
return false;
return true;
}
static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
......@@ -406,6 +350,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
struct nd_interleave_set *nd_set = nd_region->nd_set;
int type = nd_region_to_nstype(nd_region);
if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr)
return 0;
if (a != &dev_attr_set_cookie.attr
&& a != &dev_attr_available_size.attr)
return a->mode;
......@@ -487,6 +434,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
nd_region_create_blk_seed(nd_region);
nvdimm_bus_unlock(dev);
}
if (is_nd_pfn(dev) && probe) {
nd_region = to_nd_region(dev->parent);
nvdimm_bus_lock(dev);
if (nd_region->pfn_seed == dev)
nd_region_create_pfn_seed(nd_region);
nvdimm_bus_unlock(dev);
}
}
void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
......
......@@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
static struct inode *bdev_file_inode(struct file *file)
{
return file->f_mapping->host;
}
static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct inode *inode = bdev_file_inode(file);
if (IS_DAX(inode))
return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
......@@ -338,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
*/
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *bd_inode = file->f_mapping->host;
struct inode *bd_inode = bdev_file_inode(file);
loff_t retval;
mutex_lock(&bd_inode->i_mutex);
......@@ -349,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *bd_inode = filp->f_mapping->host;
struct inode *bd_inode = bdev_file_inode(filp);
struct block_device *bdev = I_BDEV(bd_inode);
int error;
......@@ -1224,8 +1229,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
if (!ret)
if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
if (!blkdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
/*
* If the device is invalidated, rescan partition
......@@ -1239,6 +1247,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
else if (ret == -ENOMEDIUM)
invalidate_partitions(disk, bdev);
}
if (ret)
goto out_clear;
} else {
......@@ -1259,12 +1268,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
/*
* If the partition is not aligned on a page
* boundary, we can't do dax I/O to it.
*/
if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
(bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
if (!blkdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
......@@ -1599,14 +1603,14 @@ EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
blkdev_put(bdev, filp->f_mode);
return 0;
}
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
fmode_t mode = file->f_mode;
/*
......@@ -1631,7 +1635,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = file->f_mapping->host;
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
ssize_t ret;
......@@ -1663,7 +1667,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter);
ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = file->f_mapping->host;
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos;
......@@ -1702,13 +1706,101 @@ static const struct address_space_operations def_blk_aops = {
.is_dirty_writeback = buffer_check_dirty_writeback,
};
#ifdef CONFIG_FS_DAX
/*
* In the raw block case we do not need to contend with truncation nor
* unwritten file extents. Without those concerns there is no need for
* additional locking beyond the mmap_sem context that these routines
* are already executing under.
*
* Note, there is no protection if the block device is dynamically
* resized (partition grow/shrink) during a fault. A stable block device
* size is already not enforced in the blkdev_direct_IO path.
*
* For DAX, it is the responsibility of the block device driver to
* ensure the whole-disk device size is stable while requests are in
* flight.
*
* Finally, unlike the filemap_page_mkwrite() case there is no
* filesystem superblock to sync against freezing. We still include a
* pfn_mkwrite callback for dax drivers to receive write fault
* notifications.
*/
static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return __dax_fault(vma, vmf, blkdev_get_block, NULL);
}
static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags)
{
return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
}
static void blkdev_vm_open(struct vm_area_struct *vma)
{
struct inode *bd_inode = bdev_file_inode(vma->vm_file);
struct block_device *bdev = I_BDEV(bd_inode);
mutex_lock(&bd_inode->i_mutex);
bdev->bd_map_count++;
mutex_unlock(&bd_inode->i_mutex);
}
static void blkdev_vm_close(struct vm_area_struct *vma)
{
struct inode *bd_inode = bdev_file_inode(vma->vm_file);
struct block_device *bdev = I_BDEV(bd_inode);
mutex_lock(&bd_inode->i_mutex);
bdev->bd_map_count--;
mutex_unlock(&bd_inode->i_mutex);
}
static const struct vm_operations_struct blkdev_dax_vm_ops = {
.open = blkdev_vm_open,
.close = blkdev_vm_close,
.fault = blkdev_dax_fault,
.pmd_fault = blkdev_dax_pmd_fault,
.pfn_mkwrite = blkdev_dax_fault,
};
static const struct vm_operations_struct blkdev_default_vm_ops = {
.open = blkdev_vm_open,
.close = blkdev_vm_close,
.fault = filemap_fault,
.map_pages = filemap_map_pages,
};
static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *bd_inode = bdev_file_inode(file);
struct block_device *bdev = I_BDEV(bd_inode);
file_accessed(file);
mutex_lock(&bd_inode->i_mutex);
bdev->bd_map_count++;
if (IS_DAX(bd_inode)) {
vma->vm_ops = &blkdev_dax_vm_ops;
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &blkdev_default_vm_ops;
}
mutex_unlock(&bd_inode->i_mutex);
return 0;
}
#else
#define blkdev_mmap generic_file_mmap
#endif
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
.mmap = generic_file_mmap,
.mmap = blkdev_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
......
#ifndef _LINUX_BADBLOCKS_H
#define _LINUX_BADBLOCKS_H
#include <linux/seqlock.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/types.h>
#define BB_LEN_MASK (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK (0x8000000000000000ULL)
#define BB_MAX_LEN 512
#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
/* Bad block numbers are stored sorted in a single page.
* 64bits is used for each block or extent.
* 54 bits are sector number, 9 bits are extent size,
* 1 bit is an 'acknowledged' flag.
*/
#define MAX_BADBLOCKS (PAGE_SIZE/8)
struct badblocks {
struct device *dev; /* set by devm_init_badblocks */
int count; /* count of bad blocks */
int unacked_exist; /* there probably are unacknowledged
* bad blocks. This is only cleared
* when a read discovers none
*/
int shift; /* shift from sectors to block size
* a -ve shift means badblocks are
* disabled.*/
u64 *page; /* badblock list */
int changed;
seqlock_t lock;
sector_t sector;
sector_t size; /* in sectors */
};
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors);
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
int acknowledged);
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors);
void ack_all_badblocks(struct badblocks *bb);
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack);
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
int unack);
int badblocks_init(struct badblocks *bb, int enable);
void badblocks_exit(struct badblocks *bb);
struct device;
int devm_init_badblocks(struct device *dev, struct badblocks *bb);
static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
{
if (bb->dev != dev) {
dev_WARN_ONCE(dev, 1, "%s: badblocks instance not associated\n",
__func__);
return;
}
badblocks_exit(bb);
}
#endif
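The encoding described in the comment above (one u64 per extent: 54 bits of sector, 9 bits holding length minus one, one 'acknowledged' bit) can be exercised directly with the BB_* macros. A small userspace demo, with the macros copied from this header:
/* Encode and decode a single badblocks entry using the BB_* macros. */
#include <stdio.h>
#include <stdint.h>
typedef uint64_t u64;
#define BB_LEN_MASK	(0x00000000000001FFULL)
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK	(0x8000000000000000ULL)
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
int main(void)
{
	/* 8 bad sectors starting at sector 123456, acknowledged */
	u64 entry = BB_MAKE((u64)123456, 8, 1);
	printf("sector=%llu len=%llu ack=%d\n",
	       (unsigned long long)BB_OFFSET(entry),
	       (unsigned long long)BB_LEN(entry),
	       BB_ACK(entry));
	return 0;
}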
......@@ -483,6 +483,9 @@ struct block_device {
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
#ifdef CONFIG_FS_DAX
int bd_map_count;
#endif
};
/*
......@@ -2280,6 +2283,14 @@ extern struct super_block *freeze_bdev(struct block_device *);
extern void emergency_thaw_all(void);
extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
extern int fsync_bdev(struct block_device *);
#ifdef CONFIG_FS_DAX
extern bool blkdev_dax_capable(struct block_device *bdev);
#else
static inline bool blkdev_dax_capable(struct block_device *bdev)
{
return false;
}
#endif
extern struct super_block *blockdev_superblock;
......
......@@ -162,6 +162,7 @@ struct disk_part_tbl {
};
struct disk_events;
struct badblocks;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
......@@ -213,6 +214,7 @@ struct gendisk {
struct kobject integrity_kobj;
#endif /* CONFIG_BLK_DEV_INTEGRITY */
int node_id;
struct badblocks *bb;
};
static inline struct gendisk *part_to_disk(struct hd_struct *part)
......
......@@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
}
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
#define nvdimm_bus_register(parent, desc) \
......
......@@ -188,6 +188,8 @@ struct inodes_stat_t {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
#define BLKDAXSET _IO(0x12,128)
#define BLKDAXGET _IO(0x12,129)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
......
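From a consumer's point of view the raw-block-dax pieces combine as: open the pmem block device, optionally flip S_DAX with the new BLKDAXSET ioctl, then mmap() the descriptor so blkdev_mmap() installs blkdev_dax_vm_ops. The sketch below assumes a /dev/pmem0 device and a plain integer-flag argument convention for BLKDAXSET; neither is documented ABI, so check blkdev_ioctl() in the tree before relying on it:
/* Sketch: dax-mmap a raw pmem block device. /dev/pmem0 and the BLKDAXSET
 * argument convention (plain integer flag) are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/fs.h>
#ifndef BLKDAXSET
#define BLKDAXSET _IO(0x12, 128)	/* values as added in this series */
#define BLKDAXGET _IO(0x12, 129)
#endif
int main(void)
{
	size_t len = 2UL << 20;		/* one 2M huge page worth of the device */
	int fd = open("/dev/pmem0", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKDAXSET, 1))	/* request dax mappings, if capable */
		perror("BLKDAXSET");
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}
	memset(p, 0, 4096);	/* with S_DAX set, faults are served by blkdev_dax_fault() */
	munmap(p, len);
	close(fd);
	return 0;
}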
......@@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr)
break;
if (p->end < addr)
continue;
if (p->flags & IORESOURCE_BUSY &&
p->flags & IORESOURCE_EXCLUSIVE) {
/*
 * A resource is considered exclusive if it is busy and either
 * IORESOURCE_EXCLUSIVE is set or CONFIG_IO_STRICT_DEVMEM is
 * enabled.
 */
if ((p->flags & IORESOURCE_BUSY) == 0)
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
err = 1;
break;
}
......
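The test iomem_is_exclusive() now applies is small enough to model directly: the range must be busy, and then either IORESOURCE_EXCLUSIVE or a CONFIG_IO_STRICT_DEVMEM build makes it exclusive. A sketch with stand-in flag bits; the real IORESOURCE_* values live in include/linux/ioport.h:
/* Model of the exclusivity test: busy is mandatory, then either the
 * exclusive flag or strict io-memory filtering makes the range off limits. */
#include <stdbool.h>
#include <stdio.h>
#define RES_BUSY	(1u << 0)	/* stand-in for IORESOURCE_BUSY */
#define RES_EXCLUSIVE	(1u << 1)	/* stand-in for IORESOURCE_EXCLUSIVE */
static bool is_exclusive(unsigned int flags, bool io_strict_devmem)
{
	if (!(flags & RES_BUSY))
		return false;
	return io_strict_devmem || (flags & RES_EXCLUSIVE);
}
int main(void)
{
	printf("%d\n", is_exclusive(RES_BUSY, false));			/* 0 */
	printf("%d\n", is_exclusive(RES_BUSY | RES_EXCLUSIVE, false));	/* 1 */
	printf("%d\n", is_exclusive(RES_BUSY, true));			/* 1 */
	printf("%d\n", is_exclusive(RES_EXCLUSIVE, true));		/* 0: not busy */
	return 0;
}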
......@@ -1886,3 +1886,42 @@ source "samples/Kconfig"
source "lib/Kconfig.kgdb"
config ARCH_HAS_DEVMEM_IS_ALLOWED
bool
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
depends on ARCH_HAS_DEVMEM_IS_ALLOWED
default y if TILE || PPC
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel. Note that with PAT support
enabled, even in this case there are restrictions on /dev/mem
use due to the cache aliasing requirements.
If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem
file only allows userspace access to PCI space and the BIOS code and
data regions. This is sufficient for dosemu and X and all common
users of /dev/mem.
If in doubt, say Y.
config IO_STRICT_DEVMEM
bool "Filter I/O access to /dev/mem"
depends on STRICT_DEVMEM
default STRICT_DEVMEM
---help---
If this option is disabled, you allow userspace (root) access to all
io-memory regardless of whether a driver is actively using that
range. Accidental access to this is obviously disastrous, but
specific access can be used by people debugging kernel drivers.
If this option is switched on, the /dev/mem file only allows
userspace access to *idle* io-memory ranges (see /proc/iomem). This
may break traditional users of /dev/mem (dosemu, legacy X, etc...)
if the driver using a given range cannot be disabled.
If in doubt, say Y.
......@@ -9,6 +9,8 @@ ldflags-y += --wrap=memunmap
ldflags-y += --wrap=__devm_request_region
ldflags-y += --wrap=__request_region
ldflags-y += --wrap=__release_region
ldflags-y += --wrap=devm_memremap_pages
ldflags-y += --wrap=phys_to_pfn_t
DRIVERS := ../../../drivers
NVDIMM_SRC := $(DRIVERS)/nvdimm
......
......@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/io.h>
#include <linux/mm.h>
#include "nfit_test.h"
static LIST_HEAD(iomap_head);
......@@ -41,7 +42,7 @@ void nfit_test_teardown(void)
}
EXPORT_SYMBOL(nfit_test_teardown);
static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
static struct nfit_test_resource *__get_nfit_res(resource_size_t resource)
{
struct iomap_ops *ops;
......@@ -51,14 +52,22 @@ static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
return NULL;
}
void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
{
struct nfit_test_resource *nfit_res;
struct nfit_test_resource *res;
rcu_read_lock();
nfit_res = get_nfit_res(offset);
res = __get_nfit_res(resource);
rcu_read_unlock();
return res;
}
void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
{
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
if (nfit_res)
return (void __iomem *) nfit_res->buf + offset
- nfit_res->res->start;
......@@ -68,11 +77,8 @@ void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
void __iomem *__wrap_devm_ioremap_nocache(struct device *dev,
resource_size_t offset, unsigned long size)
{
struct nfit_test_resource *nfit_res;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
rcu_read_lock();
nfit_res = get_nfit_res(offset);
rcu_read_unlock();
if (nfit_res)
return (void __iomem *) nfit_res->buf + offset
- nfit_res->res->start;
......@@ -83,25 +89,58 @@ EXPORT_SYMBOL(__wrap_devm_ioremap_nocache);
void *__wrap_devm_memremap(struct device *dev, resource_size_t offset,
size_t size, unsigned long flags)
{
struct nfit_test_resource *nfit_res;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
rcu_read_lock();
nfit_res = get_nfit_res(offset);
rcu_read_unlock();
if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start;
return devm_memremap(dev, offset, size, flags);
}
EXPORT_SYMBOL(__wrap_devm_memremap);
#ifdef __HAVE_ARCH_PTE_DEVMAP
#include <linux/memremap.h>
#include <linux/pfn_t.h>
void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap)
{
resource_size_t offset = res->start;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start;
return devm_memremap_pages(dev, res, ref, altmap);
}
EXPORT_SYMBOL(__wrap_devm_memremap_pages);
pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
{
struct nfit_test_resource *nfit_res = get_nfit_res(addr);
if (nfit_res)
flags &= ~PFN_MAP;
return phys_to_pfn_t(addr, flags);
}
EXPORT_SYMBOL(__wrap_phys_to_pfn_t);
#else
/* to be removed post 4.5-rc1 */
void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res)
{
resource_size_t offset = res->start;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start;
return devm_memremap_pages(dev, res);
}
EXPORT_SYMBOL(__wrap_devm_memremap_pages);
#endif
void *__wrap_memremap(resource_size_t offset, size_t size,
unsigned long flags)
{
struct nfit_test_resource *nfit_res;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
rcu_read_lock();
nfit_res = get_nfit_res(offset);
rcu_read_unlock();
if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start;
return memremap(offset, size, flags);
......@@ -110,11 +149,8 @@ EXPORT_SYMBOL(__wrap_memremap);
void __wrap_devm_memunmap(struct device *dev, void *addr)
{
struct nfit_test_resource *nfit_res;
struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
rcu_read_lock();
nfit_res = get_nfit_res((unsigned long) addr);
rcu_read_unlock();
if (nfit_res)
return;
return devm_memunmap(dev, addr);
......@@ -135,11 +171,7 @@ EXPORT_SYMBOL(__wrap_ioremap_wc);
void __wrap_iounmap(volatile void __iomem *addr)
{
struct nfit_test_resource *nfit_res;
rcu_read_lock();
nfit_res = get_nfit_res((unsigned long) addr);
rcu_read_unlock();
struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
if (nfit_res)
return;
return iounmap(addr);
......@@ -148,11 +180,8 @@ EXPORT_SYMBOL(__wrap_iounmap);
void __wrap_memunmap(void *addr)
{
struct nfit_test_resource *nfit_res;
struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
rcu_read_lock();
nfit_res = get_nfit_res((unsigned long) addr);
rcu_read_unlock();
if (nfit_res)
return;
return memunmap(addr);
......@@ -166,9 +195,7 @@ static struct resource *nfit_test_request_region(struct device *dev,
struct nfit_test_resource *nfit_res;
if (parent == &iomem_resource) {
rcu_read_lock();
nfit_res = get_nfit_res(start);
rcu_read_unlock();
if (nfit_res) {
struct resource *res = nfit_res->res + 1;
......@@ -218,9 +245,7 @@ void __wrap___release_region(struct resource *parent, resource_size_t start,
struct nfit_test_resource *nfit_res;
if (parent == &iomem_resource) {
rcu_read_lock();
nfit_res = get_nfit_res(start);
rcu_read_unlock();
if (nfit_res) {
struct resource *res = nfit_res->res + 1;
......
......@@ -248,6 +248,8 @@ static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd,
nd_cmd->out_length = 256;
nd_cmd->num_records = 0;
nd_cmd->address = 0;
nd_cmd->length = -1ULL;
nd_cmd->status = 0;
return 0;
......@@ -1088,6 +1090,8 @@ static void nfit_test1_setup(struct nfit_test *t)
struct acpi_nfit_memory_map *memdev;
struct acpi_nfit_control_region *dcr;
struct acpi_nfit_system_address *spa;
struct nvdimm_bus_descriptor *nd_desc;
struct acpi_nfit_desc *acpi_desc;
offset = 0;
/* spa0 (flat range with no bdw aliasing) */
......@@ -1135,6 +1139,13 @@ static void nfit_test1_setup(struct nfit_test *t)
dcr->command_size = 0;
dcr->status_offset = 0;
dcr->status_size = 0;
acpi_desc = &t->acpi_desc;
set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
nd_desc = &acpi_desc->nd_desc;
nd_desc->ndctl = nfit_test_ctl;
}
static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
......