Commit d080827f authored by Linus Torvalds

Merge tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm updates from Dan Williams:
 "The bulk of this has appeared in -next and independently received a
  build success notification from the kbuild robot.  The 'for-4.5/block-
  dax' topic branch was rebased over the weekend to drop the "block
  device end-of-life" rework that Al would like to see re-implemented
  with a notifier, and to address bug reports against the badblocks
  integration.

  There is pending feedback against "libnvdimm: Add a poison list and
  export badblocks" received last week.  Linda identified some localized
  fixups that we will handle incrementally.

  Summary:

   - Media error handling: The 'badblocks' implementation that
     originated in md-raid is up-levelled to a generic capability of a
     block device.  This initial implementation is limited to being
     consulted in the pmem block-i/o path.  Later, 'badblocks' will be
     consulted when creating dax mappings.

   - Raw block device dax: For virtualization and other cases that want
     large contiguous mappings of persistent memory, add the capability
     to dax-mmap a block device directly (a usage sketch follows this
     summary).

   - Increased /dev/mem restrictions: Add an option to treat all
     io-memory as IORESOURCE_EXCLUSIVE, i.e. disable /dev/mem access
     while a driver is actively using an address range.  This behavior
     is controlled via the new CONFIG_IO_STRICT_DEVMEM option and can be
     overridden by the existing "iomem=relaxed" kernel command line
     option.

   - Miscellaneous fixes include a 'pfn'-device huge page alignment fix,
     block device shutdown crash fix, and other small libnvdimm fixes"
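
A brief userspace sketch of the two new interfaces described above (not part
of the merge itself; it assumes the BLKDAXSET/BLKDAXGET ioctl numbers are
exported via <linux/fs.h> by "block: enable dax for raw block devices", and
that an example pmem disk appears as /dev/pmem0 with its bad-block list in
/sys/block/pmem0/badblocks):

	/* read the gendisk bad-block list, then toggle S_DAX on the raw device */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(void)
	{
		char line[256];
		int fd, on = 1, flag = 0;
		FILE *bb = fopen("/sys/block/pmem0/badblocks", "r");

		/* each entry is a starting sector and a length, in 512-byte units */
		if (bb) {
			while (fgets(line, sizeof(line), bb))
				printf("bad range: %s", line);
			fclose(bb);
		}

		fd = open("/dev/pmem0", O_RDWR);
		if (fd < 0)
			return 1;

		/* request dax-capable mmaps of the whole block device ... */
		if (ioctl(fd, BLKDAXSET, &on))
			perror("BLKDAXSET");

		/* ... and read the S_DAX flag back */
		if (ioctl(fd, BLKDAXGET, &flag) == 0)
			printf("S_DAX is %s\n", flag ? "set" : "clear");

		close(fd);
		return 0;
	}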

* tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (32 commits)
  block: kill disk_{check|set|clear|alloc}_badblocks
  libnvdimm, pmem: nvdimm_read_bytes() badblocks support
  pmem, dax: disable dax in the presence of bad blocks
  pmem: fail io-requests to known bad blocks
  libnvdimm: convert to statically allocated badblocks
  libnvdimm: don't fail init for full badblocks list
  block, badblocks: introduce devm_init_badblocks
  block: clarify badblocks lifetime
  badblocks: rename badblocks_free to badblocks_exit
  libnvdimm, pmem: move definition of nvdimm_namespace_add_poison to nd.h
  libnvdimm: Add a poison list and export badblocks
  nfit_test: Enable DSMs for all test NFITs
  md: convert to use the generic badblocks code
  block: Add badblock management for gendisks
  badblocks: Add core badblock management code
  block: fix del_gendisk() vs blkdev_ioctl crash
  block: enable dax for raw block devices
  block: introduce bdev_file_inode()
  restrict /dev/mem to idle io memory ranges
  arch: consolidate CONFIG_STRICT_DEVM in lib/Kconfig.debug
  ...
parents cbd88cd4 8b63b6bf
...@@ -2,6 +2,7 @@ config ARM ...@@ -2,6 +2,7 @@ config ARM
bool bool
default y default y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_CUSTOM_GPIO_H
......
...@@ -15,20 +15,6 @@ config ARM_PTDUMP ...@@ -15,20 +15,6 @@ config ARM_PTDUMP
kernel. kernel.
If in doubt, say "N" If in doubt, say "N"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
If this option is switched on, the /dev/mem file only allows
userspace access to memory mapped peripherals.
If in doubt, say Y.
# RMK wants arm kernels compiled with frame pointers or stack unwinding. # RMK wants arm kernels compiled with frame pointers or stack unwinding.
# If you know what you are doing and are willing to live without stack # If you know what you are doing and are willing to live without stack
# traces, you can get a slightly smaller kernel by setting this option to # traces, you can get a slightly smaller kernel by setting this option to
......
...@@ -3,6 +3,7 @@ config ARM64 ...@@ -3,6 +3,7 @@ config ARM64
select ACPI_CCA_REQUIRED if ACPI select ACPI_CCA_REQUIRED if ACPI
select ACPI_GENERIC_GSI if ACPI select ACPI_GENERIC_GSI if ACPI
select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GCOV_PROFILE_ALL
......
...@@ -14,20 +14,6 @@ config ARM64_PTDUMP ...@@ -14,20 +14,6 @@ config ARM64_PTDUMP
kernel. kernel.
If in doubt, say "N" If in doubt, say "N"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
help
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
If this option is switched on, the /dev/mem file only allows
userspace access to memory mapped peripherals.
If in doubt, say Y.
config PID_IN_CONTEXTIDR config PID_IN_CONTEXTIDR
bool "Write the current PID to the CONTEXTIDR register" bool "Write the current PID to the CONTEXTIDR register"
help help
......
...@@ -10,6 +10,7 @@ config FRV ...@@ -10,6 +10,7 @@ config FRV
select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_BUGVERBOSE
select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_CPU_DEVICES select GENERIC_CPU_DEVICES
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IPC_PARSE_VERSION
select OLD_SIGSUSPEND3 select OLD_SIGSUSPEND3
select OLD_SIGACTION select OLD_SIGACTION
......
...@@ -13,6 +13,7 @@ config M32R ...@@ -13,6 +13,7 @@ config M32R
select GENERIC_IRQ_PROBE select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW
select GENERIC_ATOMIC64 select GENERIC_ATOMIC64
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_USES_GETTIMEOFFSET select ARCH_USES_GETTIMEOFFSET
select MODULES_USE_ELF_RELA select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW select HAVE_DEBUG_STACKOVERFLOW
......
...@@ -159,6 +159,7 @@ config PPC ...@@ -159,6 +159,7 @@ config PPC
select EDAC_SUPPORT select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB select EDAC_ATOMIC_SCRUB
select ARCH_HAS_DMA_SET_COHERENT_MASK select ARCH_HAS_DMA_SET_COHERENT_MASK
select ARCH_HAS_DEVMEM_IS_ALLOWED
select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SECCOMP_FILTER
config GENERIC_CSUM config GENERIC_CSUM
......
...@@ -335,18 +335,6 @@ config PPC_EARLY_DEBUG_CPM_ADDR ...@@ -335,18 +335,6 @@ config PPC_EARLY_DEBUG_CPM_ADDR
platform probing is done, all platforms selected must platform probing is done, all platforms selected must
share the same address. share the same address.
config STRICT_DEVMEM
def_bool y
prompt "Filter access to /dev/mem"
help
This option restricts access to /dev/mem. If this option is
disabled, you allow userspace access to all memory, including
kernel and userspace memory. Accidental memory access is likely
to be disastrous.
Memory access is required for experts who want to debug the kernel.
If you are unsure, say Y.
config FAIL_IOMMU config FAIL_IOMMU
bool "Fault-injection capability for IOMMU" bool "Fault-injection capability for IOMMU"
depends on FAULT_INJECTION depends on FAULT_INJECTION
......
...@@ -66,6 +66,7 @@ config S390 ...@@ -66,6 +66,7 @@ config S390
def_bool y def_bool y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_SG_CHAIN select ARCH_HAS_SG_CHAIN
......
...@@ -5,18 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT ...@@ -5,18 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug" source "lib/Kconfig.debug"
config STRICT_DEVMEM
def_bool y
prompt "Filter access to /dev/mem"
---help---
This option restricts access to /dev/mem. If this option is
disabled, you allow userspace access to all memory, including
kernel and userspace memory. Accidental memory access is likely
to be disastrous.
Memory access is required for experts who want to debug the kernel.
If you are unsure, say Y.
config S390_PTDUMP config S390_PTDUMP
bool "Export kernel pagetable layout to userspace via debugfs" bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL depends on DEBUG_KERNEL
......
...@@ -19,6 +19,7 @@ config TILE ...@@ -19,6 +19,7 @@ config TILE
select VIRT_TO_BUS select VIRT_TO_BUS
select SYS_HYPERVISOR select SYS_HYPERVISOR
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS
select MODULES_USE_ELF_RELA select MODULES_USE_ELF_RELA
...@@ -116,9 +117,6 @@ config ARCH_DISCONTIGMEM_DEFAULT ...@@ -116,9 +117,6 @@ config ARCH_DISCONTIGMEM_DEFAULT
config TRACE_IRQFLAGS_SUPPORT config TRACE_IRQFLAGS_SUPPORT
def_bool y def_bool y
config STRICT_DEVMEM
def_bool y
# SMP is required for Tilera Linux. # SMP is required for Tilera Linux.
config SMP config SMP
def_bool y def_bool y
......
config UNICORE32 config UNICORE32
def_bool y def_bool y
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_MEMBLOCK select HAVE_MEMBLOCK
......
...@@ -2,20 +2,6 @@ menu "Kernel hacking" ...@@ -2,20 +2,6 @@ menu "Kernel hacking"
source "lib/Kconfig.debug" source "lib/Kconfig.debug"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel.
If this option is switched on, the /dev/mem file only allows
userspace access to memory mapped peripherals.
If in doubt, say Y.
config EARLY_PRINTK config EARLY_PRINTK
def_bool DEBUG_OCD def_bool DEBUG_OCD
help help
......
...@@ -24,6 +24,7 @@ config X86 ...@@ -24,6 +24,7 @@ config X86
select ARCH_DISCARD_MEMBLOCK select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GCOV_PROFILE_ALL
......
...@@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT ...@@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug" source "lib/Kconfig.debug"
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel. Note that with PAT support
enabled, even in this case there are restrictions on /dev/mem
use due to the cache aliasing requirements.
If this option is switched on, the /dev/mem file only allows
userspace access to PCI space and the BIOS code and data regions.
This is sufficient for dosemu and X and all common users of
/dev/mem.
If in doubt, say Y.
config X86_VERBOSE_BOOTUP config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages" bool "Enable verbose x86 bootup info messages"
default y default y
......
@@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
 			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
-			partitions/
+			badblocks.o partitions/
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <linux/idr.h> #include <linux/idr.h>
#include <linux/log2.h> #include <linux/log2.h>
#include <linux/pm_runtime.h> #include <linux/pm_runtime.h>
#include <linux/badblocks.h>
#include "blk.h" #include "blk.h"
...@@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk) ...@@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0.holder_dir); kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir); kobject_put(disk->slave_dir);
disk->driverfs_dev = NULL;
if (!sysfs_deprecated) if (!sysfs_deprecated)
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
...@@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk) ...@@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk)
} }
EXPORT_SYMBOL(del_gendisk); EXPORT_SYMBOL(del_gendisk);
/* sysfs access to bad-blocks list. */
static ssize_t disk_badblocks_show(struct device *dev,
struct device_attribute *attr,
char *page)
{
struct gendisk *disk = dev_to_disk(dev);
if (!disk->bb)
return sprintf(page, "\n");
return badblocks_show(disk->bb, page, 0);
}
static ssize_t disk_badblocks_store(struct device *dev,
struct device_attribute *attr,
const char *page, size_t len)
{
struct gendisk *disk = dev_to_disk(dev);
if (!disk->bb)
return -ENXIO;
return badblocks_store(disk->bb, page, len, 0);
}
/** /**
* get_gendisk - get partitioning information for a given device * get_gendisk - get partitioning information for a given device
* @devt: device to get partitioning information for * @devt: device to get partitioning information for
...@@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, ...@@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
disk_badblocks_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST #ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail = static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
...@@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = { ...@@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr, &dev_attr_capability.attr,
&dev_attr_stat.attr, &dev_attr_stat.attr,
&dev_attr_inflight.attr, &dev_attr_inflight.attr,
&dev_attr_badblocks.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST #ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr, &dev_attr_fail.attr,
#endif #endif
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/blkpg.h> #include <linux/blkpg.h>
#include <linux/hdreg.h> #include <linux/hdreg.h>
#include <linux/badblocks.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/blktrace_api.h> #include <linux/blktrace_api.h>
...@@ -406,6 +407,71 @@ static inline int is_unrecognized_ioctl(int ret) ...@@ -406,6 +407,71 @@ static inline int is_unrecognized_ioctl(int ret)
ret == -ENOIOCTLCMD; ret == -ENOIOCTLCMD;
} }
#ifdef CONFIG_FS_DAX
bool blkdev_dax_capable(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
if (!disk->fops->direct_access)
return false;
/*
* If the partition is not aligned on a page boundary, we can't
* do dax I/O to it.
*/
if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
return false;
/*
* If the device has known bad blocks, force all I/O through the
* driver / page cache.
*
* TODO: support finer grained dax error handling
*/
if (disk->bb && disk->bb->count)
return false;
return true;
}
static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
{
unsigned long arg;
int rc = 0;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (get_user(arg, (int __user *)(argp)))
return -EFAULT;
arg = !!arg;
if (arg == !!(bdev->bd_inode->i_flags & S_DAX))
return 0;
if (arg)
arg = S_DAX;
if (arg && !blkdev_dax_capable(bdev))
return -ENOTTY;
mutex_lock(&bdev->bd_inode->i_mutex);
if (bdev->bd_map_count == 0)
inode_set_flags(bdev->bd_inode, arg, S_DAX);
else
rc = -EBUSY;
mutex_unlock(&bdev->bd_inode->i_mutex);
return rc;
}
#else
static int blkdev_daxset(struct block_device *bdev, int arg)
{
if (arg)
return -ENOTTY;
return 0;
}
#endif
static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg) unsigned cmd, unsigned long arg)
{ {
...@@ -568,6 +634,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, ...@@ -568,6 +634,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESETUP: case BLKTRACESETUP:
case BLKTRACETEARDOWN: case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp); return blk_trace_ioctl(bdev, cmd, argp);
case BLKDAXSET:
return blkdev_daxset(bdev, arg);
case BLKDAXGET:
return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
break;
case IOC_PR_REGISTER: case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, argp); return blkdev_pr_register(bdev, argp);
case IOC_PR_RESERVE: case IOC_PR_RESERVE:
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/ndctl.h> #include <linux/ndctl.h>
#include <linux/delay.h>
#include <linux/list.h> #include <linux/list.h>
#include <linux/acpi.h> #include <linux/acpi.h>
#include <linux/sort.h> #include <linux/sort.h>
...@@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus, ...@@ -1473,6 +1474,201 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
/* devm will free nfit_blk */ /* devm will free nfit_blk */
} }
static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc,
struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
{
cmd->address = addr;
cmd->length = length;
return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
sizeof(*cmd));
}
static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
{
int rc;
cmd->address = addr;
cmd->length = length;
cmd->type = ND_ARS_PERSISTENT;
while (1) {
rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
sizeof(*cmd));
if (rc)
return rc;
switch (cmd->status) {
case 0:
return 0;
case 1:
/* ARS unsupported, but we should never get here */
return 0;
case 2:
return -EINVAL;
case 3:
/* ARS is in progress */
msleep(1000);
break;
default:
return -ENXIO;
}
}
}
static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
struct nd_cmd_ars_status *cmd)
{
int rc;
while (1) {
rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
sizeof(*cmd));
if (rc || cmd->status & 0xffff)
return -ENXIO;
/* Check extended status (Upper two bytes) */
switch (cmd->status >> 16) {
case 0:
return 0;
case 1:
/* ARS is in progress */
msleep(1000);
break;
case 2:
/* No ARS performed for the current boot */
return 0;
default:
return -ENXIO;
}
}
}
static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
struct nd_cmd_ars_status *ars_status, u64 start)
{
int rc;
u32 i;
/*
* The address field returned by ars_status should be either
* less than or equal to the address we last started ARS for.
* The (start, length) returned by ars_status should also have
* non-zero overlap with the range we started ARS for.
* If this is not the case, bail.
*/
if (ars_status->address > start ||
(ars_status->address + ars_status->length < start))
return -ENXIO;
for (i = 0; i < ars_status->num_records; i++) {
rc = nvdimm_bus_add_poison(nvdimm_bus,
ars_status->records[i].err_address,
ars_status->records[i].length);
if (rc)
return rc;
}
return 0;
}
static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
struct nd_region_desc *ndr_desc)
{
struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
struct nd_cmd_ars_status *ars_status = NULL;
struct nd_cmd_ars_start *ars_start = NULL;
struct nd_cmd_ars_cap *ars_cap = NULL;
u64 start, len, cur, remaining;
int rc;
ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
if (!ars_cap)
return -ENOMEM;
start = ndr_desc->res->start;
len = ndr_desc->res->end - ndr_desc->res->start + 1;
rc = ars_get_cap(nd_desc, ars_cap, start, len);
if (rc)
goto out;
/*
* If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in
* extended status is not set, skip this but continue initialization
*/
if ((ars_cap->status & 0xffff) ||
!(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
dev_warn(acpi_desc->dev,
"ARS unsupported (status: 0x%x), won't create an error list\n",
ars_cap->status);
goto out;
}
/*
* Check if a full-range ARS has been run. If so, use those results
* without having to start a new ARS.
*/
ars_status = kzalloc(ars_cap->max_ars_out + sizeof(*ars_status),
GFP_KERNEL);
if (!ars_status) {
rc = -ENOMEM;
goto out;
}
rc = ars_get_status(nd_desc, ars_status);
if (rc)
goto out;
if (ars_status->address <= start &&
(ars_status->address + ars_status->length >= start + len)) {
rc = ars_status_process_records(nvdimm_bus, ars_status, start);
goto out;
}
/*
* ARS_STATUS can overflow if the number of poison entries found is
* greater than the maximum buffer size (ars_cap->max_ars_out)
* To detect overflow, check if the length field of ars_status
* is less than the length we supplied. If so, process the
* error entries we got, adjust the start point, and start again
*/
ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL);
if (!ars_start)
return -ENOMEM;
cur = start;
remaining = len;
do {
u64 done, end;
rc = ars_do_start(nd_desc, ars_start, cur, remaining);
if (rc)
goto out;
rc = ars_get_status(nd_desc, ars_status);
if (rc)
goto out;
rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
if (rc)
goto out;
end = min(cur + remaining,
ars_status->address + ars_status->length);
done = end - cur;
cur += done;
remaining -= done;
} while (remaining);
out:
kfree(ars_cap);
kfree(ars_start);
kfree(ars_status);
return rc;
}
static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
struct acpi_nfit_memory_map *memdev, struct acpi_nfit_memory_map *memdev,
...@@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ...@@ -1585,6 +1781,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
nvdimm_bus = acpi_desc->nvdimm_bus; nvdimm_bus = acpi_desc->nvdimm_bus;
if (nfit_spa_type(spa) == NFIT_SPA_PM) { if (nfit_spa_type(spa) == NFIT_SPA_PM) {
rc = acpi_nfit_find_poison(acpi_desc, ndr_desc);
if (rc) {
dev_err(acpi_desc->dev,
"error while performing ARS to find poison: %d\n",
rc);
return rc;
}
if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc)) if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
return -ENOMEM; return -ENOMEM;
} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/badblocks.h>
#include <linux/kobject.h> #include <linux/kobject.h>
#include <linux/list.h> #include <linux/list.h>
#include <linux/mm.h> #include <linux/mm.h>
...@@ -28,13 +29,6 @@ ...@@ -28,13 +29,6 @@
#define MaxSector (~(sector_t)0) #define MaxSector (~(sector_t)0)
/* Bad block numbers are stored sorted in a single page.
* 64bits is used for each block or extent.
* 54 bits are sector number, 9 bits are extent size,
* 1 bit is an 'acknowledged' flag.
*/
#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
/* /*
* MD's 'extended' device * MD's 'extended' device
*/ */
@@ -117,22 +111,7 @@ struct md_rdev {
 	struct kernfs_node *sysfs_state;	/* handle for 'state'
 						 * sysfs entry */
-	struct badblocks {
-		int	count;		/* count of bad blocks */
-		int	unacked_exist;	/* there probably are unacknowledged
-					 * bad blocks. This is only cleared
-					 * when a read discovers none
-					 */
-		int	shift;		/* shift from sectors to block size
-					 * a -ve shift means badblocks are
-					 * disabled.*/
-		u64	*page;		/* badblock list */
-		int	changed;
-		seqlock_t lock;
-
-		sector_t sector;
-		sector_t size;		/* in sectors */
-	} badblocks;
+	struct badblocks badblocks;
 };
 
 enum flag_bits {
 	Faulty,			/* device is known to have a fault */
@@ -185,22 +164,11 @@ enum flag_bits {
 	 */
 };
 
-#define BB_LEN_MASK	(0x00000000000001FFULL)
-#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
-#define BB_ACK_MASK	(0x8000000000000000ULL)
-#define BB_MAX_LEN	512
-#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
-#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
-#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
-#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
-
-extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
-			  sector_t *first_bad, int *bad_sectors);
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
 			      sector_t *first_bad, int *bad_sectors)
 {
 	if (unlikely(rdev->badblocks.count)) {
-		int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
+		int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
 					sectors,
 					first_bad, bad_sectors);
 		if (rv)
...@@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, ...@@ -213,8 +181,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new); int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new); int is_new);
extern void md_ack_all_badblocks(struct badblocks *bb);
struct md_cluster_info; struct md_cluster_info;
struct mddev { struct mddev {
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
* General Public License for more details. * General Public License for more details.
*/ */
#include <linux/libnvdimm.h> #include <linux/libnvdimm.h>
#include <linux/badblocks.h>
#include <linux/export.h> #include <linux/export.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
...@@ -325,6 +326,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, ...@@ -325,6 +326,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
if (!nvdimm_bus) if (!nvdimm_bus)
return NULL; return NULL;
INIT_LIST_HEAD(&nvdimm_bus->list); INIT_LIST_HEAD(&nvdimm_bus->list);
INIT_LIST_HEAD(&nvdimm_bus->poison_list);
init_waitqueue_head(&nvdimm_bus->probe_wait); init_waitqueue_head(&nvdimm_bus->probe_wait);
nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
mutex_init(&nvdimm_bus->reconfig_mutex); mutex_init(&nvdimm_bus->reconfig_mutex);
...@@ -359,6 +361,172 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, ...@@ -359,6 +361,172 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
} }
EXPORT_SYMBOL_GPL(__nvdimm_bus_register); EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
static void set_badblock(struct badblocks *bb, sector_t s, int num)
{
dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
(u64) s * 512, (u64) num * 512);
/* this isn't an error as the hardware will still throw an exception */
if (badblocks_set(bb, s, num, 1))
dev_info_once(bb->dev, "%s: failed for sector %llx\n",
__func__, (u64) s);
}
/**
* __add_badblock_range() - Convert a physical address range to bad sectors
* @bb: badblocks instance to populate
* @ns_offset: namespace offset where the error range begins (in bytes)
* @len: number of bytes of poison to be added
*
* This assumes that the range provided with (ns_offset, len) is within
* the bounds of physical addresses for this namespace, i.e. lies in the
* interval [ns_start, ns_start + ns_size)
*/
static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
{
const unsigned int sector_size = 512;
sector_t start_sector;
u64 num_sectors;
u32 rem;
start_sector = div_u64(ns_offset, sector_size);
num_sectors = div_u64_rem(len, sector_size, &rem);
if (rem)
num_sectors++;
if (unlikely(num_sectors > (u64)INT_MAX)) {
u64 remaining = num_sectors;
sector_t s = start_sector;
while (remaining) {
int done = min_t(u64, remaining, INT_MAX);
set_badblock(bb, s, done);
remaining -= done;
s += done;
}
} else
set_badblock(bb, start_sector, num_sectors);
}
/**
* nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks
* @ndns: the namespace containing poison ranges
* @bb: badblocks instance to populate
* @offset: offset at the start of the namespace before 'sector 0'
*
* The poison list generated during NFIT initialization may contain multiple,
* possibly overlapping ranges in the SPA (System Physical Address) space.
* Compare each of these ranges to the namespace currently being initialized,
* and add badblocks to the gendisk for all matching sub-ranges
*/
void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
struct badblocks *bb, resource_size_t offset)
{
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
struct nvdimm_bus *nvdimm_bus;
struct list_head *poison_list;
u64 ns_start, ns_end, ns_size;
struct nd_poison *pl;
ns_size = nvdimm_namespace_capacity(ndns) - offset;
ns_start = nsio->res.start + offset;
ns_end = nsio->res.end;
nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent);
poison_list = &nvdimm_bus->poison_list;
if (list_empty(poison_list))
return;
list_for_each_entry(pl, poison_list, list) {
u64 pl_end = pl->start + pl->length - 1;
/* Discard intervals with no intersection */
if (pl_end < ns_start)
continue;
if (pl->start > ns_end)
continue;
/* Deal with any overlap after start of the namespace */
if (pl->start >= ns_start) {
u64 start = pl->start;
u64 len;
if (pl_end <= ns_end)
len = pl->length;
else
len = ns_start + ns_size - pl->start;
__add_badblock_range(bb, start - ns_start, len);
continue;
}
/* Deal with overlap for poison starting before the namespace */
if (pl->start < ns_start) {
u64 len;
if (pl_end < ns_end)
len = pl->start + pl->length - ns_start;
else
len = ns_size;
__add_badblock_range(bb, 0, len);
}
}
}
EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison);
static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
struct nd_poison *pl;
pl = kzalloc(sizeof(*pl), GFP_KERNEL);
if (!pl)
return -ENOMEM;
pl->start = addr;
pl->length = length;
list_add_tail(&pl->list, &nvdimm_bus->poison_list);
return 0;
}
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
struct nd_poison *pl;
if (list_empty(&nvdimm_bus->poison_list))
return __add_poison(nvdimm_bus, addr, length);
/*
* There is a chance this is a duplicate, check for those first.
* This will be the common case as ARS_STATUS returns all known
* errors in the SPA space, and we can't query it per region
*/
list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
if (pl->start == addr) {
/* If length has changed, update this list entry */
if (pl->length != length)
pl->length = length;
return 0;
}
/*
* If not a duplicate or a simple length update, add the entry as is,
* as any overlapping ranges will get resolved when the list is consumed
* and converted to badblocks
*/
return __add_poison(nvdimm_bus, addr, length);
}
EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
static void free_poison_list(struct list_head *poison_list)
{
struct nd_poison *pl, *next;
list_for_each_entry_safe(pl, next, poison_list, list) {
list_del(&pl->list);
kfree(pl);
}
list_del_init(poison_list);
}
static int child_unregister(struct device *dev, void *data) static int child_unregister(struct device *dev, void *data)
{ {
/* /*
...@@ -385,6 +553,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) ...@@ -385,6 +553,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
nd_synchronize(); nd_synchronize();
device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
free_poison_list(&nvdimm_bus->poison_list);
nvdimm_bus_destroy_ndctl(nvdimm_bus); nvdimm_bus_destroy_ndctl(nvdimm_bus);
device_unregister(&nvdimm_bus->dev); device_unregister(&nvdimm_bus->dev);
......
...@@ -77,6 +77,59 @@ static bool is_namespace_io(struct device *dev) ...@@ -77,6 +77,59 @@ static bool is_namespace_io(struct device *dev)
return dev ? dev->type == &namespace_io_device_type : false; return dev ? dev->type == &namespace_io_device_type : false;
} }
static int is_uuid_busy(struct device *dev, void *data)
{
u8 *uuid1 = data, *uuid2 = NULL;
if (is_namespace_pmem(dev)) {
struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
uuid2 = nspm->uuid;
} else if (is_namespace_blk(dev)) {
struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
uuid2 = nsblk->uuid;
} else if (is_nd_btt(dev)) {
struct nd_btt *nd_btt = to_nd_btt(dev);
uuid2 = nd_btt->uuid;
} else if (is_nd_pfn(dev)) {
struct nd_pfn *nd_pfn = to_nd_pfn(dev);
uuid2 = nd_pfn->uuid;
}
if (uuid2 && memcmp(uuid1, uuid2, NSLABEL_UUID_LEN) == 0)
return -EBUSY;
return 0;
}
static int is_namespace_uuid_busy(struct device *dev, void *data)
{
if (is_nd_pmem(dev) || is_nd_blk(dev))
return device_for_each_child(dev, data, is_uuid_busy);
return 0;
}
/**
* nd_is_uuid_unique - verify that no other namespace has @uuid
* @dev: any device on a nvdimm_bus
* @uuid: uuid to check
*/
bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
{
struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
if (!nvdimm_bus)
return false;
WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
if (device_for_each_child(&nvdimm_bus->dev, uuid,
is_namespace_uuid_busy) != 0)
return false;
return true;
}
bool pmem_should_map_pages(struct device *dev) bool pmem_should_map_pages(struct device *dev)
{ {
struct nd_region *nd_region = to_nd_region(dev->parent); struct nd_region *nd_region = to_nd_region(dev->parent);
@@ -104,20 +157,10 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
 	const char *suffix = NULL;
 
-	if (ndns->claim) {
-		if (is_nd_btt(ndns->claim))
-			suffix = "s";
-		else if (is_nd_pfn(ndns->claim))
-			suffix = "m";
-		else
-			dev_WARN_ONCE(&ndns->dev, 1,
-					"unknown claim type by %s\n",
-					dev_name(ndns->claim));
-	}
+	if (ndns->claim && is_nd_btt(ndns->claim))
+		suffix = "s";
 
 	if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) {
-		if (!suffix && pmem_should_map_pages(&ndns->dev))
-			suffix = "m";
 		sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : "");
 	} else if (is_namespace_blk(&ndns->dev)) {
 		struct nd_namespace_blk *nsblk;
...@@ -791,6 +834,15 @@ static void nd_namespace_pmem_set_size(struct nd_region *nd_region, ...@@ -791,6 +834,15 @@ static void nd_namespace_pmem_set_size(struct nd_region *nd_region,
res->end = nd_region->ndr_start + size - 1; res->end = nd_region->ndr_start + size - 1;
} }
static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where)
{
if (!uuid) {
dev_dbg(dev, "%s: uuid not set\n", where);
return true;
}
return false;
}
static ssize_t __size_store(struct device *dev, unsigned long long val) static ssize_t __size_store(struct device *dev, unsigned long long val)
{ {
resource_size_t allocated = 0, available = 0; resource_size_t allocated = 0, available = 0;
@@ -820,8 +872,12 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
 	 * We need a uuid for the allocation-label and dimm(s) on which
 	 * to store the label.
 	 */
-	if (!uuid || nd_region->ndr_mappings == 0)
+	if (uuid_not_set(uuid, dev, __func__))
+		return -ENXIO;
+	if (nd_region->ndr_mappings == 0) {
+		dev_dbg(dev, "%s: not associated with dimm(s)\n", __func__);
 		return -ENXIO;
+	}
div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder); div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
if (remainder) { if (remainder) {
...@@ -1211,6 +1267,29 @@ static ssize_t holder_show(struct device *dev, ...@@ -1211,6 +1267,29 @@ static ssize_t holder_show(struct device *dev,
} }
static DEVICE_ATTR_RO(holder); static DEVICE_ATTR_RO(holder);
static ssize_t mode_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_namespace_common *ndns = to_ndns(dev);
struct device *claim;
char *mode;
ssize_t rc;
device_lock(dev);
claim = ndns->claim;
if (pmem_should_map_pages(dev) || (claim && is_nd_pfn(claim)))
mode = "memory";
else if (claim && is_nd_btt(claim))
mode = "safe";
else
mode = "raw";
rc = sprintf(buf, "%s\n", mode);
device_unlock(dev);
return rc;
}
static DEVICE_ATTR_RO(mode);
static ssize_t force_raw_store(struct device *dev, static ssize_t force_raw_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len) struct device_attribute *attr, const char *buf, size_t len)
{ {
...@@ -1234,6 +1313,7 @@ static DEVICE_ATTR_RW(force_raw); ...@@ -1234,6 +1313,7 @@ static DEVICE_ATTR_RW(force_raw);
static struct attribute *nd_namespace_attributes[] = { static struct attribute *nd_namespace_attributes[] = {
&dev_attr_nstype.attr, &dev_attr_nstype.attr,
&dev_attr_size.attr, &dev_attr_size.attr,
&dev_attr_mode.attr,
&dev_attr_uuid.attr, &dev_attr_uuid.attr,
&dev_attr_holder.attr, &dev_attr_holder.attr,
&dev_attr_resource.attr, &dev_attr_resource.attr,
@@ -1267,7 +1347,8 @@ static umode_t namespace_visible(struct kobject *kobj,
 		if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr
 				|| a == &dev_attr_holder.attr
-				|| a == &dev_attr_force_raw.attr)
+				|| a == &dev_attr_force_raw.attr
+				|| a == &dev_attr_mode.attr)
 			return a->mode;
 
 	return 0;
@@ -1343,14 +1424,19 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
 		struct nd_namespace_pmem *nspm;
 
 		nspm = to_nd_namespace_pmem(&ndns->dev);
-		if (!nspm->uuid) {
-			dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__);
+		if (uuid_not_set(nspm->uuid, &ndns->dev, __func__))
 			return ERR_PTR(-ENODEV);
-		}
 	} else if (is_namespace_blk(&ndns->dev)) {
 		struct nd_namespace_blk *nsblk;
 
 		nsblk = to_nd_namespace_blk(&ndns->dev);
+		if (uuid_not_set(nsblk->uuid, &ndns->dev, __func__))
+			return ERR_PTR(-ENODEV);
+		if (!nsblk->lbasize) {
+			dev_dbg(&ndns->dev, "%s: sector size not set\n",
+					__func__);
+			return ERR_PTR(-ENODEV);
+		}
 		if (!nd_namespace_blk_validate(nsblk))
 			return ERR_PTR(-ENODEV);
 	}
...@@ -1689,6 +1775,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region) ...@@ -1689,6 +1775,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region)
nd_device_register(nd_region->ns_seed); nd_device_register(nd_region->ns_seed);
} }
void nd_region_create_pfn_seed(struct nd_region *nd_region)
{
WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
nd_region->pfn_seed = nd_pfn_create(nd_region);
/*
* Seed creation failures are not fatal, provisioning is simply
* disabled until memory becomes available
*/
if (!nd_region->pfn_seed)
dev_err(&nd_region->dev, "failed to create pfn namespace\n");
}
void nd_region_create_btt_seed(struct nd_region *nd_region) void nd_region_create_btt_seed(struct nd_region *nd_region)
{ {
WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
......
...@@ -30,6 +30,7 @@ struct nvdimm_bus { ...@@ -30,6 +30,7 @@ struct nvdimm_bus {
struct list_head list; struct list_head list;
struct device dev; struct device dev;
int id, probe_active; int id, probe_active;
struct list_head poison_list;
struct mutex reconfig_mutex; struct mutex reconfig_mutex;
}; };
...@@ -52,6 +53,7 @@ void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); ...@@ -52,6 +53,7 @@ void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
struct nd_region; struct nd_region;
void nd_region_create_blk_seed(struct nd_region *nd_region); void nd_region_create_blk_seed(struct nd_region *nd_region);
void nd_region_create_btt_seed(struct nd_region *nd_region); void nd_region_create_btt_seed(struct nd_region *nd_region);
void nd_region_create_pfn_seed(struct nd_region *nd_region);
void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev); void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
......
@@ -29,13 +29,12 @@ enum {
 	ND_MAX_LANES = 256,
 	SECTOR_SHIFT = 9,
 	INT_LBASIZE_ALIGNMENT = 64,
-#if IS_ENABLED(CONFIG_NVDIMM_PFN)
-	ND_PFN_ALIGN = PAGES_PER_SECTION * PAGE_SIZE,
-	ND_PFN_MASK = ND_PFN_ALIGN - 1,
-#else
-	ND_PFN_ALIGN = 0,
-	ND_PFN_MASK = 0,
-#endif
+};
+
+struct nd_poison {
+	u64 start;
+	u64 length;
+	struct list_head list;
 };
struct nvdimm_drvdata { struct nvdimm_drvdata {
...@@ -153,6 +152,7 @@ struct nd_pfn { ...@@ -153,6 +152,7 @@ struct nd_pfn {
int id; int id;
u8 *uuid; u8 *uuid;
struct device dev; struct device dev;
unsigned long align;
unsigned long npfns; unsigned long npfns;
enum nd_pfn_mode mode; enum nd_pfn_mode mode;
struct nd_pfn_sb *pfn_sb; struct nd_pfn_sb *pfn_sb;
...@@ -262,6 +262,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); ...@@ -262,6 +262,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns);
const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
char *name); char *name);
void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
struct badblocks *bb, resource_size_t offset);
int nd_blk_region_init(struct nd_region *nd_region); int nd_blk_region_init(struct nd_region *nd_region);
void __nd_iostat_start(struct bio *bio, unsigned long *start); void __nd_iostat_start(struct bio *bio, unsigned long *start);
static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
......
...@@ -103,6 +103,52 @@ static ssize_t mode_store(struct device *dev, ...@@ -103,6 +103,52 @@ static ssize_t mode_store(struct device *dev,
} }
static DEVICE_ATTR_RW(mode); static DEVICE_ATTR_RW(mode);
static ssize_t align_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_pfn *nd_pfn = to_nd_pfn(dev);
return sprintf(buf, "%lx\n", nd_pfn->align);
}
static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf)
{
unsigned long val;
int rc;
rc = kstrtoul(buf, 0, &val);
if (rc)
return rc;
if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G)
return -EINVAL;
if (nd_pfn->dev.driver)
return -EBUSY;
else
nd_pfn->align = val;
return 0;
}
static ssize_t align_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct nd_pfn *nd_pfn = to_nd_pfn(dev);
ssize_t rc;
device_lock(dev);
nvdimm_bus_lock(dev);
rc = __align_store(nd_pfn, buf);
dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
rc, buf, buf[len - 1] == '\n' ? "" : "\n");
nvdimm_bus_unlock(dev);
device_unlock(dev);
return rc ? rc : len;
}
static DEVICE_ATTR_RW(align);
static ssize_t uuid_show(struct device *dev, static ssize_t uuid_show(struct device *dev,
struct device_attribute *attr, char *buf) struct device_attribute *attr, char *buf)
{ {
...@@ -164,6 +210,7 @@ static struct attribute *nd_pfn_attributes[] = { ...@@ -164,6 +210,7 @@ static struct attribute *nd_pfn_attributes[] = {
&dev_attr_mode.attr, &dev_attr_mode.attr,
&dev_attr_namespace.attr, &dev_attr_namespace.attr,
&dev_attr_uuid.attr, &dev_attr_uuid.attr,
&dev_attr_align.attr,
NULL, NULL,
}; };
@@ -179,7 +226,6 @@ static const struct attribute_group *nd_pfn_attribute_groups[] = {
 };
 
 static struct device *__nd_pfn_create(struct nd_region *nd_region,
-		u8 *uuid, enum nd_pfn_mode mode,
 		struct nd_namespace_common *ndns)
 {
 	struct nd_pfn *nd_pfn;
@@ -199,10 +245,8 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region,
 		return NULL;
 	}
 
-	nd_pfn->mode = mode;
-	if (uuid)
-		uuid = kmemdup(uuid, 16, GFP_KERNEL);
-	nd_pfn->uuid = uuid;
+	nd_pfn->mode = PFN_MODE_NONE;
+	nd_pfn->align = HPAGE_SIZE;
 	dev = &nd_pfn->dev;
 	dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id);
 	dev->parent = &nd_region->dev;
@@ -220,8 +264,7 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region,
 struct device *nd_pfn_create(struct nd_region *nd_region)
 {
-	struct device *dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE,
-			NULL);
+	struct device *dev = __nd_pfn_create(nd_region, NULL);
 
 	if (dev)
 		__nd_device_register(dev);
...@@ -230,10 +273,11 @@ struct device *nd_pfn_create(struct nd_region *nd_region) ...@@ -230,10 +273,11 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
int nd_pfn_validate(struct nd_pfn *nd_pfn) int nd_pfn_validate(struct nd_pfn *nd_pfn)
{ {
struct nd_namespace_common *ndns = nd_pfn->ndns;
struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
struct nd_namespace_io *nsio;
u64 checksum, offset; u64 checksum, offset;
struct nd_namespace_io *nsio;
struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
struct nd_namespace_common *ndns = nd_pfn->ndns;
const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev);
if (!pfn_sb || !ndns) if (!pfn_sb || !ndns)
return -ENODEV; return -ENODEV;
...@@ -241,10 +285,6 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) ...@@ -241,10 +285,6 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
if (!is_nd_pmem(nd_pfn->dev.parent)) if (!is_nd_pmem(nd_pfn->dev.parent))
return -ENODEV; return -ENODEV;
/* section alignment for simple hotplug */
if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN)
return -ENODEV;
if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb))) if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)))
return -ENXIO; return -ENXIO;
...@@ -257,6 +297,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) ...@@ -257,6 +297,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
return -ENODEV; return -ENODEV;
pfn_sb->checksum = cpu_to_le64(checksum); pfn_sb->checksum = cpu_to_le64(checksum);
if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0)
return -ENODEV;
switch (le32_to_cpu(pfn_sb->mode)) { switch (le32_to_cpu(pfn_sb->mode)) {
case PFN_MODE_RAM: case PFN_MODE_RAM:
break; break;
...@@ -278,6 +321,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn) ...@@ -278,6 +321,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
return -EINVAL; return -EINVAL;
} }
if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) {
dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
nd_pfn->align, nvdimm_namespace_capacity(ndns));
return -EINVAL;
}
/* /*
* These warnings are verbose because they can only trigger in * These warnings are verbose because they can only trigger in
* the case where the physical address alignment of the * the case where the physical address alignment of the
@@ -286,17 +335,19 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
 	 */
 	offset = le64_to_cpu(pfn_sb->dataoff);
 	nsio = to_nd_namespace_io(&ndns->dev);
-	if (nsio->res.start & ND_PFN_MASK) {
-		dev_err(&nd_pfn->dev,
-				"init failed: %s not section aligned\n",
-				dev_name(&ndns->dev));
-		return -EBUSY;
-	} else if (offset >= resource_size(&nsio->res)) {
+	if (offset >= resource_size(&nsio->res)) {
 		dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n",
 				dev_name(&ndns->dev));
 		return -EBUSY;
 	}
 
+	nd_pfn->align = 1UL << ilog2(offset);
+	if (!is_power_of_2(offset) || offset < PAGE_SIZE) {
+		dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n",
+				offset);
+		return -ENXIO;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(nd_pfn_validate);
...@@ -313,7 +364,7 @@ int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata) ...@@ -313,7 +364,7 @@ int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
return -ENODEV; return -ENODEV;
nvdimm_bus_lock(&ndns->dev); nvdimm_bus_lock(&ndns->dev);
dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, ndns); dev = __nd_pfn_create(nd_region, ndns);
nvdimm_bus_unlock(&ndns->dev); nvdimm_bus_unlock(&ndns->dev);
if (!dev) if (!dev)
return -ENOMEM; return -ENOMEM;
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/memory_hotplug.h> #include <linux/memory_hotplug.h>
#include <linux/moduleparam.h> #include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/pmem.h> #include <linux/pmem.h>
...@@ -41,11 +42,25 @@ struct pmem_device { ...@@ -41,11 +42,25 @@ struct pmem_device {
phys_addr_t data_offset; phys_addr_t data_offset;
void __pmem *virt_addr; void __pmem *virt_addr;
size_t size; size_t size;
struct badblocks bb;
}; };
 static int pmem_major;
 
-static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
+{
+	if (bb->count) {
+		sector_t first_bad;
+		int num_bad;
+
+		return !!badblocks_check(bb, sector, len / 512, &first_bad,
+				&num_bad);
+	}
+
+	return false;
+}
+
+static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 			unsigned int len, unsigned int off, int rw,
 			sector_t sector)
 {
...@@ -54,6 +69,8 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, ...@@ -54,6 +69,8 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
void __pmem *pmem_addr = pmem->virt_addr + pmem_off; void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
if (rw == READ) { if (rw == READ) {
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
return -EIO;
memcpy_from_pmem(mem + off, pmem_addr, len); memcpy_from_pmem(mem + off, pmem_addr, len);
flush_dcache_page(page); flush_dcache_page(page);
} else { } else {
...@@ -62,10 +79,12 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, ...@@ -62,10 +79,12 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
} }
kunmap_atomic(mem); kunmap_atomic(mem);
return 0;
} }
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{ {
int rc = 0;
bool do_acct; bool do_acct;
unsigned long start; unsigned long start;
struct bio_vec bvec; struct bio_vec bvec;
...@@ -74,9 +93,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) ...@@ -74,9 +93,15 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	struct pmem_device *pmem = bdev->bd_disk->private_data;
 
 	do_acct = nd_iostat_start(bio, &start);
-	bio_for_each_segment(bvec, bio, iter)
-		pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
-				bio_data_dir(bio), iter.bi_sector);
+	bio_for_each_segment(bvec, bio, iter) {
+		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
+				bvec.bv_offset, bio_data_dir(bio),
+				iter.bi_sector);
+		if (rc) {
+			bio->bi_error = rc;
+			break;
+		}
+	}
if (do_acct) if (do_acct)
nd_iostat_end(bio, start); nd_iostat_end(bio, start);
@@ -91,13 +116,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, int rw)
 {
 	struct pmem_device *pmem = bdev->bd_disk->private_data;
+	int rc;
 
-	pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
 	if (rw & WRITE)
 		wmb_pmem();
-	page_endio(page, rw & WRITE, 0);
 
-	return 0;
+	/*
+	 * The ->rw_page interface is subtle and tricky. The core
+	 * retries on any error, so we can only invoke page_endio() in
+	 * the successful completion case. Otherwise, we'll see crashes
+	 * caused by double completion.
+	 */
+	if (rc == 0)
+		page_endio(page, rw & WRITE, 0);
+
+	return rc;
 }
static long pmem_direct_access(struct block_device *bdev, sector_t sector, static long pmem_direct_access(struct block_device *bdev, sector_t sector,
...@@ -195,7 +229,12 @@ static int pmem_attach_disk(struct device *dev, ...@@ -195,7 +229,12 @@ static int pmem_attach_disk(struct device *dev,
disk->driverfs_dev = dev; disk->driverfs_dev = dev;
set_capacity(disk, (pmem->size - pmem->data_offset) / 512); set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
pmem->pmem_disk = disk; pmem->pmem_disk = disk;
devm_exit_badblocks(dev, &pmem->bb);
if (devm_init_badblocks(dev, &pmem->bb))
return -ENOMEM;
nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
disk->bb = &pmem->bb;
add_disk(disk); add_disk(disk);
revalidate_disk(disk); revalidate_disk(disk);
...@@ -212,9 +251,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns, ...@@ -212,9 +251,13 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
return -EFAULT; return -EFAULT;
} }
if (rw == READ) if (rw == READ) {
unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
return -EIO;
memcpy_from_pmem(buf, pmem->virt_addr + offset, size); memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
else { } else {
memcpy_to_pmem(pmem->virt_addr + offset, buf, size); memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
wmb_pmem(); wmb_pmem();
} }
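The sz_align computation above widens a byte-granular access to whole 512-byte sectors before the badblocks list is consulted. A small worked example, with values chosen purely for illustration:

/*
 * Not part of the patch: suppose offset = 520 and size = 16.
 * offset & (512 - 1) = 8, so sz_align = ALIGN(16 + 8, 512) = 512 bytes.
 * is_bad_pmem() is then asked about sector 520 / 512 = 1 for one full
 * sector, even though the read touches only 16 bytes of it.
 */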
...@@ -238,14 +281,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) ...@@ -238,14 +281,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
nd_pfn->pfn_sb = pfn_sb; nd_pfn->pfn_sb = pfn_sb;
rc = nd_pfn_validate(nd_pfn); rc = nd_pfn_validate(nd_pfn);
if (rc == 0 || rc == -EBUSY) if (rc == -ENODEV)
/* no info block, do init */;
else
return rc; return rc;
/* section alignment for simple hotplug */
if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN
|| pmem->phys_addr & ND_PFN_MASK)
return -ENODEV;
nd_region = to_nd_region(nd_pfn->dev.parent); nd_region = to_nd_region(nd_pfn->dev.parent);
if (nd_region->ro) { if (nd_region->ro) {
dev_info(&nd_pfn->dev, dev_info(&nd_pfn->dev,
...@@ -263,9 +303,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) ...@@ -263,9 +303,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
* ->direct_access() to those that are included in the memmap. * ->direct_access() to those that are included in the memmap.
*/ */
if (nd_pfn->mode == PFN_MODE_PMEM) if (nd_pfn->mode == PFN_MODE_PMEM)
offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE); offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
else if (nd_pfn->mode == PFN_MODE_RAM) else if (nd_pfn->mode == PFN_MODE_RAM)
offset = SZ_8K; offset = ALIGN(SZ_8K, nd_pfn->align);
else else
goto err; goto err;
...@@ -275,6 +315,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) ...@@ -275,6 +315,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
pfn_sb->npfns = cpu_to_le64(npfns); pfn_sb->npfns = cpu_to_le64(npfns);
memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN); memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
pfn_sb->version_major = cpu_to_le16(1); pfn_sb->version_major = cpu_to_le16(1);
checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
pfn_sb->checksum = cpu_to_le64(checksum); pfn_sb->checksum = cpu_to_le64(checksum);
...@@ -326,21 +367,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) ...@@ -326,21 +367,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
if (rc) if (rc)
return rc; return rc;
if (PAGE_SIZE != SZ_4K) {
dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n");
return -ENXIO;
}
if (nsio->res.start & ND_PFN_MASK) {
dev_err(dev, "%s not memory hotplug section aligned\n",
dev_name(&ndns->dev));
return -ENXIO;
}
pfn_sb = nd_pfn->pfn_sb; pfn_sb = nd_pfn->pfn_sb;
offset = le64_to_cpu(pfn_sb->dataoff); offset = le64_to_cpu(pfn_sb->dataoff);
nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode); nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
if (nd_pfn->mode == PFN_MODE_RAM) { if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset != SZ_8K) if (offset < SZ_8K)
return -EINVAL; return -EINVAL;
nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
altmap = NULL; altmap = NULL;
...@@ -389,6 +420,9 @@ static int nd_pmem_probe(struct device *dev) ...@@ -389,6 +420,9 @@ static int nd_pmem_probe(struct device *dev)
pmem->ndns = ndns; pmem->ndns = ndns;
dev_set_drvdata(dev, pmem); dev_set_drvdata(dev, pmem);
ndns->rw_bytes = pmem_rw_bytes; ndns->rw_bytes = pmem_rw_bytes;
if (devm_init_badblocks(dev, &pmem->bb))
return -ENOMEM;
nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
if (is_nd_btt(dev)) if (is_nd_btt(dev))
return nvdimm_namespace_attach_btt(ndns); return nvdimm_namespace_attach_btt(ndns);
......
...@@ -134,62 +134,6 @@ int nd_region_to_nstype(struct nd_region *nd_region) ...@@ -134,62 +134,6 @@ int nd_region_to_nstype(struct nd_region *nd_region)
} }
EXPORT_SYMBOL(nd_region_to_nstype); EXPORT_SYMBOL(nd_region_to_nstype);
static int is_uuid_busy(struct device *dev, void *data)
{
struct nd_region *nd_region = to_nd_region(dev->parent);
u8 *uuid = data;
switch (nd_region_to_nstype(nd_region)) {
case ND_DEVICE_NAMESPACE_PMEM: {
struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
if (!nspm->uuid)
break;
if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0)
return -EBUSY;
break;
}
case ND_DEVICE_NAMESPACE_BLK: {
struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
if (!nsblk->uuid)
break;
if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0)
return -EBUSY;
break;
}
default:
break;
}
return 0;
}
static int is_namespace_uuid_busy(struct device *dev, void *data)
{
if (is_nd_pmem(dev) || is_nd_blk(dev))
return device_for_each_child(dev, data, is_uuid_busy);
return 0;
}
/**
* nd_is_uuid_unique - verify that no other namespace has @uuid
* @dev: any device on a nvdimm_bus
* @uuid: uuid to check
*/
bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
{
struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
if (!nvdimm_bus)
return false;
WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
if (device_for_each_child(&nvdimm_bus->dev, uuid,
is_namespace_uuid_busy) != 0)
return false;
return true;
}
static ssize_t size_show(struct device *dev, static ssize_t size_show(struct device *dev,
struct device_attribute *attr, char *buf) struct device_attribute *attr, char *buf)
{ {
...@@ -406,6 +350,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) ...@@ -406,6 +350,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
struct nd_interleave_set *nd_set = nd_region->nd_set; struct nd_interleave_set *nd_set = nd_region->nd_set;
int type = nd_region_to_nstype(nd_region); int type = nd_region_to_nstype(nd_region);
if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr)
return 0;
if (a != &dev_attr_set_cookie.attr if (a != &dev_attr_set_cookie.attr
&& a != &dev_attr_available_size.attr) && a != &dev_attr_available_size.attr)
return a->mode; return a->mode;
...@@ -487,6 +434,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, ...@@ -487,6 +434,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
nd_region_create_blk_seed(nd_region); nd_region_create_blk_seed(nd_region);
nvdimm_bus_unlock(dev); nvdimm_bus_unlock(dev);
} }
if (is_nd_pfn(dev) && probe) {
nd_region = to_nd_region(dev->parent);
nvdimm_bus_lock(dev);
if (nd_region->pfn_seed == dev)
nd_region_create_pfn_seed(nd_region);
nvdimm_bus_unlock(dev);
}
} }
void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev) void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
......
...@@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock, ...@@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
return 0; return 0;
} }
static struct inode *bdev_file_inode(struct file *file)
{
return file->f_mapping->host;
}
static ssize_t static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host; struct inode *inode = bdev_file_inode(file);
if (IS_DAX(inode)) if (IS_DAX(inode))
return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
...@@ -338,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, ...@@ -338,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
*/ */
static loff_t block_llseek(struct file *file, loff_t offset, int whence) static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{ {
struct inode *bd_inode = file->f_mapping->host; struct inode *bd_inode = bdev_file_inode(file);
loff_t retval; loff_t retval;
mutex_lock(&bd_inode->i_mutex); mutex_lock(&bd_inode->i_mutex);
...@@ -349,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) ...@@ -349,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{ {
struct inode *bd_inode = filp->f_mapping->host; struct inode *bd_inode = bdev_file_inode(filp);
struct block_device *bdev = I_BDEV(bd_inode); struct block_device *bdev = I_BDEV(bd_inode);
int error; int error;
...@@ -1224,8 +1229,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) ...@@ -1224,8 +1229,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
} }
} }
if (!ret) if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
if (!blkdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
/* /*
* If the device is invalidated, rescan partition * If the device is invalidated, rescan partition
...@@ -1239,6 +1247,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) ...@@ -1239,6 +1247,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
else if (ret == -ENOMEDIUM) else if (ret == -ENOMEDIUM)
invalidate_partitions(disk, bdev); invalidate_partitions(disk, bdev);
} }
if (ret) if (ret)
goto out_clear; goto out_clear;
} else { } else {
...@@ -1259,12 +1268,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) ...@@ -1259,12 +1268,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear; goto out_clear;
} }
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
/* if (!blkdev_dax_capable(bdev))
* If the partition is not aligned on a page
* boundary, we can't do dax I/O to it.
*/
if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
(bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
bdev->bd_inode->i_flags &= ~S_DAX; bdev->bd_inode->i_flags &= ~S_DAX;
} }
} else { } else {
...@@ -1599,14 +1603,14 @@ EXPORT_SYMBOL(blkdev_put); ...@@ -1599,14 +1603,14 @@ EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp) static int blkdev_close(struct inode * inode, struct file * filp)
{ {
struct block_device *bdev = I_BDEV(filp->f_mapping->host); struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
blkdev_put(bdev, filp->f_mode); blkdev_put(bdev, filp->f_mode);
return 0; return 0;
} }
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{ {
struct block_device *bdev = I_BDEV(file->f_mapping->host); struct block_device *bdev = I_BDEV(bdev_file_inode(file));
fmode_t mode = file->f_mode; fmode_t mode = file->f_mode;
/* /*
...@@ -1631,7 +1635,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ...@@ -1631,7 +1635,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *bd_inode = file->f_mapping->host; struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode); loff_t size = i_size_read(bd_inode);
struct blk_plug plug; struct blk_plug plug;
ssize_t ret; ssize_t ret;
...@@ -1663,7 +1667,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter); ...@@ -1663,7 +1667,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter);
ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *bd_inode = file->f_mapping->host; struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode); loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
...@@ -1702,13 +1706,101 @@ static const struct address_space_operations def_blk_aops = { ...@@ -1702,13 +1706,101 @@ static const struct address_space_operations def_blk_aops = {
.is_dirty_writeback = buffer_check_dirty_writeback, .is_dirty_writeback = buffer_check_dirty_writeback,
}; };
#ifdef CONFIG_FS_DAX
/*
* In the raw block case we do not need to contend with truncation nor
* unwritten file extents. Without those concerns there is no need for
* additional locking beyond the mmap_sem context that these routines
* are already executing under.
*
* Note, there is no protection if the block device is dynamically
* resized (partition grow/shrink) during a fault. A stable block device
* size is already not enforced in the blkdev_direct_IO path.
*
* For DAX, it is the responsibility of the block device driver to
* ensure the whole-disk device size is stable while requests are in
* flight.
*
* Finally, unlike the filemap_page_mkwrite() case there is no
* filesystem superblock to sync against freezing. We still include a
* pfn_mkwrite callback for dax drivers to receive write fault
* notifications.
*/
static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return __dax_fault(vma, vmf, blkdev_get_block, NULL);
}
static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags)
{
return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
}
static void blkdev_vm_open(struct vm_area_struct *vma)
{
struct inode *bd_inode = bdev_file_inode(vma->vm_file);
struct block_device *bdev = I_BDEV(bd_inode);
mutex_lock(&bd_inode->i_mutex);
bdev->bd_map_count++;
mutex_unlock(&bd_inode->i_mutex);
}
static void blkdev_vm_close(struct vm_area_struct *vma)
{
struct inode *bd_inode = bdev_file_inode(vma->vm_file);
struct block_device *bdev = I_BDEV(bd_inode);
mutex_lock(&bd_inode->i_mutex);
bdev->bd_map_count--;
mutex_unlock(&bd_inode->i_mutex);
}
static const struct vm_operations_struct blkdev_dax_vm_ops = {
.open = blkdev_vm_open,
.close = blkdev_vm_close,
.fault = blkdev_dax_fault,
.pmd_fault = blkdev_dax_pmd_fault,
.pfn_mkwrite = blkdev_dax_fault,
};
static const struct vm_operations_struct blkdev_default_vm_ops = {
.open = blkdev_vm_open,
.close = blkdev_vm_close,
.fault = filemap_fault,
.map_pages = filemap_map_pages,
};
static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *bd_inode = bdev_file_inode(file);
struct block_device *bdev = I_BDEV(bd_inode);
file_accessed(file);
mutex_lock(&bd_inode->i_mutex);
bdev->bd_map_count++;
if (IS_DAX(bd_inode)) {
vma->vm_ops = &blkdev_dax_vm_ops;
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &blkdev_default_vm_ops;
}
mutex_unlock(&bd_inode->i_mutex);
return 0;
}
#else
#define blkdev_mmap generic_file_mmap
#endif
const struct file_operations def_blk_fops = { const struct file_operations def_blk_fops = {
.open = blkdev_open, .open = blkdev_open,
.release = blkdev_close, .release = blkdev_close,
.llseek = block_llseek, .llseek = block_llseek,
.read_iter = blkdev_read_iter, .read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter, .write_iter = blkdev_write_iter,
.mmap = generic_file_mmap, .mmap = blkdev_mmap,
.fsync = blkdev_fsync, .fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl, .unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
......
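For context, the effect of the blkdev_mmap()/blkdev_dax_vm_ops plumbing above as seen from userspace: a raw block device can simply be mapped MAP_SHARED, and when S_DAX is set on its inode the faults go through __dax_fault()/__dax_pmd_fault() straight to the media. A minimal sketch, assuming a pmem device at the hypothetical path /dev/pmem0; if S_DAX is not set the same code still works, just backed by the page cache via blkdev_default_vm_ops:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/pmem0", O_RDWR);	/* hypothetical device */
	void *p;

	if (fd < 0)
		return 1;
	/* 2 MiB, MAP_SHARED: with DAX this maps the media directly */
	p = mmap(NULL, 2 << 20, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		close(fd);
		return 1;
	}
	memset(p, 0, 4096);	/* stores land in persistent memory under DAX */
	munmap(p, 2 << 20);
	close(fd);
	return 0;
}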
#ifndef _LINUX_BADBLOCKS_H
#define _LINUX_BADBLOCKS_H
#include <linux/seqlock.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/types.h>
#define BB_LEN_MASK (0x00000000000001FFULL)
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK (0x8000000000000000ULL)
#define BB_MAX_LEN 512
#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
/* Bad block numbers are stored sorted in a single page.
* 64bits is used for each block or extent.
* 54 bits are sector number, 9 bits are extent size,
* 1 bit is an 'acknowledged' flag.
*/
#define MAX_BADBLOCKS (PAGE_SIZE/8)
struct badblocks {
struct device *dev; /* set by devm_init_badblocks */
int count; /* count of bad blocks */
int unacked_exist; /* there probably are unacknowledged
* bad blocks. This is only cleared
* when a read discovers none
*/
int shift; /* shift from sectors to block size
* a -ve shift means badblocks are
* disabled.*/
u64 *page; /* badblock list */
int changed;
seqlock_t lock;
sector_t sector;
sector_t size; /* in sectors */
};
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
sector_t *first_bad, int *bad_sectors);
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
int acknowledged);
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors);
void ack_all_badblocks(struct badblocks *bb);
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack);
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
int unack);
int badblocks_init(struct badblocks *bb, int enable);
void badblocks_exit(struct badblocks *bb);
struct device;
int devm_init_badblocks(struct device *dev, struct badblocks *bb);
static inline void devm_exit_badblocks(struct device *dev, struct badblocks *bb)
{
if (bb->dev != dev) {
dev_WARN_ONCE(dev, 1, "%s: badblocks instance not associated\n",
__func__);
return;
}
badblocks_exit(bb);
}
#endif
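For reference, a sketch of how the interfaces declared in this header fit together. Illustrative only: the sector numbers are arbitrary, and the return-value convention of badblocks_check() (non-zero when the queried span overlaps a recorded bad range) is inherited from the md code rather than spelled out here:

static int badblocks_example(struct device *dev)
{
	struct badblocks bb;
	sector_t first_bad;
	int bad_sectors;
	u64 entry;
	int rc;

	rc = badblocks_init(&bb, 1);	/* 1 == enabled; allocates bb.page */
	if (rc)
		return rc;

	/* record an 8-sector bad range at sector 1024, not yet acknowledged */
	badblocks_set(&bb, 1024, 8, 0);

	/* a 4-sector access starting at sector 1026 overlaps that range */
	if (badblocks_check(&bb, 1026, 4, &first_bad, &bad_sectors))
		dev_info(dev, "bad range at %llu (+%d sectors)\n",
				(unsigned long long) first_bad, bad_sectors);

	/* the same range packed into a single 64-bit table entry */
	entry = BB_MAKE(1024, 8, 0);
	dev_info(dev, "entry: sector %llu len %llu ack %d\n",
			(unsigned long long) BB_OFFSET(entry),
			(unsigned long long) BB_LEN(entry), BB_ACK(entry));

	ack_all_badblocks(&bb);
	badblocks_exit(&bb);
	return rc;
}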
...@@ -483,6 +483,9 @@ struct block_device { ...@@ -483,6 +483,9 @@ struct block_device {
int bd_fsfreeze_count; int bd_fsfreeze_count;
/* Mutex for freeze */ /* Mutex for freeze */
struct mutex bd_fsfreeze_mutex; struct mutex bd_fsfreeze_mutex;
#ifdef CONFIG_FS_DAX
int bd_map_count;
#endif
}; };
/* /*
...@@ -2280,6 +2283,14 @@ extern struct super_block *freeze_bdev(struct block_device *); ...@@ -2280,6 +2283,14 @@ extern struct super_block *freeze_bdev(struct block_device *);
extern void emergency_thaw_all(void); extern void emergency_thaw_all(void);
extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
extern int fsync_bdev(struct block_device *); extern int fsync_bdev(struct block_device *);
#ifdef CONFIG_FS_DAX
extern bool blkdev_dax_capable(struct block_device *bdev);
#else
static inline bool blkdev_dax_capable(struct block_device *bdev)
{
return false;
}
#endif
extern struct super_block *blockdev_superblock; extern struct super_block *blockdev_superblock;
......
...@@ -162,6 +162,7 @@ struct disk_part_tbl { ...@@ -162,6 +162,7 @@ struct disk_part_tbl {
}; };
struct disk_events; struct disk_events;
struct badblocks;
#if defined(CONFIG_BLK_DEV_INTEGRITY) #if defined(CONFIG_BLK_DEV_INTEGRITY)
...@@ -213,6 +214,7 @@ struct gendisk { ...@@ -213,6 +214,7 @@ struct gendisk {
struct kobject integrity_kobj; struct kobject integrity_kobj;
#endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLK_DEV_INTEGRITY */
int node_id; int node_id;
struct badblocks *bb;
}; };
static inline struct gendisk *part_to_disk(struct hd_struct *part) static inline struct gendisk *part_to_disk(struct hd_struct *part)
......
...@@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( ...@@ -116,6 +116,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
} }
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
struct nvdimm_bus_descriptor *nfit_desc, struct module *module); struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
#define nvdimm_bus_register(parent, desc) \ #define nvdimm_bus_register(parent, desc) \
......
...@@ -188,6 +188,8 @@ struct inodes_stat_t { ...@@ -188,6 +188,8 @@ struct inodes_stat_t {
#define BLKSECDISCARD _IO(0x12,125) #define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126) #define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127) #define BLKZEROOUT _IO(0x12,127)
#define BLKDAXSET _IO(0x12,128)
#define BLKDAXGET _IO(0x12,129)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */ #define FIBMAP _IO(0x00,1) /* bmap access */
......
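A hedged sketch of driving the two new ioctls from userspace. The argument conventions shown here (an integer value for BLKDAXSET, an int written back for BLKDAXGET) are assumptions, since the blkdev_ioctl() handler is not part of this hunk:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int enable_blkdev_dax(const char *path)	/* path is hypothetical, e.g. "/dev/pmem0" */
{
	int fd = open(path, O_RDWR);
	int dax = 0;

	if (fd < 0)
		return -1;
	if (ioctl(fd, BLKDAXSET, 1))		/* fails if the device is not DAX capable */
		dax = -1;
	else if (ioctl(fd, BLKDAXGET, &dax))
		dax = -1;
	close(fd);
	return dax;				/* 1 if S_DAX is now set, 0 or -1 otherwise */
}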
...@@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr) ...@@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr)
break; break;
if (p->end < addr) if (p->end < addr)
continue; continue;
if (p->flags & IORESOURCE_BUSY && /*
p->flags & IORESOURCE_EXCLUSIVE) { * A resource is exclusive if IORESOURCE_EXCLUSIVE is set
* or CONFIG_IO_STRICT_DEVMEM is enabled and the
* resource is busy.
*/
if ((p->flags & IORESOURCE_BUSY) == 0)
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
err = 1; err = 1;
break; break;
} }
......
...@@ -1886,3 +1886,42 @@ source "samples/Kconfig" ...@@ -1886,3 +1886,42 @@ source "samples/Kconfig"
source "lib/Kconfig.kgdb" source "lib/Kconfig.kgdb"
config ARCH_HAS_DEVMEM_IS_ALLOWED
bool
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
depends on ARCH_HAS_DEVMEM_IS_ALLOWED
default y if TILE || PPC
---help---
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
access to this is obviously disastrous, but specific access can
be used by people debugging the kernel. Note that with PAT support
enabled, even in this case there are restrictions on /dev/mem
use due to the cache aliasing requirements.
If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem
file only allows userspace access to PCI space and the BIOS code and
data regions. This is sufficient for dosemu and X and all common
users of /dev/mem.
If in doubt, say Y.
config IO_STRICT_DEVMEM
bool "Filter I/O access to /dev/mem"
depends on STRICT_DEVMEM
default STRICT_DEVMEM
---help---
If this option is disabled, you allow userspace (root) access to all
io-memory regardless of whether a driver is actively using that
range. Accidental access to this is obviously disastrous, but
specific access can be used by people debugging kernel drivers.
If this option is switched on, the /dev/mem file only allows
userspace access to *idle* io-memory ranges (see /proc/iomem). This
may break traditional users of /dev/mem (dosemu, legacy X, etc...)
if the driver using a given range cannot be disabled.
If in doubt, say Y.
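In practice the distinction matters once a driver claims its MMIO window: the range then shows up as busy in /proc/iomem and, with IO_STRICT_DEVMEM=y, /dev/mem access to it is refused until the driver releases it. A minimal driver-side sketch; the resource name and helper are hypothetical, the APIs are the standard devm ones:

#include <linux/device.h>
#include <linux/io.h>
#include <linux/ioport.h>

static void __iomem *example_claim_mmio(struct device *dev,
		resource_size_t start, resource_size_t len)
{
	/* marks [start, start + len) busy in the iomem resource tree */
	if (!devm_request_mem_region(dev, start, len, "example-mmio"))
		return NULL;
	return devm_ioremap_nocache(dev, start, len);
}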
...@@ -9,6 +9,8 @@ ldflags-y += --wrap=memunmap ...@@ -9,6 +9,8 @@ ldflags-y += --wrap=memunmap
ldflags-y += --wrap=__devm_request_region ldflags-y += --wrap=__devm_request_region
ldflags-y += --wrap=__request_region ldflags-y += --wrap=__request_region
ldflags-y += --wrap=__release_region ldflags-y += --wrap=__release_region
ldflags-y += --wrap=devm_memremap_pages
ldflags-y += --wrap=phys_to_pfn_t
DRIVERS := ../../../drivers DRIVERS := ../../../drivers
NVDIMM_SRC := $(DRIVERS)/nvdimm NVDIMM_SRC := $(DRIVERS)/nvdimm
......
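The --wrap flags added above are standard GNU ld symbol interposition: with --wrap=SYM, calls to SYM() in the objects being linked resolve to __wrap_SYM(), while __real_SYM() still reaches the original definition. A generic sketch with a hypothetical symbol; the nfit_test wrappers below follow the same pattern, deciding per call whether to serve the request from the simulated test resources:

/* hypothetical symbol "foo", illustration of ld --wrap only */
extern void *__real_foo(unsigned long arg);

void *__wrap_foo(unsigned long arg)
{
	if (arg == 0)			/* hypothetical interception condition */
		return NULL;		/* handled by the test harness */
	return __real_foo(arg);		/* otherwise fall through to the real foo() */
}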
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/io.h> #include <linux/io.h>
#include <linux/mm.h>
#include "nfit_test.h" #include "nfit_test.h"
static LIST_HEAD(iomap_head); static LIST_HEAD(iomap_head);
...@@ -41,7 +42,7 @@ void nfit_test_teardown(void) ...@@ -41,7 +42,7 @@ void nfit_test_teardown(void)
} }
EXPORT_SYMBOL(nfit_test_teardown); EXPORT_SYMBOL(nfit_test_teardown);
static struct nfit_test_resource *get_nfit_res(resource_size_t resource) static struct nfit_test_resource *__get_nfit_res(resource_size_t resource)
{ {
struct iomap_ops *ops; struct iomap_ops *ops;
...@@ -51,14 +52,22 @@ static struct nfit_test_resource *get_nfit_res(resource_size_t resource) ...@@ -51,14 +52,22 @@ static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
return NULL; return NULL;
} }
void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
{ {
struct nfit_test_resource *nfit_res; struct nfit_test_resource *res;
rcu_read_lock(); rcu_read_lock();
nfit_res = get_nfit_res(offset); res = __get_nfit_res(resource);
rcu_read_unlock(); rcu_read_unlock();
return res;
}
void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
{
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
if (nfit_res) if (nfit_res)
return (void __iomem *) nfit_res->buf + offset return (void __iomem *) nfit_res->buf + offset
- nfit_res->res->start; - nfit_res->res->start;
...@@ -68,11 +77,8 @@ void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, ...@@ -68,11 +77,8 @@ void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
void __iomem *__wrap_devm_ioremap_nocache(struct device *dev, void __iomem *__wrap_devm_ioremap_nocache(struct device *dev,
resource_size_t offset, unsigned long size) resource_size_t offset, unsigned long size)
{ {
struct nfit_test_resource *nfit_res; struct nfit_test_resource *nfit_res = get_nfit_res(offset);
rcu_read_lock();
nfit_res = get_nfit_res(offset);
rcu_read_unlock();
if (nfit_res) if (nfit_res)
return (void __iomem *) nfit_res->buf + offset return (void __iomem *) nfit_res->buf + offset
- nfit_res->res->start; - nfit_res->res->start;
...@@ -83,25 +89,58 @@ EXPORT_SYMBOL(__wrap_devm_ioremap_nocache); ...@@ -83,25 +89,58 @@ EXPORT_SYMBOL(__wrap_devm_ioremap_nocache);
void *__wrap_devm_memremap(struct device *dev, resource_size_t offset, void *__wrap_devm_memremap(struct device *dev, resource_size_t offset,
size_t size, unsigned long flags) size_t size, unsigned long flags)
{ {
struct nfit_test_resource *nfit_res; struct nfit_test_resource *nfit_res = get_nfit_res(offset);
rcu_read_lock();
nfit_res = get_nfit_res(offset);
rcu_read_unlock();
if (nfit_res) if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start; return nfit_res->buf + offset - nfit_res->res->start;
return devm_memremap(dev, offset, size, flags); return devm_memremap(dev, offset, size, flags);
} }
EXPORT_SYMBOL(__wrap_devm_memremap); EXPORT_SYMBOL(__wrap_devm_memremap);
#ifdef __HAVE_ARCH_PTE_DEVMAP
#include <linux/memremap.h>
#include <linux/pfn_t.h>
void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap)
{
resource_size_t offset = res->start;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start;
return devm_memremap_pages(dev, res, ref, altmap);
}
EXPORT_SYMBOL(__wrap_devm_memremap_pages);
pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
{
struct nfit_test_resource *nfit_res = get_nfit_res(addr);
if (nfit_res)
flags &= ~PFN_MAP;
return phys_to_pfn_t(addr, flags);
}
EXPORT_SYMBOL(__wrap_phys_to_pfn_t);
#else
/* to be removed post 4.5-rc1 */
void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res)
{
resource_size_t offset = res->start;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start;
return devm_memremap_pages(dev, res);
}
EXPORT_SYMBOL(__wrap_devm_memremap_pages);
#endif
void *__wrap_memremap(resource_size_t offset, size_t size, void *__wrap_memremap(resource_size_t offset, size_t size,
unsigned long flags) unsigned long flags)
{ {
struct nfit_test_resource *nfit_res; struct nfit_test_resource *nfit_res = get_nfit_res(offset);
rcu_read_lock();
nfit_res = get_nfit_res(offset);
rcu_read_unlock();
if (nfit_res) if (nfit_res)
return nfit_res->buf + offset - nfit_res->res->start; return nfit_res->buf + offset - nfit_res->res->start;
return memremap(offset, size, flags); return memremap(offset, size, flags);
...@@ -110,11 +149,8 @@ EXPORT_SYMBOL(__wrap_memremap); ...@@ -110,11 +149,8 @@ EXPORT_SYMBOL(__wrap_memremap);
void __wrap_devm_memunmap(struct device *dev, void *addr) void __wrap_devm_memunmap(struct device *dev, void *addr)
{ {
struct nfit_test_resource *nfit_res; struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
rcu_read_lock();
nfit_res = get_nfit_res((unsigned long) addr);
rcu_read_unlock();
if (nfit_res) if (nfit_res)
return; return;
return devm_memunmap(dev, addr); return devm_memunmap(dev, addr);
...@@ -135,11 +171,7 @@ EXPORT_SYMBOL(__wrap_ioremap_wc); ...@@ -135,11 +171,7 @@ EXPORT_SYMBOL(__wrap_ioremap_wc);
void __wrap_iounmap(volatile void __iomem *addr) void __wrap_iounmap(volatile void __iomem *addr)
{ {
struct nfit_test_resource *nfit_res; struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
rcu_read_lock();
nfit_res = get_nfit_res((unsigned long) addr);
rcu_read_unlock();
if (nfit_res) if (nfit_res)
return; return;
return iounmap(addr); return iounmap(addr);
...@@ -148,11 +180,8 @@ EXPORT_SYMBOL(__wrap_iounmap); ...@@ -148,11 +180,8 @@ EXPORT_SYMBOL(__wrap_iounmap);
void __wrap_memunmap(void *addr) void __wrap_memunmap(void *addr)
{ {
struct nfit_test_resource *nfit_res; struct nfit_test_resource *nfit_res = get_nfit_res((long) addr);
rcu_read_lock();
nfit_res = get_nfit_res((unsigned long) addr);
rcu_read_unlock();
if (nfit_res) if (nfit_res)
return; return;
return memunmap(addr); return memunmap(addr);
...@@ -166,9 +195,7 @@ static struct resource *nfit_test_request_region(struct device *dev, ...@@ -166,9 +195,7 @@ static struct resource *nfit_test_request_region(struct device *dev,
struct nfit_test_resource *nfit_res; struct nfit_test_resource *nfit_res;
if (parent == &iomem_resource) { if (parent == &iomem_resource) {
rcu_read_lock();
nfit_res = get_nfit_res(start); nfit_res = get_nfit_res(start);
rcu_read_unlock();
if (nfit_res) { if (nfit_res) {
struct resource *res = nfit_res->res + 1; struct resource *res = nfit_res->res + 1;
...@@ -218,9 +245,7 @@ void __wrap___release_region(struct resource *parent, resource_size_t start, ...@@ -218,9 +245,7 @@ void __wrap___release_region(struct resource *parent, resource_size_t start,
struct nfit_test_resource *nfit_res; struct nfit_test_resource *nfit_res;
if (parent == &iomem_resource) { if (parent == &iomem_resource) {
rcu_read_lock();
nfit_res = get_nfit_res(start); nfit_res = get_nfit_res(start);
rcu_read_unlock();
if (nfit_res) { if (nfit_res) {
struct resource *res = nfit_res->res + 1; struct resource *res = nfit_res->res + 1;
......
...@@ -248,6 +248,8 @@ static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd, ...@@ -248,6 +248,8 @@ static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd,
nd_cmd->out_length = 256; nd_cmd->out_length = 256;
nd_cmd->num_records = 0; nd_cmd->num_records = 0;
nd_cmd->address = 0;
nd_cmd->length = -1ULL;
nd_cmd->status = 0; nd_cmd->status = 0;
return 0; return 0;
...@@ -1088,6 +1090,8 @@ static void nfit_test1_setup(struct nfit_test *t) ...@@ -1088,6 +1090,8 @@ static void nfit_test1_setup(struct nfit_test *t)
struct acpi_nfit_memory_map *memdev; struct acpi_nfit_memory_map *memdev;
struct acpi_nfit_control_region *dcr; struct acpi_nfit_control_region *dcr;
struct acpi_nfit_system_address *spa; struct acpi_nfit_system_address *spa;
struct nvdimm_bus_descriptor *nd_desc;
struct acpi_nfit_desc *acpi_desc;
offset = 0; offset = 0;
/* spa0 (flat range with no bdw aliasing) */ /* spa0 (flat range with no bdw aliasing) */
...@@ -1135,6 +1139,13 @@ static void nfit_test1_setup(struct nfit_test *t) ...@@ -1135,6 +1139,13 @@ static void nfit_test1_setup(struct nfit_test *t)
dcr->command_size = 0; dcr->command_size = 0;
dcr->status_offset = 0; dcr->status_offset = 0;
dcr->status_size = 0; dcr->status_size = 0;
acpi_desc = &t->acpi_desc;
set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
nd_desc = &acpi_desc->nd_desc;
nd_desc->ndctl = nfit_test_ctl;
} }
static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
......