Commit d65e1a0f authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 's390-6.10-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux

Pull s390 updates from Alexander Gordeev:

 - Store AP Query Configuration Information in a static buffer

 - Rework the AP initialization and add missing cleanups to the error
   path

 - Swap IRQ and AP bus/device registration to avoid race conditions

 - Export prot_virt_guest symbol

 - Introduce AP configuration changes notifier interface to facilitate
   modularization of the AP bus

 - Add CONFIG_AP kernel configuration option to allow modularization of
   the AP bus

 - Rework CONFIG_ZCRYPT_DEBUG kernel configuration option description
   and dependency and rename it to CONFIG_AP_DEBUG

 - Convert sprintf() and snprintf() to sysfs_emit() in CIO code

 - Adjust indentation of RELOCS command build step

 - Make crypto performance counters upward compatible

 - Convert make_page_secure() and gmap_make_secure() to use folio

 - Rework channel-utilization-block (CUB) handling in preparation of
   introducing additional CUBs

 - Use attribute groups to simplify registration, removal and extension
   of measurement-related channel-path sysfs attributes

 - Add a per-channel-path binary "ext_measurement" sysfs attribute that
   provides access to extended channel-path measurement data

 - Export measurement data for all channel-measurement-groups (CMG), not
   only for specific ones. This enables support of new CMG data
   formats in userspace without the need for kernel changes

 - Add a per-channel-path sysfs attribute "speed_bps" that provides the
   operating speed in bits per second or 0 if the operating speed is not
   available

 - The CIO tracepoint subchannel-type field "st" is incorrectly set to
   the value of subchannel-enabled SCHIB "ena" field. Fix that

 - Do not forcefully limit vmemmap starting address to MAX_PHYSMEM_BITS

 - Consider the maximum physical address available to a DCSS segment
   (512GB) when memory layout is set up

 - Simplify the virtual memory layout setup by reducing the size of
   identity mapping vs vmemmap overlap

 - Swap vmalloc and Lowcore/Real Memory Copy areas in virtual memory.
   This will allow placing the kernel image next to kernel modules

 - Move everything KASLR related from <asm/setup.h> to <asm/page.h>

 - Put virtual memory layout information into a structure to improve
   code generation

 - Currently __kaslr_offset is the kernel offset in both physical and
   virtual memory spaces. Uncouple these offsets to allow uncoupling of
   the address spaces

 - Currently the identity mapping base address is implicit and is always
   set to zero. Make it explicit by putting into __identity_base
   persistent boot variable and use it in proper context

 - Introduce .amode31 section start and end macros AMODE31_START and
   AMODE31_END

 - Introduce OS_INFO entries that do not reference any data in memory,
   but rather provide only values

 - Store virtual memory layout in OS_INFO. It is read out by
   makedumpfile, crash and other tools

 - Store virtual memory layout in VMCORE_INFO. It is read out by crash
   and other tools when /proc/kcore device is used

 - Create additional PT_LOAD ELF program header that covers kernel image
   only, so that vmcore tools could locate kernel text and data when
   virtual and physical memory spaces are uncoupled

 - Uncouple physical and virtual address spaces

 - Map kernel at fixed location when KASLR mode is disabled. The
   location is defined by CONFIG_KERNEL_IMAGE_BASE kernel configuration
   value.

 - Rework deployment of kernel image for both compressed and
   uncompressed variants as defined by CONFIG_KERNEL_UNCOMPRESSED kernel
   configuration value

 - Move .vmlinux.relocs section in front of the compressed kernel. The
   interim section rescue step is avoided as a result

 - Correct modules thunk offset calculation when branch target is more
   than 2GB away

 - Kernel modules contain their own set of expoline thunks. Now that the
   kernel modules area is less than 4GB away from kernel expoline
   thunks, make modules use kernel expolines. Also make EXPOLINE_EXTERN
   the default if the compiler supports it

 - userfaultfd can insert shared zeropages into processes running VMs,
   but that is not allowed for s390. Fallback to allocating a fresh
   zeroed anonymous folio and insert that instead

 - Re-enable shared zeropages for non-PV and non-skeys KVM guests

 - Rename hex2bitmap() to ap_hex2bitmap() and export it for external use

 - Add ap_config sysfs attribute to provide the means for setting or
   displaying adapters, domains and control domains assigned to a
   vfio-ap mediated device in a single operation

 - Make vfio_ap_mdev_link_queue() ignore duplicate link requests

 - Add write support to ap_config sysfs attribute to allow atomic update
   of a vfio-ap mediated device state

 - Document ap_config sysfs attribute

 - Function os_info_old_init() is expected to be called only from a
   regular kdump kernel. Enable it to be called from a stand-alone dump
   kernel

 - Address gcc -Warray-bounds warning and fix array size in struct
   os_info

 - s390 does not support SMBIOS, so drop unneeded CONFIG_DMI checks

 - Use unwinder instead of __builtin_return_address() with ftrace to
   prevent returning of undefined values

 - Sections .hash and .gnu.hash are only created when CONFIG_PIE_BUILD
   kernel is enabled. Drop these for the case CONFIG_PIE_BUILD is
   disabled

 - Compile kernel with -fPIC and link with -no-pie to allow kpatch
   feature to always succeed and drop the whole CONFIG_PIE_BUILD
   option-enabled code

 - Add missing virt_to_phys() converter for VSIE facility and crypto
   control blocks

* tag 's390-6.10-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (54 commits)
  Revert "s390: Relocate vmlinux ELF data to virtual address space"
  KVM: s390: vsie: Use virt_to_phys for crypto control block
  s390: Relocate vmlinux ELF data to virtual address space
  s390: Compile kernel with -fPIC and link with -no-pie
  s390: vmlinux.lds.S: Drop .hash and .gnu.hash for !CONFIG_PIE_BUILD
  s390/ftrace: Use unwinder instead of __builtin_return_address()
  s390/pci: Drop unneeded reference to CONFIG_DMI
  s390/os_info: Fix array size in struct os_info
  s390/os_info: Initialize old os_info in standalone dump kernel
  docs: Update s390 vfio-ap doc for ap_config sysfs attribute
  s390/vfio-ap: Add write support to sysfs attr ap_config
  s390/vfio-ap: Ignore duplicate link requests in vfio_ap_mdev_link_queue
  s390/vfio-ap: Add sysfs attr, ap_config, to export mdev state
  s390/ap: Externalize AP bus specific bitmap reading function
  s390/mm: Re-enable the shared zeropage for !PV and !skeys KVM guests
  mm/userfaultfd: Do not place zeropages when zeropages are disallowed
  s390/expoline: Make modules use kernel expolines
  s390/nospec: Correct modules thunk offset calculation
  s390/boot: Do not rescue .vmlinux.relocs section
  s390/boot: Rework deployment of the kernel image
  ...
parents a38297e3 1812dc9c
......@@ -4785,7 +4785,9 @@
prot_virt= [S390] enable hosting protected virtual machines
isolated from the hypervisor (if hardware supports
that).
that). If enabled, the default kernel base address
might be overridden even when Kernel Address Space
Layout Randomization is disabled.
Format: <bool>
psi= [KNL] Enable or disable pressure stall information
......
......@@ -8,6 +8,7 @@ s390 Architecture
cds
3270
driver-model
mm
monreader
qeth
s390dbf
......
.. SPDX-License-Identifier: GPL-2.0
=================
Memory Management
=================
Virtual memory layout
=====================
.. note::
- Some aspects of the virtual memory layout setup are not
clarified (number of page levels, alignment, DMA memory).
- Unused gaps in the virtual memory layout could be present
or not - depending on how a particular system is configured.
No page tables are created for the unused gaps.
- The virtual memory regions are tracked or untracked by KASAN
instrumentation, as well as the KASAN shadow memory itself is
created only when CONFIG_KASAN configuration option is enabled.
::
=============================================================================
| Physical | Virtual | VM area description
=============================================================================
+- 0 --------------+- 0 --------------+
| | S390_lowcore | Low-address memory
| +- 8 KB -----------+
| | |
| | |
| | ... unused gap | KASAN untracked
| | |
+- AMODE31_START --+- AMODE31_START --+ .amode31 rand. phys/virt start
|.amode31 text/data|.amode31 text/data| KASAN untracked
+- AMODE31_END ----+- AMODE31_END ----+ .amode31 rand. phys/virt end (<2GB)
| | |
| | |
+- __kaslr_offset_phys | kernel rand. phys start
| | |
| kernel text/data | |
| | |
+------------------+ | kernel phys end
| | |
| | |
| | |
| | |
+- ident_map_size -+ |
| |
| ... unused gap | KASAN untracked
| |
+- __identity_base + identity mapping start (>= 2GB)
| |
| identity | phys == virt - __identity_base
| mapping | virt == phys + __identity_base
| |
| | KASAN tracked
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
+---- vmemmap -----+ 'struct page' array start
| |
| virtually mapped |
| memory map | KASAN untracked
| |
+- __abs_lowcore --+
| |
| Absolute Lowcore | KASAN untracked
| |
+- __memcpy_real_area
| |
| Real Memory Copy| KASAN untracked
| |
+- VMALLOC_START --+ vmalloc area start
| | KASAN untracked or
| vmalloc area | KASAN shallowly populated in case
| | CONFIG_KASAN_VMALLOC=y
+- MODULES_VADDR --+ modules area start
| | KASAN allocated per module or
| modules area | KASAN shallowly populated in case
| | CONFIG_KASAN_VMALLOC=y
+- __kaslr_offset -+ kernel rand. virt start
| | KASAN tracked
| kernel text/data | phys == (kvirt - __kaslr_offset) +
| | __kaslr_offset_phys
+- kernel .bss end + kernel rand. virt end
| |
| ... unused gap | KASAN untracked
| |
+------------------+ UltraVisor Secure Storage limit
| |
| ... unused gap | KASAN untracked
| |
+KASAN_SHADOW_START+ KASAN shadow memory start
| |
| KASAN shadow | KASAN untracked
| |
+------------------+ ASCE limit
......@@ -380,6 +380,36 @@ matrix device.
control_domains:
A read-only file for displaying the control domain numbers assigned to the
vfio_ap mediated device.
ap_config:
A read/write file that, when written to, allows all three of the
vfio_ap mediated device's ap matrix masks to be replaced in one shot.
Three masks are given, one for adapters, one for domains, and one for
control domains. If the given state cannot be set then no changes are
made to the vfio-ap mediated device.
The format of the data written to ap_config is as follows:
{amask},{dmask},{cmask}\n
\n is a newline character.
amask, dmask, and cmask are masks identifying which adapters, domains,
and control domains should be assigned to the mediated device.
The format of a mask is as follows:
0xNN..NN
Where NN..NN is 64 hexadecimal characters representing a 256-bit value.
The leftmost (highest order) bit represents adapter/domain 0.
For an example set of masks that represent your mdev's current
configuration, simply cat ap_config.
Setting an adapter or domain number greater than the maximum allowed for
the system will result in an error.
This attribute is intended to be used by automation. End users would be
better served using the respective assign/unassign attributes for
adapters, domains, and control domains.
* functions:
......@@ -550,7 +580,7 @@ These are the steps:
following Kconfig elements selected:
* IOMMU_SUPPORT
* S390
* ZCRYPT
* AP
* VFIO
* KVM
......
......@@ -17,6 +17,9 @@ config ARCH_HAS_ILOG2_U32
config ARCH_HAS_ILOG2_U64
def_bool n
config ARCH_PROC_KCORE_TEXT
def_bool y
config GENERIC_HWEIGHT
def_bool y
......@@ -552,7 +555,7 @@ config EXPOLINE
If unsure, say N.
config EXPOLINE_EXTERN
def_bool n
def_bool y if EXPOLINE
depends on EXPOLINE
depends on CC_IS_GCC && GCC_VERSION >= 110200
depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC))
......@@ -590,18 +593,6 @@ config RELOCATABLE
Note: this option exists only for documentation purposes, please do
not remove it.
config PIE_BUILD
def_bool CC_IS_CLANG && !$(cc-option,-munaligned-symbols)
help
If the compiler is unable to generate code that can manage unaligned
symbols, the kernel is linked as a position-independent executable
(PIE) and includes dynamic relocations that are processed early
during bootup.
For kpatch functionality, it is recommended to build the kernel
without the PIE_BUILD option. PIE_BUILD is only enabled when the
compiler lacks proper support for handling unaligned symbols.
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image (KASLR)"
default y
......@@ -611,6 +602,25 @@ config RANDOMIZE_BASE
as a security feature that deters exploit attempts relying on
knowledge of the location of kernel internals.
config KERNEL_IMAGE_BASE
hex "Kernel image base address"
range 0x100000 0x1FFFFFE0000000 if !KASAN
range 0x100000 0x1BFFFFE0000000 if KASAN
default 0x3FFE0000000 if !KASAN
default 0x7FFFE0000000 if KASAN
help
This is the address at which the kernel image is loaded in case
Kernel Address Space Layout Randomization (KASLR) is disabled.
In case the Protected virtualization guest support is enabled the
Ultravisor imposes a virtual address limit. If the value of this
option leads to the kernel image exceeding the Ultravisor limit,
this option is ignored and the image is loaded below the limit.
If the value of this option leads to the kernel image overlapping
the virtual memory where other data structures are located, this
option is ignored and the image is loaded above the structures.
endmenu
menu "Memory setup"
......@@ -724,6 +734,33 @@ config EADM_SCH
To compile this driver as a module, choose M here: the
module will be called eadm_sch.
config AP
def_tristate y
prompt "Support for Adjunct Processors (ap)"
help
This driver allows usage of Adjunct Processor (AP) devices via
the ap bus, cards and queues. Supported Adjunct Processors are
the CryptoExpress Cards (CEX).
To compile this driver as a module, choose M here: the
module will be called ap.
If unsure, say Y (default).
config AP_DEBUG
def_bool n
prompt "Enable debug features for Adjunct Processor (ap) devices"
depends on AP
help
Say 'Y' here to enable some additional debug features for Adjunct
Processor (ap) devices.
There will be some more sysfs attributes displayed for ap queues.
Do not enable on production level kernel build.
If unsure, say N.
config VFIO_CCW
def_tristate n
prompt "Support for VFIO-CCW subchannels"
......@@ -740,7 +777,7 @@ config VFIO_AP
prompt "VFIO support for AP devices"
depends on KVM
depends on VFIO
depends on ZCRYPT
depends on AP
select VFIO_MDEV
help
This driver grants access to Adjunct Processor (AP) devices
......
......@@ -14,14 +14,9 @@ KBUILD_AFLAGS_MODULE += -fPIC
KBUILD_CFLAGS_MODULE += -fPIC
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
ifdef CONFIG_PIE_BUILD
KBUILD_CFLAGS += -fPIE
LDFLAGS_vmlinux := -pie -z notext
else
KBUILD_CFLAGS += $(call cc-option,-munaligned-symbols,)
LDFLAGS_vmlinux := --emit-relocs --discard-none
KBUILD_CFLAGS += -fPIC
LDFLAGS_vmlinux := -no-pie --emit-relocs --discard-none
extra_tools := relocs
endif
aflags_dwarf := -Wa,-gdwarf-2
KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
ifndef CONFIG_AS_IS_LLVM
......@@ -88,7 +83,6 @@ endif
ifdef CONFIG_EXPOLINE
ifdef CONFIG_EXPOLINE_EXTERN
KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o
CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern
CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern
else
......@@ -167,11 +161,6 @@ vdso_prepare: prepare0
vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg
vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg
ifdef CONFIG_EXPOLINE_EXTERN
modules_prepare: expoline_prepare
expoline_prepare: scripts
$(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o
endif
endif
# Don't use tabs in echo arguments
......
......@@ -37,8 +37,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
obj-y += version.o pgm_check_info.o ctype.o ipl_data.o
obj-y += $(if $(CONFIG_PIE_BUILD),machine_kexec_reloc.o,relocs.o)
obj-y += version.o pgm_check_info.o ctype.o ipl_data.o relocs.o
obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
......@@ -49,9 +48,7 @@ targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y
targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all)
ifndef CONFIG_PIE_BUILD
targets += relocs.S
endif
OBJECTS := $(addprefix $(obj)/,$(obj-y))
OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all))
......@@ -110,13 +107,11 @@ OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section
$(obj)/vmlinux.bin: vmlinux FORCE
$(call if_changed,objcopy)
ifndef CONFIG_PIE_BUILD
CMD_RELOCS=arch/s390/tools/relocs
quiet_cmd_relocs = RELOCS $@
quiet_cmd_relocs = RELOCS $@
cmd_relocs = $(CMD_RELOCS) $< > $@
$(obj)/relocs.S: vmlinux FORCE
$(call if_changed,relocs)
endif
suffix-$(CONFIG_KERNEL_GZIP) := .gz
suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
......
......@@ -17,7 +17,6 @@ struct machine_info {
};
struct vmlinux_info {
unsigned long default_lma;
unsigned long entry;
unsigned long image_size; /* does not include .bss */
unsigned long bss_size; /* uncompressed image .bss size */
......@@ -25,14 +24,8 @@ struct vmlinux_info {
unsigned long bootdata_size;
unsigned long bootdata_preserved_off;
unsigned long bootdata_preserved_size;
#ifdef CONFIG_PIE_BUILD
unsigned long dynsym_start;
unsigned long rela_dyn_start;
unsigned long rela_dyn_end;
#else
unsigned long got_start;
unsigned long got_end;
#endif
unsigned long amode31_size;
unsigned long init_mm_off;
unsigned long swapper_pg_dir_off;
......@@ -74,10 +67,11 @@ void sclp_early_setup_buffer(void);
void print_pgm_check_info(void);
unsigned long randomize_within_range(unsigned long size, unsigned long align,
unsigned long min, unsigned long max);
void setup_vmem(unsigned long asce_limit);
void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit);
void __printf(1, 2) decompressor_printk(const char *fmt, ...);
void print_stacktrace(unsigned long sp);
void error(char *m);
int get_random(unsigned long limit, unsigned long *value);
extern struct machine_info machine;
......@@ -98,6 +92,10 @@ extern struct vmlinux_info _vmlinux_info;
#define vmlinux _vmlinux_info
#define __abs_lowcore_pa(x) (((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore))
#define __kernel_va(x) ((void *)((unsigned long)(x) - __kaslr_offset_phys + __kaslr_offset))
#define __kernel_pa(x) ((unsigned long)(x) - __kaslr_offset + __kaslr_offset_phys)
#define __identity_va(x) ((void *)((unsigned long)(x) + __identity_base))
#define __identity_pa(x) ((unsigned long)(x) - __identity_base)
static inline bool intersects(unsigned long addr0, unsigned long size0,
unsigned long addr1, unsigned long size1)
......
......@@ -63,24 +63,13 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE;
#include "../../../../lib/decompress_unzstd.c"
#endif
#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE)
unsigned long mem_safe_offset(void)
{
/*
* due to 4MB HEAD_SIZE for bzip2
* 'decompress_offset + vmlinux.image_size' could be larger than
* kernel at final position + its .bss, so take the larger of two
*/
return max(decompress_offset + vmlinux.image_size,
vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size);
return ALIGN(free_mem_end_ptr, PAGE_SIZE);
}
void *decompress_kernel(void)
void deploy_kernel(void *output)
{
void *output = (void *)decompress_offset;
__decompress(_compressed_start, _compressed_end - _compressed_start,
NULL, NULL, output, vmlinux.image_size, NULL, error);
return output;
}
......@@ -2,11 +2,9 @@
#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H
#define BOOT_COMPRESSED_DECOMPRESSOR_H
#ifdef CONFIG_KERNEL_UNCOMPRESSED
static inline void *decompress_kernel(void) { return NULL; }
#else
void *decompress_kernel(void);
#endif
#ifndef CONFIG_KERNEL_UNCOMPRESSED
unsigned long mem_safe_offset(void);
void deploy_kernel(void *output);
#endif
#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */
......@@ -43,7 +43,7 @@ static int check_prng(void)
return PRNG_MODE_TDES;
}
static int get_random(unsigned long limit, unsigned long *value)
int get_random(unsigned long limit, unsigned long *value)
{
struct prng_parm prng = {
/* initial parameter block for tdes mode, copied from libica */
......
......@@ -153,8 +153,10 @@ void print_pgm_check_info(void)
decompressor_printk("Kernel command line: %s\n", early_command_line);
decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n",
S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1);
if (kaslr_enabled())
if (kaslr_enabled()) {
decompressor_printk("Kernel random base: %lx\n", __kaslr_offset);
decompressor_printk("Kernel random base phys: %lx\n", __kaslr_offset_phys);
}
decompressor_printk("PSW : %016lx %016lx (%pS)\n",
S390_lowcore.psw_save_area.mask,
S390_lowcore.psw_save_area.addr,
......
This diff is collapsed.
......@@ -27,6 +27,8 @@ enum populate_mode {
POPULATE_NONE,
POPULATE_DIRECT,
POPULATE_ABS_LOWCORE,
POPULATE_IDENTITY,
POPULATE_KERNEL,
#ifdef CONFIG_KASAN
POPULATE_KASAN_MAP_SHADOW,
POPULATE_KASAN_ZERO_SHADOW,
......@@ -54,7 +56,7 @@ static inline void kasan_populate(unsigned long start, unsigned long end, enum p
pgtable_populate(start, end, mode);
}
static void kasan_populate_shadow(void)
static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end)
{
pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY);
pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY);
......@@ -76,44 +78,20 @@ static void kasan_populate_shadow(void)
__arch_set_page_dat(kasan_early_shadow_pmd, 1UL << CRST_ALLOC_ORDER);
__arch_set_page_dat(kasan_early_shadow_pte, 1);
/*
* Current memory layout:
* +- 0 -------------+ +- shadow start -+
* |1:1 ident mapping| /|1/8 of ident map|
* | | / | |
* +-end of ident map+ / +----------------+
* | ... gap ... | / | kasan |
* | | / | zero page |
* +- vmalloc area -+ / | mapping |
* | vmalloc_size | / | (untracked) |
* +- modules vaddr -+ / +----------------+
* | 2Gb |/ | unmapped | allocated per module
* +- shadow start -+ +----------------+
* | 1/8 addr space | | zero pg mapping| (untracked)
* +- shadow end ----+---------+- shadow end ---+
*
* Current memory layout (KASAN_VMALLOC):
* +- 0 -------------+ +- shadow start -+
* |1:1 ident mapping| /|1/8 of ident map|
* | | / | |
* +-end of ident map+ / +----------------+
* | ... gap ... | / | kasan zero page| (untracked)
* | | / | mapping |
* +- vmalloc area -+ / +----------------+
* | vmalloc_size | / |shallow populate|
* +- modules vaddr -+ / +----------------+
* | 2Gb |/ |shallow populate|
* +- shadow start -+ +----------------+
* | 1/8 addr space | | zero pg mapping| (untracked)
* +- shadow end ----+---------+- shadow end ---+
*/
for_each_physmem_usable_range(i, &start, &end) {
kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW);
if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260)
kasan_populate(memgap_start, start, POPULATE_KASAN_ZERO_SHADOW);
kasan_populate((unsigned long)__identity_va(start),
(unsigned long)__identity_va(end),
POPULATE_KASAN_MAP_SHADOW);
if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) {
kasan_populate((unsigned long)__identity_va(memgap_start),
(unsigned long)__identity_va(start),
POPULATE_KASAN_ZERO_SHADOW);
}
memgap_start = end;
}
kasan_populate(kernel_start, kernel_end, POPULATE_KASAN_MAP_SHADOW);
kasan_populate(0, (unsigned long)__identity_va(0), POPULATE_KASAN_ZERO_SHADOW);
kasan_populate(AMODE31_START, AMODE31_END, POPULATE_KASAN_ZERO_SHADOW);
if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
untracked_end = VMALLOC_START;
/* shallowly populate kasan shadow for vmalloc and modules */
......@@ -122,8 +100,9 @@ static void kasan_populate_shadow(void)
untracked_end = MODULES_VADDR;
}
/* populate kasan shadow for untracked memory */
kasan_populate(ident_map_size, untracked_end, POPULATE_KASAN_ZERO_SHADOW);
kasan_populate(MODULES_END, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW);
kasan_populate((unsigned long)__identity_va(ident_map_size), untracked_end,
POPULATE_KASAN_ZERO_SHADOW);
kasan_populate(kernel_end, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW);
}
static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr,
......@@ -180,7 +159,9 @@ static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode)
}
#else
static inline void kasan_populate_shadow(void) {}
static inline void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end)
{
}
static inline bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr,
unsigned long end, enum populate_mode mode)
......@@ -263,6 +244,10 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m
return addr;
case POPULATE_ABS_LOWCORE:
return __abs_lowcore_pa(addr);
case POPULATE_KERNEL:
return __kernel_pa(addr);
case POPULATE_IDENTITY:
return __identity_pa(addr);
#ifdef CONFIG_KASAN
case POPULATE_KASAN_MAP_SHADOW:
addr = physmem_alloc_top_down(RR_VMEM, size, size);
......@@ -274,15 +259,22 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m
}
}
static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end)
static bool large_allowed(enum populate_mode mode)
{
return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY);
}
static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end,
enum populate_mode mode)
{
return machine.has_edat2 &&
return machine.has_edat2 && large_allowed(mode) &&
IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE;
}
static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end)
static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end,
enum populate_mode mode)
{
return machine.has_edat1 &&
return machine.has_edat1 && large_allowed(mode) &&
IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE;
}
......@@ -322,7 +314,7 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e
if (pmd_none(*pmd)) {
if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode))
continue;
if (can_large_pmd(pmd, addr, next)) {
if (can_large_pmd(pmd, addr, next, mode)) {
entry = __pmd(_pa(addr, _SEGMENT_SIZE, mode));
entry = set_pmd_bit(entry, SEGMENT_KERNEL);
if (!machine.has_nx)
......@@ -355,7 +347,7 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e
if (pud_none(*pud)) {
if (kasan_pud_populate_zero_shadow(pud, addr, next, mode))
continue;
if (can_large_pud(pud, addr, next)) {
if (can_large_pud(pud, addr, next, mode)) {
entry = __pud(_pa(addr, _REGION3_SIZE, mode));
entry = set_pud_bit(entry, REGION3_KERNEL);
if (!machine.has_nx)
......@@ -418,11 +410,12 @@ static void pgtable_populate(unsigned long addr, unsigned long end, enum populat
}
}
void setup_vmem(unsigned long asce_limit)
void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit)
{
unsigned long start, end;
unsigned long asce_type;
unsigned long asce_bits;
pgd_t *init_mm_pgd;
int i;
/*
......@@ -433,6 +426,15 @@ void setup_vmem(unsigned long asce_limit)
for_each_physmem_online_range(i, &start, &end)
__arch_set_page_nodat((void *)start, (end - start) >> PAGE_SHIFT);
/*
* init_mm->pgd contains virtual address of swapper_pg_dir.
* It is unusable at this stage since DAT is yet off. Swap
* it for physical address of swapper_pg_dir and restore
* the virtual address after all page tables are created.
*/
init_mm_pgd = init_mm.pgd;
init_mm.pgd = (pgd_t *)swapper_pg_dir;
if (asce_limit == _REGION1_SIZE) {
asce_type = _REGION2_ENTRY_EMPTY;
asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
......@@ -453,15 +455,20 @@ void setup_vmem(unsigned long asce_limit)
* the lowcore and create the identity mapping only afterwards.
*/
pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT);
for_each_physmem_usable_range(i, &start, &end)
pgtable_populate(start, end, POPULATE_DIRECT);
for_each_physmem_usable_range(i, &start, &end) {
pgtable_populate((unsigned long)__identity_va(start),
(unsigned long)__identity_va(end),
POPULATE_IDENTITY);
}
pgtable_populate(kernel_start, kernel_end, POPULATE_KERNEL);
pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT);
pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore),
POPULATE_ABS_LOWCORE);
pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE,
POPULATE_NONE);
memcpy_real_ptep = __virt_to_kpte(__memcpy_real_area);
memcpy_real_ptep = __identity_va(__virt_to_kpte(__memcpy_real_area));
kasan_populate_shadow();
kasan_populate_shadow(kernel_start, kernel_end);
S390_lowcore.kernel_asce.val = swapper_pg_dir | asce_bits;
S390_lowcore.user_asce = s390_invalid_asce;
......@@ -471,4 +478,5 @@ void setup_vmem(unsigned long asce_limit)
local_ctl_load(13, &S390_lowcore.kernel_asce);
init_mm.context.asce = S390_lowcore.kernel_asce.val;
init_mm.pgd = init_mm_pgd;
}
......@@ -99,8 +99,16 @@ SECTIONS
_decompressor_end = .;
. = ALIGN(4);
.vmlinux.relocs : {
__vmlinux_relocs_64_start = .;
*(.vmlinux.relocs_64)
__vmlinux_relocs_64_end = .;
}
#ifdef CONFIG_KERNEL_UNCOMPRESSED
. = 0x100000;
. = ALIGN(PAGE_SIZE);
. += AMODE31_SIZE; /* .amode31 section */
#else
. = ALIGN(8);
#endif
......@@ -110,24 +118,6 @@ SECTIONS
_compressed_end = .;
}
#ifndef CONFIG_PIE_BUILD
/*
* When the kernel is built with CONFIG_KERNEL_UNCOMPRESSED, the entire
* uncompressed vmlinux.bin is positioned in the bzImage decompressor
* image at the default kernel LMA of 0x100000, enabling it to be
* executed in-place. However, the size of .vmlinux.relocs could be
* large enough to cause an overlap with the uncompressed kernel at the
* address 0x100000. To address this issue, .vmlinux.relocs is
* positioned after the .rodata.compressed.
*/
. = ALIGN(4);
.vmlinux.relocs : {
__vmlinux_relocs_64_start = .;
*(.vmlinux.relocs_64)
__vmlinux_relocs_64_end = .;
}
#endif
#define SB_TRAILER_SIZE 32
/* Trailer needed for Secure Boot */
. += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */
......
......@@ -223,13 +223,18 @@ static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit)
* config info as returned by the ap_qci() function.
*/
struct ap_config_info {
unsigned int apsc : 1; /* S bit */
unsigned int apxa : 1; /* N bit */
unsigned int qact : 1; /* C bit */
unsigned int rc8a : 1; /* R bit */
unsigned int : 4;
unsigned int apsb : 1; /* B bit */
unsigned int : 23;
union {
unsigned int flags;
struct {
unsigned int apsc : 1; /* S bit */
unsigned int apxa : 1; /* N bit */
unsigned int qact : 1; /* C bit */
unsigned int rc8a : 1; /* R bit */
unsigned int : 4;
unsigned int apsb : 1; /* B bit */
unsigned int : 23;
};
};
unsigned char na; /* max # of APs - 1 */
unsigned char nd; /* max # of Domains - 1 */
unsigned char _reserved0[10];
......@@ -544,15 +549,4 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
return reg1.status;
}
/*
* Interface to tell the AP bus code that a configuration
* change has happened. The bus code should at least do
* an ap bus resource rescan.
*/
#if IS_ENABLED(CONFIG_ZCRYPT)
void ap_bus_cfg_chg(void);
#else
static inline void ap_bus_cfg_chg(void){}
#endif
#endif /* _ASM_S390_AP_H_ */
......@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
#include <linux/ftrace.h>
#include <asm/fpu.h>
#include <asm/nospec-branch.h>
#include <asm-generic/asm-prototypes.h>
__int128_t __ashlti3(__int128_t a, int b);
......
......@@ -11,6 +11,9 @@
#include <uapi/asm/chsc.h>
/* struct from linux/notifier.h */
struct notifier_block;
/**
* Operation codes for CHSC PNSO:
* PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport
......@@ -66,4 +69,16 @@ struct chsc_pnso_area {
struct chsc_pnso_naid_l2 entries[];
} __packed __aligned(PAGE_SIZE);
/*
* notifier interface - registered notifiers get called on
* the following events:
* - ap config changed (CHSC_NOTIFY_AP_CFG)
*/
enum chsc_notify_type {
CHSC_NOTIFY_AP_CFG = 3,
};
int chsc_notifier_register(struct notifier_block *nb);
int chsc_notifier_unregister(struct notifier_block *nb);
#endif /* _ASM_S390_CHSC_H */
......@@ -8,6 +8,13 @@
#define _ASM_S390X_DCSS_H
#ifndef __ASSEMBLY__
/*
* A DCSS segment is defined as a contiguous range of pages using the DEFSEG
* command. The range start and end are page numbers with values less than or
* equal to 0x7ffffff (see CP Commands and Utilities Reference).
*/
#define MAX_DCSS_ADDR (512UL * SZ_1G)
/* possible values for segment type as returned by segment_info */
#define SEG_TYPE_SW 0
#define SEG_TYPE_EW 1
......
......@@ -8,12 +8,8 @@
#ifndef __ASSEMBLY__
#ifdef CONFIG_CC_IS_CLANG
/* https://llvm.org/pr41424 */
#define ftrace_return_address(n) 0UL
#else
#define ftrace_return_address(n) __builtin_return_address(n)
#endif
unsigned long return_address(unsigned int n);
#define ftrace_return_address(n) return_address(n)
void ftrace_caller(void);
......
......@@ -146,7 +146,7 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start,
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
int gmap_mark_unmergeable(void);
int s390_disable_cow_sharing(void);
void s390_unlist_old_asce(struct gmap *gmap);
int s390_replace_asce(struct gmap *gmap);
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
......
......@@ -32,6 +32,11 @@ typedef struct {
unsigned int uses_skeys:1;
/* The mmu context uses CMM. */
unsigned int uses_cmm:1;
/*
* The mmu context allows COW-sharing of memory pages (KSM, zeropage).
* Note that COW-sharing during fork() is currently always allowed.
*/
unsigned int allow_cow_sharing:1;
/* The gmaps associated with this context are allowed to use huge pages. */
unsigned int allow_gmap_hpage_1m:1;
} mm_context_t;
......
......@@ -35,6 +35,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.has_pgste = 0;
mm->context.uses_skeys = 0;
mm->context.uses_cmm = 0;
mm->context.allow_cow_sharing = 1;
mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {
......
......@@ -17,6 +17,26 @@ static inline bool nospec_uses_trampoline(void)
return __is_defined(CC_USING_EXPOLINE) && !nospec_disable;
}
#ifdef CONFIG_EXPOLINE_EXTERN
void __s390_indirect_jump_r1(void);
void __s390_indirect_jump_r2(void);
void __s390_indirect_jump_r3(void);
void __s390_indirect_jump_r4(void);
void __s390_indirect_jump_r5(void);
void __s390_indirect_jump_r6(void);
void __s390_indirect_jump_r7(void);
void __s390_indirect_jump_r8(void);
void __s390_indirect_jump_r9(void);
void __s390_indirect_jump_r10(void);
void __s390_indirect_jump_r11(void);
void __s390_indirect_jump_r12(void);
void __s390_indirect_jump_r13(void);
void __s390_indirect_jump_r14(void);
void __s390_indirect_jump_r15(void);
#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_EXPOLINE_H */
......@@ -16,24 +16,25 @@
*/
.macro __THUNK_PROLOG_NAME name
#ifdef CONFIG_EXPOLINE_EXTERN
.pushsection .text,"ax",@progbits
__ALIGN
SYM_CODE_START(\name)
#else
.pushsection .text.\name,"axG",@progbits,\name,comdat
#endif
.globl \name
.hidden \name
.type \name,@function
\name:
CFI_STARTPROC
#endif
.endm
.macro __THUNK_EPILOG_NAME name
CFI_ENDPROC
#ifdef CONFIG_EXPOLINE_EXTERN
.size \name, .-\name
#endif
SYM_CODE_END(\name)
EXPORT_SYMBOL(\name)
#else
CFI_ENDPROC
.popsection
#endif
.endm
.macro __THUNK_PROLOG_BR r1
......
......@@ -17,11 +17,25 @@
#define OS_INFO_VMCOREINFO 0
#define OS_INFO_REIPL_BLOCK 1
#define OS_INFO_FLAGS_ENTRY 2
#define OS_INFO_RESERVED 3
#define OS_INFO_IDENTITY_BASE 4
#define OS_INFO_KASLR_OFFSET 5
#define OS_INFO_KASLR_OFF_PHYS 6
#define OS_INFO_VMEMMAP 7
#define OS_INFO_AMODE31_START 8
#define OS_INFO_AMODE31_END 9
#define OS_INFO_IMAGE_START 10
#define OS_INFO_IMAGE_END 11
#define OS_INFO_IMAGE_PHYS 12
#define OS_INFO_MAX 13
#define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0)
struct os_info_entry {
u64 addr;
union {
u64 addr;
u64 val;
};
u64 size;
u32 csum;
} __packed;
......@@ -33,17 +47,24 @@ struct os_info {
u16 version_minor;
u64 crashkernel_addr;
u64 crashkernel_size;
struct os_info_entry entry[3];
u8 reserved[4004];
struct os_info_entry entry[OS_INFO_MAX];
u8 reserved[3804];
} __packed;
void os_info_init(void);
void os_info_entry_add(int nr, void *ptr, u64 len);
void os_info_entry_add_data(int nr, void *ptr, u64 len);
void os_info_entry_add_val(int nr, u64 val);
void os_info_crashkernel_add(unsigned long base, unsigned long size);
u32 os_info_csum(struct os_info *os_info);
#ifdef CONFIG_CRASH_DUMP
void *os_info_old_entry(int nr, unsigned long *size);
/*
 * Look up entry @nr through os_info_old_entry() and hand the result back
 * as a plain unsigned long. The size reported by os_info_old_entry() is
 * not needed here and is dropped.
 */
static inline unsigned long os_info_old_value(int nr)
{
	unsigned long ignored_size;
	void *entry = os_info_old_entry(nr, &ignored_size);

	return (unsigned long)entry;
}
#else
static inline void *os_info_old_entry(int nr, unsigned long *size)
{
......
......@@ -178,19 +178,52 @@ int arch_make_page_accessible(struct page *page);
#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
#endif
#define __PAGE_OFFSET 0x0UL
#define PAGE_OFFSET 0x0UL
/*
* Kernel address layout parameters. The fields are accessed through the
* __kaslr_offset, __kaslr_offset_phys, __identity_base and ident_map_size
* macros.
*/
struct vm_layout {
/* threshold and base that __pa_nodebug() subtracts for addresses at or above it */
unsigned long kaslr_offset;
/* value __pa_nodebug() adds for addresses at or above kaslr_offset */
unsigned long kaslr_offset_phys;
/* value __pa_nodebug() subtracts for addresses below kaslr_offset */
unsigned long identity_base;
/* presumably the size of the identity mapping -- TODO confirm */
unsigned long identity_size;
};
#define __pa_nodebug(x) ((unsigned long)(x))
extern struct vm_layout vm_layout;
#define __kaslr_offset vm_layout.kaslr_offset
#define __kaslr_offset_phys vm_layout.kaslr_offset_phys
#define __identity_base vm_layout.identity_base
#define ident_map_size vm_layout.identity_size
/* Return the current value of __kaslr_offset (vm_layout.kaslr_offset). */
static inline unsigned long kaslr_offset(void)
{
return __kaslr_offset;
}
extern int __kaslr_enabled;
/*
 * Report __kaslr_enabled, but only for kernels built with
 * CONFIG_RANDOMIZE_BASE; without that option the result is always 0.
 */
static inline int kaslr_enabled(void)
{
	return IS_ENABLED(CONFIG_RANDOMIZE_BASE) ? __kaslr_enabled : 0;
}
#define __PAGE_OFFSET __identity_base
#define PAGE_OFFSET __PAGE_OFFSET
#ifdef __DECOMPRESSOR
#define __pa_nodebug(x) ((unsigned long)(x))
#define __pa(x) __pa_nodebug(x)
#define __pa32(x) __pa(x)
#define __va(x) ((void *)(unsigned long)(x))
#else /* __DECOMPRESSOR */
/*
 * Translate address @x without any debug checks:
 * - addresses at or above __kaslr_offset are rebased from __kaslr_offset
 *   onto __kaslr_offset_phys;
 * - all lower addresses have __identity_base subtracted.
 */
static inline unsigned long __pa_nodebug(unsigned long x)
{
	unsigned long image_off;

	if (x >= __kaslr_offset) {
		image_off = x - __kaslr_offset;
		return image_off + __kaslr_offset_phys;
	}
	return x - __identity_base;
}
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x, bool is_31bit);
......@@ -206,7 +239,7 @@ static inline unsigned long __phys_addr(unsigned long x, bool is_31bit)
#define __pa(x) __phys_addr((unsigned long)(x), false)
#define __pa32(x) __phys_addr((unsigned long)(x), true)
#define __va(x) ((void *)(unsigned long)(x))
#define __va(x) ((void *)((unsigned long)(x) + __identity_base))
#endif /* __DECOMPRESSOR */
......@@ -231,7 +264,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr)
#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr))
#define page_to_virt(page) pfn_to_virt(page_to_pfn(page))
#define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug(kaddr)))
#define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug((unsigned long)(kaddr))))
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
......@@ -240,4 +273,11 @@ static inline unsigned long virt_to_pfn(const void *kaddr)
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
#define AMODE31_SIZE (3 * PAGE_SIZE)
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
#define __START_KERNEL 0x100000
#define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE
#define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE)
#endif /* _S390_PAGE_H */
......@@ -107,6 +107,12 @@ static inline int is_module_addr(void *addr)
return 1;
}
#ifdef CONFIG_RANDOMIZE_BASE
#define KASLR_LEN (1UL << 31)
#else
#define KASLR_LEN 0UL
#endif
/*
* A 64 bit pagetable entry of S390 has following format:
* | PFRA |0IPC| OS |
......@@ -566,10 +572,20 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot)
}
/*
* In the case that a guest uses storage keys
* faults should no longer be backed by zero pages
* As soon as the guest uses storage keys or enables PV, we deduplicate all
* mapped shared zeropages and prevent new shared zeropages from getting
* mapped.
*/
#define mm_forbids_zeropage mm_has_pgste
#define mm_forbids_zeropage mm_forbids_zeropage
/*
 * Return 1 if @mm has COW-sharing disabled (allow_cow_sharing cleared),
 * 0 otherwise. Without CONFIG_PGSTE the result is always 0.
 */
static inline int mm_forbids_zeropage(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	return mm->context.allow_cow_sharing ? 0 : 1;
#else
	return 0;
#endif
}
static inline int mm_uses_skeys(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
......
......@@ -22,7 +22,6 @@ enum reserved_range_type {
RR_DECOMPRESSOR,
RR_INITRD,
RR_VMLINUX,
RR_RELOC,
RR_AMODE31,
RR_IPLREPORT,
RR_CERT_COMP_LIST,
......@@ -170,4 +169,7 @@ static inline unsigned long get_physmem_reserved(enum reserved_range_type type,
return *size;
}
#define AMODE31_START (physmem_info.reserved[RR_AMODE31].start)
#define AMODE31_END (physmem_info.reserved[RR_AMODE31].end)
#endif
......@@ -127,20 +127,6 @@ extern void (*_machine_restart)(char *command);
extern void (*_machine_halt)(void);
extern void (*_machine_power_off)(void);
extern unsigned long __kaslr_offset;
static inline unsigned long kaslr_offset(void)
{
return __kaslr_offset;
}
extern int __kaslr_enabled;
static inline int kaslr_enabled(void)
{
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
return __kaslr_enabled;
return 0;
}
struct oldmem_data {
unsigned long start;
unsigned long size;
......
......@@ -11,6 +11,8 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
# Do not trace early setup code
CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_stacktrace.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_unwind_bc.o = $(CC_FLAGS_FTRACE)
endif
......
......@@ -465,7 +465,11 @@ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt)
ehdr->e_phoff = sizeof(Elf64_Ehdr);
ehdr->e_ehsize = sizeof(Elf64_Ehdr);
ehdr->e_phentsize = sizeof(Elf64_Phdr);
ehdr->e_phnum = mem_chunk_cnt + 1;
/*
* Number of memory chunk PT_LOAD program headers plus one kernel
* image PT_LOAD program header plus one PT_NOTE program header.
*/
ehdr->e_phnum = mem_chunk_cnt + 1 + 1;
return ehdr + 1;
}
......@@ -501,15 +505,16 @@ static int get_mem_chunk_cnt(void)
*/
static void loads_init(Elf64_Phdr *phdr)
{
unsigned long old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE);
phys_addr_t start, end;
u64 idx;
for_each_physmem_range(idx, &oldmem_type, &start, &end) {
phdr->p_filesz = end - start;
phdr->p_type = PT_LOAD;
phdr->p_vaddr = old_identity_base + start;
phdr->p_offset = start;
phdr->p_vaddr = (unsigned long)__va(start);
phdr->p_paddr = start;
phdr->p_filesz = end - start;
phdr->p_memsz = end - start;
phdr->p_flags = PF_R | PF_W | PF_X;
phdr->p_align = PAGE_SIZE;
......@@ -517,6 +522,25 @@ static void loads_init(Elf64_Phdr *phdr)
}
}
/*
 * Fill in the PT_LOAD program header describing the kernel image region.
 * Start, end and physical start of the image are taken from the
 * OS_INFO_IMAGE_* entries via os_info_old_value().
 */
static void text_init(Elf64_Phdr *phdr)
{
	unsigned long image_phys = os_info_old_value(OS_INFO_IMAGE_PHYS);
	unsigned long image_start = os_info_old_value(OS_INFO_IMAGE_START);
	unsigned long image_end = os_info_old_value(OS_INFO_IMAGE_END);
	unsigned long image_size = image_end - image_start;

	phdr->p_type = PT_LOAD;
	phdr->p_flags = PF_R | PF_W | PF_X;
	phdr->p_offset = image_phys;
	phdr->p_vaddr = image_start;
	phdr->p_paddr = image_phys;
	phdr->p_filesz = image_size;
	phdr->p_memsz = image_size;
	phdr->p_align = PAGE_SIZE;
}
/*
* Initialize notes (new kernel)
*/
......@@ -557,6 +581,8 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt)
size += nt_vmcoreinfo_size();
/* nt_final */
size += sizeof(Elf64_Nhdr);
/* PT_LOAD type program header for kernel text region */
size += sizeof(Elf64_Phdr);
/* PT_LOADS */
size += mem_chunk_cnt * sizeof(Elf64_Phdr);
......@@ -568,7 +594,7 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt)
*/
int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
{
Elf64_Phdr *phdr_notes, *phdr_loads;
Elf64_Phdr *phdr_notes, *phdr_loads, *phdr_text;
size_t alloc_size;
int mem_chunk_cnt;
void *ptr, *hdr;
......@@ -606,14 +632,19 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
/* Init program headers */
phdr_notes = ptr;
ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr));
phdr_text = ptr;
ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr));
phdr_loads = ptr;
ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt);
/* Init notes */
hdr_off = PTR_DIFF(ptr, hdr);
ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off);
/* Init kernel text program header */
text_init(phdr_text);
/* Init loads */
hdr_off = PTR_DIFF(ptr, hdr);
loads_init(phdr_loads);
/* Finalize program headers */
hdr_off = PTR_DIFF(ptr, hdr);
*addr = (unsigned long long) hdr;
*size = (unsigned long long) hdr_off;
BUG_ON(elfcorehdr_size > alloc_size);
......
......@@ -1209,8 +1209,8 @@ static struct attribute_group reipl_nss_attr_group = {
void set_os_info_reipl_block(void)
{
os_info_entry_add(OS_INFO_REIPL_BLOCK, reipl_block_actual,
reipl_block_actual->hdr.len);
os_info_entry_add_data(OS_INFO_REIPL_BLOCK, reipl_block_actual,
reipl_block_actual->hdr.len);
}
/* reipl type */
......@@ -1940,7 +1940,7 @@ static void dump_reipl_run(struct shutdown_trigger *trigger)
reipl_type == IPL_TYPE_NSS ||
reipl_type == IPL_TYPE_UNKNOWN)
os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR;
os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags));
os_info_entry_add_data(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags));
csum = (__force unsigned int)cksm(reipl_block_actual, reipl_block_actual->hdr.len, 0);
abs_lc = get_abs_lowcore();
abs_lc->ipib = __pa(reipl_block_actual);
......
......@@ -114,10 +114,10 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end)
type = BRASL_EXPOLINE; /* brasl instruction */
else
continue;
thunk = instr + (*(int *)(instr + 2)) * 2;
thunk = instr + (long)(*(int *)(instr + 2)) * 2;
if (thunk[0] == 0xc6 && thunk[1] == 0x00)
/* exrl %r0,<target-br> */
br = thunk + (*(int *)(thunk + 2)) * 2;
br = thunk + (long)(*(int *)(thunk + 2)) * 2;
else
continue;
if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0)
......
......@@ -15,8 +15,10 @@
#include <asm/checksum.h>
#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
#include <asm/physmem_info.h>
#include <asm/maccess.h>
#include <asm/asm-offsets.h>
#include <asm/ipl.h>
/*
* OS info structure has to be page aligned
......@@ -43,9 +45,9 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size)
}
/*
* Add OS info entry and update checksum
* Add OS info data entry and update checksum
*/
void os_info_entry_add(int nr, void *ptr, u64 size)
void os_info_entry_add_data(int nr, void *ptr, u64 size)
{
os_info.entry[nr].addr = __pa(ptr);
os_info.entry[nr].size = size;
......@@ -53,6 +55,17 @@ void os_info_entry_add(int nr, void *ptr, u64 size)
os_info.csum = os_info_csum(&os_info);
}
/*
 * Store @value directly in OS info entry @nr. The entry's size and
 * checksum are set to zero, and the checksum of the whole OS info
 * structure is recomputed afterwards.
 */
void os_info_entry_add_val(int nr, u64 value)
{
	struct os_info_entry *entry = &os_info.entry[nr];

	entry->val = value;
	entry->size = 0;
	entry->csum = 0;
	os_info.csum = os_info_csum(&os_info);
}
/*
* Initialize OS info structure and set lowcore pointer
*/
......@@ -60,9 +73,19 @@ void __init os_info_init(void)
{
struct lowcore *abs_lc;
BUILD_BUG_ON(sizeof(struct os_info) != PAGE_SIZE);
os_info.version_major = OS_INFO_VERSION_MAJOR;
os_info.version_minor = OS_INFO_VERSION_MINOR;
os_info.magic = OS_INFO_MAGIC;
os_info_entry_add_val(OS_INFO_IDENTITY_BASE, __identity_base);
os_info_entry_add_val(OS_INFO_KASLR_OFFSET, kaslr_offset());
os_info_entry_add_val(OS_INFO_KASLR_OFF_PHYS, __kaslr_offset_phys);
os_info_entry_add_val(OS_INFO_VMEMMAP, (unsigned long)vmemmap);
os_info_entry_add_val(OS_INFO_AMODE31_START, AMODE31_START);
os_info_entry_add_val(OS_INFO_AMODE31_END, AMODE31_END);
os_info_entry_add_val(OS_INFO_IMAGE_START, (unsigned long)_stext);
os_info_entry_add_val(OS_INFO_IMAGE_END, (unsigned long)_end);
os_info_entry_add_val(OS_INFO_IMAGE_PHYS, __pa_symbol(_stext));
os_info.csum = os_info_csum(&os_info);
abs_lc = get_abs_lowcore();
abs_lc->os_info = __pa(&os_info);
......@@ -125,7 +148,7 @@ static void os_info_old_init(void)
if (os_info_init)
return;
if (!oldmem_data.start)
if (!oldmem_data.start && !is_ipl_type_dump())
goto fail;
if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr)))
goto fail;
......
......@@ -428,7 +428,7 @@ static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset)
case CPUMF_CTR_SET_CRYPTO:
if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5)
ctrset_size = 16;
else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7)
else if (cpumf_ctr_info.csvn >= 6)
ctrset_size = 20;
break;
case CPUMF_CTR_SET_EXT:
......
......@@ -855,16 +855,11 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
}
/* Determine version specific crypto set */
switch (ci.csvn) {
case 1 ... 5:
csvn = none;
if (ci.csvn >= 1 && ci.csvn <= 5)
csvn = cpumcf_svn_12345_pmu_event_attr;
break;
case 6 ... 7:
else if (ci.csvn >= 6)
csvn = cpumcf_svn_67_pmu_event_attr;
break;
default:
csvn = none;
}
/* Determine model-specific counter set(s) */
get_cpu_id(&cpu_id);
......
......@@ -146,10 +146,10 @@ static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31;
static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31;
unsigned long __bootdata_preserved(max_mappable);
unsigned long __bootdata(ident_map_size);
struct physmem_info __bootdata(physmem_info);
unsigned long __bootdata_preserved(__kaslr_offset);
struct vm_layout __bootdata_preserved(vm_layout);
EXPORT_SYMBOL_GPL(vm_layout);
int __bootdata_preserved(__kaslr_enabled);
unsigned int __bootdata_preserved(zlib_dfltcc_support);
EXPORT_SYMBOL(zlib_dfltcc_support);
......@@ -765,7 +765,7 @@ static void __init relocate_amode31_section(void)
unsigned long amode31_size = __eamode31 - __samode31;
long amode31_offset, *ptr;
amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31;
amode31_offset = AMODE31_START - (unsigned long)__samode31;
pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size);
/* Move original AMODE31 section to the new one */
......
......@@ -101,3 +101,22 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
}
pagefault_enable();
}
/*
 * Walk the stack frames with the unwinder and return the return address
 * found @n frames above the caller of this function. Returns 0 if the
 * unwinder yields a zero address or runs out of frames first.
 */
unsigned long return_address(unsigned int n)
{
	struct unwind_state state;
	unsigned long ip;

	/* One extra frame is skipped to step over the current stack entry */
	n += 1;

	unwind_for_each_frame(&state, NULL, NULL, 0) {
		ip = unwind_get_return_address(&state);
		if (ip == 0)
			break;
		if (n-- == 0)
			return ip;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(return_address);
......@@ -21,6 +21,7 @@
/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
int __bootdata_preserved(prot_virt_guest);
EXPORT_SYMBOL(prot_virt_guest);
#endif
/*
......@@ -181,36 +182,36 @@ int uv_convert_owned_from_secure(unsigned long paddr)
}
/*
* Calculate the expected ref_count for a page that would otherwise have no
* Calculate the expected ref_count for a folio that would otherwise have no
* further pins. This was cribbed from similar functions in other places in
* the kernel, but with some slight modifications. We know that a secure
* page can not be a huge page for example.
* folio can not be a large folio, for example.
*/
static int expected_page_refs(struct page *page)
static int expected_folio_refs(struct folio *folio)
{
int res;
res = page_mapcount(page);
if (PageSwapCache(page)) {
res = folio_mapcount(folio);
if (folio_test_swapcache(folio)) {
res++;
} else if (page_mapping(page)) {
} else if (folio_mapping(folio)) {
res++;
if (page_has_private(page))
if (folio->private)
res++;
}
return res;
}
static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
{
int expected, cc = 0;
if (PageWriteback(page))
if (folio_test_writeback(folio))
return -EAGAIN;
expected = expected_page_refs(page);
if (!page_ref_freeze(page, expected))
expected = expected_folio_refs(folio);
if (!folio_ref_freeze(folio, expected))
return -EBUSY;
set_bit(PG_arch_1, &page->flags);
set_bit(PG_arch_1, &folio->flags);
/*
* If the UVC does not succeed or fail immediately, we don't want to
* loop for long, or we might get stall notifications.
......@@ -220,9 +221,9 @@ static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
* -EAGAIN and we let the callers deal with it.
*/
cc = __uv_call(0, (u64)uvcb);
page_ref_unfreeze(page, expected);
folio_ref_unfreeze(folio, expected);
/*
* Return -ENXIO if the page was not mapped, -EINVAL for other errors.
* Return -ENXIO if the folio was not mapped, -EINVAL for other errors.
* If busy or partially completed, return -EAGAIN.
*/
if (cc == UVC_CC_OK)
......@@ -277,7 +278,7 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
bool local_drain = false;
spinlock_t *ptelock;
unsigned long uaddr;
struct page *page;
struct folio *folio;
pte_t *ptep;
int rc;
......@@ -306,15 +307,19 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
if (!ptep)
goto out;
if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
page = pte_page(*ptep);
folio = page_folio(pte_page(*ptep));
rc = -EINVAL;
if (folio_test_large(folio))
goto unlock;
rc = -EAGAIN;
if (trylock_page(page)) {
if (folio_trylock(folio)) {
if (should_export_before_import(uvcb, gmap->mm))
uv_convert_from_secure(page_to_phys(page));
rc = make_page_secure(page, uvcb);
unlock_page(page);
uv_convert_from_secure(PFN_PHYS(folio_pfn(folio)));
rc = make_folio_secure(folio, uvcb);
folio_unlock(folio);
}
}
unlock:
pte_unmap_unlock(ptep, ptelock);
out:
mmap_read_unlock(gmap->mm);
......@@ -324,10 +329,10 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
* If we are here because the UVC returned busy or partial
* completion, this is just a useless check, but it is safe.
*/
wait_on_page_writeback(page);
folio_wait_writeback(folio);
} else if (rc == -EBUSY) {
/*
* If we have tried a local drain and the page refcount
* If we have tried a local drain and the folio refcount
* still does not match our expected safe value, try with a
* system wide drain. This is needed if the pagevecs holding
* the page are on a different CPU.
......@@ -338,7 +343,7 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
return -EAGAIN;
}
/*
* We are here if the page refcount does not match the
* We are here if the folio refcount does not match the
* expected safe value. The main culprits are usually
* pagevecs. With lru_add_drain() we drain the pagevecs
* on the local CPU so that hopefully the refcount will
......
......@@ -14,7 +14,9 @@ void arch_crash_save_vmcoreinfo(void)
VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31);
vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31);
vmcoreinfo_append_str("IDENTITYBASE=%lx\n", __identity_base);
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
vmcoreinfo_append_str("KERNELOFFPHYS=%lx\n", __kaslr_offset_phys);
abs_lc = get_abs_lowcore();
abs_lc->vmcore_info = paddr_vmcoreinfo_note();
put_abs_lowcore(abs_lc);
......
......@@ -39,7 +39,7 @@ PHDRS {
SECTIONS
{
. = 0x100000;
. = __START_KERNEL;
.text : {
_stext = .; /* Start of text section */
_text = .; /* Text and read-only data */
......@@ -183,7 +183,7 @@ SECTIONS
.amode31.data : {
*(.amode31.data)
}
. = ALIGN(PAGE_SIZE);
. = _samode31 + AMODE31_SIZE;
_eamode31 = .;
/* early.c uses stsi, which requires page aligned data. */
......@@ -192,31 +192,6 @@ SECTIONS
PERCPU_SECTION(0x100)
#ifdef CONFIG_PIE_BUILD
.dynsym ALIGN(8) : {
__dynsym_start = .;
*(.dynsym)
__dynsym_end = .;
}
.rela.dyn ALIGN(8) : {
__rela_dyn_start = .;
*(.rela*)
__rela_dyn_end = .;
}
.dynamic ALIGN(8) : {
*(.dynamic)
}
.dynstr ALIGN(8) : {
*(.dynstr)
}
#endif
.hash ALIGN(8) : {
*(.hash)
}
.gnu.hash ALIGN(8) : {
*(.gnu.hash)
}
. = ALIGN(PAGE_SIZE);
__init_end = .; /* freed after init ends here */
......@@ -230,7 +205,6 @@ SECTIONS
* it should match struct vmlinux_info
*/
.vmlinux.info 0 (INFO) : {
QUAD(_stext) /* default_lma */
QUAD(startup_continue) /* entry */
QUAD(__bss_start - _stext) /* image_size */
QUAD(__bss_stop - __bss_start) /* bss_size */
......@@ -239,14 +213,8 @@ SECTIONS
QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */
QUAD(__boot_data_preserved_end -
__boot_data_preserved_start) /* bootdata_preserved_size */
#ifdef CONFIG_PIE_BUILD
QUAD(__dynsym_start) /* dynsym_start */
QUAD(__rela_dyn_start) /* rela_dyn_start */
QUAD(__rela_dyn_end) /* rela_dyn_end */
#else
QUAD(__got_start) /* got_start */
QUAD(__got_end) /* got_end */
#endif
QUAD(_eamode31 - _samode31) /* amode31_size */
QUAD(init_mm)
QUAD(swapper_pg_dir)
......@@ -282,12 +250,10 @@ SECTIONS
*(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt)
}
ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
#ifndef CONFIG_PIE_BUILD
.rela.dyn : {
*(.rela.*) *(.rela_*)
}
ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
#endif
/* Sections to be discarded */
DISCARDS
......
......@@ -2631,9 +2631,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (r)
break;
mmap_write_lock(current->mm);
r = gmap_mark_unmergeable();
mmap_write_unlock(current->mm);
r = s390_disable_cow_sharing();
if (r)
break;
......
......@@ -12,6 +12,7 @@
#include <linux/list.h>
#include <linux/bitmap.h>
#include <linux/sched/signal.h>
#include <linux/io.h>
#include <asm/gmap.h>
#include <asm/mmu_context.h>
......@@ -361,7 +362,7 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
case -EACCES:
return set_validity_icpt(scb_s, 0x003CU);
}
scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2;
scb_s->crycbd = (u32)virt_to_phys(&vsie_page->crycb) | CRYCB_FORMAT2;
return 0;
}
......@@ -1005,7 +1006,7 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (read_guest_real(vcpu, fac, &vsie_page->fac,
stfle_size() * sizeof(u64)))
return set_validity_icpt(scb_s, 0x1090U);
scb_s->fac = (__u32)(__u64) &vsie_page->fac;
scb_s->fac = (u32)virt_to_phys(&vsie_page->fac);
}
return 0;
}
......
......@@ -23,4 +23,4 @@ obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/
obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o
# SPDX-License-Identifier: GPL-2.0
obj-y += expoline.o
......@@ -2549,41 +2549,6 @@ static inline void thp_split_mm(struct mm_struct *mm)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* Remove all empty zero pages from the mapping for lazy refaulting
* - This must be called after mm->context.has_pgste is set, to avoid
* future creation of zero pages
* - This must be called after THP was disabled.
*
* mm contracts with s390, that even if mm were to remove a page table,
* racing with the loop below and so causing pte_offset_map_lock() to fail,
* it will never insert a page table containing empty zero pages once
* mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set.
*/
static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
{
unsigned long addr;
for (addr = start; addr != end; addr += PAGE_SIZE) {
pte_t *ptep;
spinlock_t *ptl;
ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
if (!ptep)
break;
if (is_zero_pfn(pte_pfn(*ptep)))
ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
pte_unmap_unlock(ptep, ptl);
}
return 0;
}
static const struct mm_walk_ops zap_zero_walk_ops = {
.pmd_entry = __zap_zero_pages,
.walk_lock = PGWALK_WRLOCK,
};
/*
* switch on pgstes for its userspace process (for kvm)
*/
......@@ -2601,22 +2566,142 @@ int s390_enable_sie(void)
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
int gmap_mark_unmergeable(void)
/*
 * pte_entry callback: stop the walk at the first PTE that maps a zero pfn.
 *
 * Returns 0 to continue the walk, 1 after storing @addr through
 * walk->private when a zeropage was found, or -EFAULT when the zeropage
 * sits in a mapping that is not a COW mapping.
 */
static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long end, struct mm_walk *walk)
{
	unsigned long *found_addr = walk->private;

	if (!is_zero_pfn(pte_pfn(*pte)))
		return 0;

	/*
	 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
	 * right thing and likely don't care: FAULT_FLAG_UNSHARE
	 * currently only works in COW mappings, which is also where
	 * mm_forbids_zeropage() is checked.
	 */
	if (!is_cow_mapping(walk->vma->vm_flags))
		return -EFAULT;

	/* Report the address of the zeropage and terminate the walk. */
	*found_addr = addr;
	return 1;
}
/*
* Page-walk operations passed to walk_page_range_vma() by
* __s390_unshare_zeropages(); each PTE is handed to
* find_zeropage_pte_entry().
*/
static const struct mm_walk_ops find_zeropage_ops = {
.pte_entry = find_zeropage_pte_entry,
.walk_lock = PGWALK_WRLOCK,
};
/*
* Unshare all shared zeropages, replacing them by anonymous pages. Note that
* we cannot simply zap all shared zeropages, because this could later
* trigger unexpected userfaultfd missing events.
*
* This must be called after mm->context.allow_cow_sharing was
* set to 0, to avoid future mappings of shared zeropages.
*
* mm contracts with s390, that even if mm were to remove a page table,
* and racing with walk_page_range_vma() calling pte_offset_map_lock()
* would fail, it will never insert a page table containing empty zero
* pages once mm_forbids_zeropage(mm) i.e.
* mm->context.allow_cow_sharing is set to 0.
*
* Returns 0 once no walk reports a zeropage anymore, a negative value
* returned by the page walk (e.g. -EFAULT from find_zeropage_pte_entry()),
* or -ENOMEM if handle_mm_fault() reports VM_FAULT_OOM.
*/
static int __s390_unshare_zeropages(struct mm_struct *mm)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
unsigned long addr;
vm_fault_t fault;
int rc;
for_each_vma(vmi, vma) {
/*
* We could only look at COW mappings, but it's more future
* proof to catch unexpected zeropages in other mappings and
* fail.
*/
if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
continue;
addr = vma->vm_start;
retry:
/* Search [addr, vm_end) for the next zeropage; rc == 1 means found. */
rc = walk_page_range_vma(vma, addr, vma->vm_end,
&find_zeropage_ops, &addr);
if (rc < 0)
return rc;
else if (!rc)
continue;
/* addr was updated by find_zeropage_pte_entry() */
fault = handle_mm_fault(vma, addr,
FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
NULL);
if (fault & VM_FAULT_OOM)
return -ENOMEM;
/*
* See break_ksm(): even after handle_mm_fault() returned 0, we
* must start the lookup from the current address, because
* handle_mm_fault() may back out if there's any difficulty.
*
* VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
* maybe they could trigger in the future on concurrent
* truncation. In that case, the shared zeropage would be gone
* and we can simply retry and make progress.
*/
cond_resched();
goto retry;
}
return 0;
}
/*
 * Disable COW-sharing for @mm: clear mm->context.allow_cow_sharing, unshare
 * all mapped shared zeropages and then disable KSM.
 *
 * s390_disable_cow_sharing() calls this with the mmap write lock of @mm
 * held.
 *
 * Returns 0 on success or if COW-sharing was already disabled. If either
 * step fails, allow_cow_sharing is set back to 1 and the error code of the
 * failing step is returned.
 */
static int __s390_disable_cow_sharing(struct mm_struct *mm)
{
	int rc;

	if (!mm->context.allow_cow_sharing)
		return 0;

	/* Must be cleared first so that no new shared zeropages get mapped. */
	mm->context.allow_cow_sharing = 0;

	/* Replace all shared zeropages by anonymous pages. */
	rc = __s390_unshare_zeropages(mm);
	/*
	 * Make sure to disable KSM (if enabled for the whole process or
	 * individual VMAs). Note that nothing currently hinders user space
	 * from re-enabling it.
	 */
	if (!rc)
		rc = ksm_disable(mm);
	/* Roll back on failure so a later call retries the whole sequence. */
	if (rc)
		mm->context.allow_cow_sharing = 1;
	return rc;
}
/*
 * Disable most COW-sharing of memory pages for the whole process:
 * (1) Disable KSM and unmerge/unshare any KSM pages.
 * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
 *
 * Note that we currently don't bother with COW-shared pages that are shared
 * with parent/child processes due to fork().
 *
 * Operates on current->mm with its mmap lock held for writing and returns
 * the result of __s390_disable_cow_sharing().
 */
int s390_disable_cow_sharing(void)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_write_lock(mm);
	ret = __s390_disable_cow_sharing(mm);
	mmap_write_unlock(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
/*
* Enable storage key handling from now on and initialize the storage
......@@ -2685,7 +2770,7 @@ int s390_enable_skey(void)
goto out_up;
mm->context.uses_skeys = 1;
rc = gmap_mark_unmergeable();
rc = __s390_disable_cow_sharing(mm);
if (rc) {
mm->context.uses_skeys = 0;
goto out_up;
......
......@@ -13,7 +13,9 @@
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
#include <asm/abs_lowcore.h>
#include <asm/cacheflush.h>
#include <asm/maccess.h>
#include <asm/nospec-branch.h>
#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
......@@ -21,6 +23,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
#include <asm/physmem_info.h>
static DEFINE_MUTEX(vmem_mutex);
......@@ -436,7 +439,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
return -EINVAL;
/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
if (WARN_ON_ONCE(end > VMALLOC_START))
if (WARN_ON_ONCE(end > __abs_lowcore))
return -EINVAL;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
......
......@@ -172,7 +172,6 @@ static ssize_t uid_is_unique_show(struct device *dev,
}
static DEVICE_ATTR_RO(uid_is_unique);
#ifndef CONFIG_DMI
/* analogous to smbios index */
static ssize_t index_show(struct device *dev,
struct device_attribute *attr, char *buf)
......@@ -202,7 +201,6 @@ static struct attribute_group zpci_ident_attr_group = {
.attrs = zpci_ident_attrs,
.is_visible = zpci_index_is_visible,
};
#endif
static struct bin_attribute *zpci_bin_attrs[] = {
&bin_attr_util_string,
......@@ -245,8 +243,6 @@ static struct attribute_group pfip_attr_group = {
/*
 * NULL-terminated list of attribute groups. zpci_ident_attr_group is only
 * part of the list when CONFIG_DMI is not set.
 */
const struct attribute_group *zpci_attr_groups[] = {
&zpci_attr_group,
&pfip_attr_group,
#ifndef CONFIG_DMI
&zpci_ident_attr_group,
#endif
NULL,
};
......@@ -280,7 +280,7 @@ static int do_reloc(struct section *sec, Elf_Rel *rel)
case R_390_GOTOFF64:
break;
case R_390_64:
add_reloc(&relocs64, offset);
add_reloc(&relocs64, offset - ehdr.e_entry);
break;
default:
die("Unsupported relocation type: %d\n", r_type);
......
......@@ -67,6 +67,7 @@ config CRYPTO_DEV_GEODE
config ZCRYPT
tristate "Support for s390 cryptographic adapters"
depends on S390
depends on AP
select HW_RANDOM
help
Select this option if you want to enable support for
......@@ -74,23 +75,6 @@ config ZCRYPT
to 8 in Coprocessor (CEXxC), EP11 Coprocessor (CEXxP)
or Accelerator (CEXxA) mode.
config ZCRYPT_DEBUG
bool "Enable debug features for s390 cryptographic adapters"
default n
depends on DEBUG_KERNEL
depends on ZCRYPT
help
Say 'Y' here to enable some additional debug features on the
s390 cryptographic adapters driver.
There will be some more sysfs attributes displayed for ap cards
and queues and some flags on crypto requests are interpreted as
debugging messages to force error injection.
Do not enable on production level kernel build.
If unsure, say N.
config PKEY
tristate "Kernel API for protected key handling"
depends on S390
......
......@@ -32,7 +32,7 @@ obj-$(CONFIG_SCLP_VT220_TTY) += sclp_vt220.o
obj-$(CONFIG_PCI) += sclp_pci.o
obj-$(subst m,y,$(CONFIG_ZCRYPT)) += sclp_ap.o
obj-$(subst m,y,$(CONFIG_AP)) += sclp_ap.o
obj-$(CONFIG_VMLOGRDR) += vmlogrdr.o
obj-$(CONFIG_VMCP) += vmcp.o
......
......@@ -127,10 +127,9 @@ static int s390_vary_chpid(struct chp_id chpid, int on)
/*
* Channel measurement related functions
*/
static ssize_t chp_measurement_chars_read(struct file *filp,
struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
static ssize_t measurement_chars_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
struct channel_path *chp;
struct device *device;
......@@ -143,87 +142,79 @@ static ssize_t chp_measurement_chars_read(struct file *filp,
return memory_read_from_buffer(buf, count, &off, &chp->cmg_chars,
sizeof(chp->cmg_chars));
}
static BIN_ATTR_ADMIN_RO(measurement_chars, sizeof(struct cmg_chars));
static const struct bin_attribute chp_measurement_chars_attr = {
.attr = {
.name = "measurement_chars",
.mode = S_IRUSR,
},
.size = sizeof(struct cmg_chars),
.read = chp_measurement_chars_read,
};
static void chp_measurement_copy_block(struct cmg_entry *buf,
struct channel_subsystem *css,
struct chp_id chpid)
{
void *area;
struct cmg_entry *entry, reference_buf;
int idx;
if (chpid.id < 128) {
area = css->cub_addr1;
idx = chpid.id;
} else {
area = css->cub_addr2;
idx = chpid.id - 128;
}
entry = area + (idx * sizeof(struct cmg_entry));
do {
memcpy(buf, entry, sizeof(*entry));
memcpy(&reference_buf, entry, sizeof(*entry));
} while (reference_buf.values[0] != buf->values[0]);
}
static ssize_t chp_measurement_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
/*
 * Copy the measurement entry of the channel path behind @kobj to @buf.
 *
 * @extended selects the source: the extended entry from css->ecub[] when
 * true, the basic entry from css->cub[] otherwise. Only a single read of one
 * whole entry starting at offset 0 is supported.
 *
 * Returns the entry size in bytes, or 0 if @off is non-zero, @count is
 * smaller than one entry, or extended data was requested while chp->extended
 * is not set.
 */
static ssize_t chp_measurement_copy_block(void *buf, loff_t off, size_t count,
					  struct kobject *kobj, bool extended)
{
	struct channel_path *chp;
	struct channel_subsystem *css;
	struct device *device;
	unsigned int size;
	void *area, *entry;
	int id, idx;

	device = kobj_to_dev(kobj);
	chp = to_channelpath(device);
	css = to_css(chp->dev.parent);
	id = chp->chpid.id;

	if (extended) {
		/* Check if extended measurement data is available. */
		if (!chp->extended)
			return 0;

		size = sizeof(struct cmg_ext_entry);
		area = css->ecub[id / CSS_ECUES_PER_PAGE];
		idx = id % CSS_ECUES_PER_PAGE;
	} else {
		size = sizeof(struct cmg_entry);
		area = css->cub[id / CSS_CUES_PER_PAGE];
		idx = id % CSS_CUES_PER_PAGE;
	}
	entry = area + (idx * size);

	/* Only allow single reads. */
	if (off || count < size)
		return 0;

	memcpy(buf, entry, size);

	return size;
}
static const struct bin_attribute chp_measurement_attr = {
.attr = {
.name = "measurement",
.mode = S_IRUSR,
},
.size = sizeof(struct cmg_entry),
.read = chp_measurement_read,
/*
 * Binary attribute read handler: hands the request to
 * chp_measurement_copy_block() with extended == false, i.e. asks for the
 * basic measurement entry.
 */
static ssize_t measurement_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
return chp_measurement_copy_block(buf, off, count, kobj, false);
}
static BIN_ATTR_ADMIN_RO(measurement, sizeof(struct cmg_entry));
/*
 * Binary attribute read handler: hands the request to
 * chp_measurement_copy_block() with extended == true, i.e. asks for the
 * extended measurement entry.
 */
static ssize_t ext_measurement_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
return chp_measurement_copy_block(buf, off, count, kobj, true);
}
static BIN_ATTR_ADMIN_RO(ext_measurement, sizeof(struct cmg_ext_entry));
/* NULL-terminated list of the measurement-related binary attributes. */
static struct bin_attribute *measurement_attrs[] = {
&bin_attr_measurement_chars,
&bin_attr_measurement,
&bin_attr_ext_measurement,
NULL,
};
BIN_ATTRIBUTE_GROUPS(measurement);
/*
 * Remove the measurement attribute groups (measurement_groups) from the
 * device of channel path @chp.
 */
void chp_remove_cmg_attr(struct channel_path *chp)
{
	device_remove_groups(&chp->dev, measurement_groups);
}
/*
 * Add the measurement attribute groups (measurement_groups) to the device of
 * channel path @chp.
 *
 * Returns the result of device_add_groups().
 */
int chp_add_cmg_attr(struct channel_path *chp)
{
	return device_add_groups(&chp->dev, measurement_groups);
}
/*
......@@ -401,6 +392,35 @@ static ssize_t chp_esc_show(struct device *dev,
}
static DEVICE_ATTR(esc, 0444, chp_esc_show, NULL);
/*
 * Divide *value by @base for as long as it is at least @base and an exact
 * multiple of it, at most four times. Returns the suffix character matching
 * the number of divisions ('K', 'M', 'G' or 'T'), or 0 if *value was left
 * unchanged.
 */
static char apply_max_suffix(unsigned long *value, unsigned long base)
{
	static char suffixes[] = { 0, 'K', 'M', 'G', 'T' };
	int step = 0;

	while (step < ARRAY_SIZE(suffixes) - 1 &&
	       *value >= base && *value % base == 0) {
		*value /= base;
		step++;
	}

	return suffixes[step];
}
/*
 * sysfs show handler: emit the channel path's speed, scaled by powers of
 * 1000 via apply_max_suffix() and followed by the resulting suffix
 * character if there is one.
 */
static ssize_t speed_bps_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct channel_path *chp = to_channelpath(dev);
	unsigned long bps = chp->speed;
	char unit = apply_max_suffix(&bps, 1000);

	if (!unit)
		return sysfs_emit(buf, "%lu\n", bps);

	return sysfs_emit(buf, "%lu%c\n", bps, unit);
}
static DEVICE_ATTR_RO(speed_bps);
static ssize_t util_string_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
loff_t off, size_t count)
......@@ -432,6 +452,7 @@ static struct attribute *chp_attrs[] = {
&dev_attr_chid.attr,
&dev_attr_chid_external.attr,
&dev_attr_esc.attr,
&dev_attr_speed_bps.attr,
NULL,
};
static struct attribute_group chp_attr_group = {
......
......@@ -51,6 +51,8 @@ struct channel_path {
/* Channel-measurement related stuff: */
int cmg;
int shared;
int extended;
unsigned long speed;
struct cmg_chars cmg_chars;
};
......
......@@ -24,7 +24,6 @@
#include <asm/crw.h>
#include <asm/isc.h>
#include <asm/ebcdic.h>
#include <asm/ap.h>
#include "css.h"
#include "cio.h"
......@@ -40,6 +39,20 @@ static DEFINE_SPINLOCK(chsc_page_lock);
#define SEI_VF_FLA 0xc0 /* VF flag for Full Link Address */
#define SEI_RS_CHPID 0x4 /* 4 in RS field indicates CHPID */
static BLOCKING_NOTIFIER_HEAD(chsc_notifiers);
/**
 * chsc_notifier_register() - add a notifier block to the chsc notifier chain
 * @nb: notifier block to add to chsc_notifiers
 *
 * Return: the result of blocking_notifier_chain_register().
 */
int chsc_notifier_register(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&chsc_notifiers, nb);
}
EXPORT_SYMBOL(chsc_notifier_register);
/**
 * chsc_notifier_unregister() - remove a notifier block from the chsc
 *				notifier chain
 * @nb: notifier block to remove from chsc_notifiers
 *
 * Return: the result of blocking_notifier_chain_unregister().
 */
int chsc_notifier_unregister(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&chsc_notifiers, nb);
}
EXPORT_SYMBOL(chsc_notifier_unregister);
/**
* chsc_error_from_response() - convert a chsc response to an error
* @response: chsc response code
......@@ -581,7 +594,8 @@ static void chsc_process_sei_ap_cfg_chg(struct chsc_sei_nt0_area *sei_area)
if (sei_area->rs != 5)
return;
ap_bus_cfg_chg();
blocking_notifier_call_chain(&chsc_notifiers,
CHSC_NOTIFY_AP_CFG, NULL);
}
static void chsc_process_sei_fces_event(struct chsc_sei_nt0_area *sei_area)
......@@ -857,22 +871,22 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable)
struct {
struct chsc_header request;
u32 operation_code : 2;
u32 : 30;
u32 : 1;
u32 e : 1;
u32 : 28;
u32 key : 4;
u32 : 28;
u32 zeroes1;
dma32_t cub_addr1;
u32 zeroes2;
dma32_t cub_addr2;
u32 reserved[13];
dma64_t cub[CSS_NUM_CUB_PAGES];
dma64_t ecub[CSS_NUM_ECUB_PAGES];
u32 reserved[5];
struct chsc_header response;
u32 status : 8;
u32 : 4;
u32 fmt : 4;
u32 : 16;
} *secm_area;
} __packed *secm_area;
unsigned long flags;
int ret, ccode;
int ret, ccode, i;
spin_lock_irqsave(&chsc_page_lock, flags);
memset(chsc_page, 0, PAGE_SIZE);
......@@ -881,8 +895,12 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable)
secm_area->request.code = 0x0016;
secm_area->key = PAGE_DEFAULT_KEY >> 4;
secm_area->cub_addr1 = virt_to_dma32(css->cub_addr1);
secm_area->cub_addr2 = virt_to_dma32(css->cub_addr2);
secm_area->e = 1;
for (i = 0; i < CSS_NUM_CUB_PAGES; i++)
secm_area->cub[i] = (__force dma64_t)virt_to_dma32(css->cub[i]);
for (i = 0; i < CSS_NUM_ECUB_PAGES; i++)
secm_area->ecub[i] = virt_to_dma64(css->ecub[i]);
secm_area->operation_code = enable ? 0 : 1;
......@@ -908,19 +926,47 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable)
return ret;
}
/*
 * Allocate one zeroed page for every css->cub[] slot (GFP_DMA) and every
 * css->ecub[] slot.
 *
 * Returns 0 on success or -ENOMEM on the first failed allocation. Pages
 * obtained before the failure are left in place and are not freed here.
 */
static int cub_alloc(struct channel_subsystem *css)
{
	void *page;
	int n;

	for (n = 0; n < CSS_NUM_CUB_PAGES; n++) {
		page = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
		css->cub[n] = page;
		if (!page)
			return -ENOMEM;
	}

	for (n = 0; n < CSS_NUM_ECUB_PAGES; n++) {
		page = (void *)get_zeroed_page(GFP_KERNEL);
		css->ecub[n] = page;
		if (!page)
			return -ENOMEM;
	}

	return 0;
}
/*
 * Free every page referenced by css->cub[] and css->ecub[] and reset the
 * slots to NULL.
 */
static void cub_free(struct channel_subsystem *css)
{
	void **slot;

	for (slot = css->cub; slot < css->cub + CSS_NUM_CUB_PAGES; slot++) {
		free_page((unsigned long)*slot);
		*slot = NULL;
	}

	for (slot = css->ecub; slot < css->ecub + CSS_NUM_ECUB_PAGES; slot++) {
		free_page((unsigned long)*slot);
		*slot = NULL;
	}
}
int
chsc_secm(struct channel_subsystem *css, int enable)
{
int ret;
if (enable && !css->cm_enabled) {
css->cub_addr1 = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
css->cub_addr2 = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
if (!css->cub_addr1 || !css->cub_addr2) {
free_page((unsigned long)css->cub_addr1);
free_page((unsigned long)css->cub_addr2);
return -ENOMEM;
}
ret = cub_alloc(css);
if (ret)
goto out;
}
ret = __chsc_do_secm(css, enable);
if (!ret) {
......@@ -934,10 +980,11 @@ chsc_secm(struct channel_subsystem *css, int enable)
} else
chsc_remove_cmg_attr(css);
}
if (!css->cm_enabled) {
free_page((unsigned long)css->cub_addr1);
free_page((unsigned long)css->cub_addr2);
}
out:
if (!css->cm_enabled)
cub_free(css);
return ret;
}
......@@ -1019,6 +1066,18 @@ chsc_initialize_cmg_chars(struct channel_path *chp, u8 cmcv,
}
}
/*
 * Scale @s by a power of ten: the result is s * 10^p, where a @p of 0 is
 * treated as 8.
 */
static unsigned long scmc_get_speed(u32 s, u32 p)
{
	unsigned long speed = s;
	u32 exponent = p ? p : 8;

	for (; exponent; exponent--)
		speed *= 10;

	return speed;
}
int chsc_get_channel_measurement_chars(struct channel_path *chp)
{
unsigned long flags;
......@@ -1035,18 +1094,23 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
u32 zeroes2;
u32 not_valid : 1;
u32 shared : 1;
u32 : 22;
u32 extended : 1;
u32 : 21;
u32 chpid : 8;
u32 cmcv : 5;
u32 : 11;
u32 : 7;
u32 cmgp : 4;
u32 cmgq : 8;
u32 cmg : 8;
u32 zeroes3;
u32 : 16;
u32 cmgs : 16;
u32 data[NR_MEASUREMENT_CHARS];
} *scmc_area;
chp->shared = -1;
chp->cmg = -1;
chp->extended = 0;
chp->speed = 0;
if (!css_chsc_characteristics.scmc || !css_chsc_characteristics.secm)
return -EINVAL;
......@@ -1076,10 +1140,8 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
chp->cmg = scmc_area->cmg;
chp->shared = scmc_area->shared;
if (chp->cmg != 2 && chp->cmg != 3) {
/* No cmg-dependent data. */
goto out;
}
chp->extended = scmc_area->extended;
chp->speed = scmc_get_speed(scmc_area->cmgs, scmc_area->cmgp);
chsc_initialize_cmg_chars(chp, scmc_area->cmcv,
(struct cmg_chars *) &scmc_area->data);
out:
......
......@@ -22,6 +22,11 @@ struct cmg_entry {
u32 values[NR_MEASUREMENT_ENTRIES];
};
#define NR_EXT_MEASUREMENT_ENTRIES 16
/* One extended measurement entry: NR_EXT_MEASUREMENT_ENTRIES 32-bit values. */
struct cmg_ext_entry {
u32 values[NR_EXT_MEASUREMENT_ENTRIES];
};
struct channel_path_desc_fmt1 {
u8 flags;
u8 lsn;
......
......@@ -309,7 +309,7 @@ static ssize_t type_show(struct device *dev, struct device_attribute *attr,
{
struct subchannel *sch = to_subchannel(dev);
return sprintf(buf, "%01x\n", sch->st);
return sysfs_emit(buf, "%01x\n", sch->st);
}
static DEVICE_ATTR_RO(type);
......@@ -319,7 +319,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
{
struct subchannel *sch = to_subchannel(dev);
return sprintf(buf, "css:t%01X\n", sch->st);
return sysfs_emit(buf, "css:t%01X\n", sch->st);
}
static DEVICE_ATTR_RO(modalias);
......@@ -345,7 +345,7 @@ static ssize_t driver_override_show(struct device *dev,
ssize_t len;
device_lock(dev);
len = snprintf(buf, PAGE_SIZE, "%s\n", sch->driver_override);
len = sysfs_emit(buf, "%s\n", sch->driver_override);
device_unlock(dev);
return len;
}
......@@ -396,8 +396,8 @@ static ssize_t pimpampom_show(struct device *dev,
struct subchannel *sch = to_subchannel(dev);
struct pmcw *pmcw = &sch->schib.pmcw;
return sprintf(buf, "%02x %02x %02x\n",
pmcw->pim, pmcw->pam, pmcw->pom);
return sysfs_emit(buf, "%02x %02x %02x\n",
pmcw->pim, pmcw->pam, pmcw->pom);
}
static DEVICE_ATTR_RO(pimpampom);
......@@ -881,7 +881,7 @@ static ssize_t real_cssid_show(struct device *dev, struct device_attribute *a,
if (!css->id_valid)
return -EINVAL;
return sprintf(buf, "%x\n", css->cssid);
return sysfs_emit(buf, "%x\n", css->cssid);
}
static DEVICE_ATTR_RO(real_cssid);
......@@ -904,7 +904,7 @@ static ssize_t cm_enable_show(struct device *dev, struct device_attribute *a,
int ret;
mutex_lock(&css->mutex);
ret = sprintf(buf, "%x\n", css->cm_enabled);
ret = sysfs_emit(buf, "%x\n", css->cm_enabled);
mutex_unlock(&css->mutex);
return ret;
}
......
......@@ -34,6 +34,15 @@
#define SNID_STATE3_MULTI_PATH 1
#define SNID_STATE3_SINGLE_PATH 0
/*
* Miscellaneous constants
*/
#define CSS_NUM_CUB_PAGES 2
#define CSS_CUES_PER_PAGE 128
#define CSS_NUM_ECUB_PAGES 4
#define CSS_ECUES_PER_PAGE 64
/*
* Conditions used to specify which subchannels need evaluation
*/
......@@ -122,8 +131,8 @@ struct channel_subsystem {
struct mutex mutex;
/* channel measurement related */
int cm_enabled;
void *cub_addr1;
void *cub_addr2;
void *cub[CSS_NUM_CUB_PAGES];
void *ecub[CSS_NUM_ECUB_PAGES];
/* for orphaned ccw devices */
struct subchannel *pseudo_subchannel;
};
......
......@@ -50,7 +50,7 @@ DECLARE_EVENT_CLASS(s390_class_schib,
__entry->devno = schib->pmcw.dev;
__entry->schib = *schib;
__entry->pmcw_ena = schib->pmcw.ena;
__entry->pmcw_st = schib->pmcw.ena;
__entry->pmcw_st = schib->pmcw.st;
__entry->pmcw_dnv = schib->pmcw.dnv;
__entry->pmcw_dev = schib->pmcw.dev;
__entry->pmcw_lpm = schib->pmcw.lpm;
......
......@@ -4,7 +4,7 @@
#
ap-objs := ap_bus.o ap_card.o ap_queue.o
obj-$(subst m,y,$(CONFIG_ZCRYPT)) += ap.o
obj-$(CONFIG_AP) += ap.o
# zcrypt_api.o and zcrypt_msgtype*.o depend on ap.o
zcrypt-objs := zcrypt_api.o zcrypt_card.o zcrypt_queue.o
zcrypt-objs += zcrypt_msgtype6.o zcrypt_msgtype50.o
......
This diff is collapsed.
......@@ -343,6 +343,28 @@ int ap_parse_mask_str(const char *str,
unsigned long *bitmap, int bits,
struct mutex *lock);
/*
* ap_hex2bitmap() - Convert a string containing a hexadecimal number (str)
* into a bitmap (bitmap) with bits set that correspond to the bits represented
* by the hex string. Input and output data is in big endian order.
*
* str - Input hex string of format "0x1234abcd". The leading "0x" is optional.
* At least one digit is required. Must be large enough to hold the number of
* bits represented by the bits parameter.
*
* bitmap - Pointer to a bitmap. Upon successful completion of this function,
* this bitmap will have bits set to match the value of str. If bitmap is longer
* than str, then the rightmost bits of bitmap are padded with zeros. Must be
* large enough to hold the number of bits represented by the bits parameter.
*
* bits - Length, in bits, of the bitmap represented by str. Must be a multiple
* of 8.
*
* Returns: 0 On success
* -EINVAL If str format is invalid or bits is not a multiple of 8.
*/
int ap_hex2bitmap(const char *str, unsigned long *bitmap, int bits);
/*
* Interface to wait for the AP bus to have done one initial ap bus
* scan and all detected APQNs have been bound to device drivers.
......
......@@ -708,7 +708,7 @@ static ssize_t ap_functions_show(struct device *dev,
static DEVICE_ATTR_RO(ap_functions);
#ifdef CONFIG_ZCRYPT_DEBUG
#ifdef CONFIG_AP_DEBUG
static ssize_t states_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
......@@ -820,7 +820,7 @@ static struct attribute *ap_queue_dev_attrs[] = {
&dev_attr_config.attr,
&dev_attr_chkstop.attr,
&dev_attr_ap_functions.attr,
#ifdef CONFIG_ZCRYPT_DEBUG
#ifdef CONFIG_AP_DEBUG
&dev_attr_states.attr,
&dev_attr_last_err_rc.attr,
#endif
......
......@@ -794,10 +794,11 @@ static int vfio_ap_mdev_probe(struct mdev_device *mdev)
/*
 * Link queue @q to @matrix_mdev and add it to the mdev's queue hash table,
 * keyed by its APQN.
 *
 * Nothing is done if @q is NULL or if vfio_ap_mdev_get_queue() already finds
 * a queue for that APQN, so a queue is never hashed twice.
 */
static void vfio_ap_mdev_link_queue(struct ap_matrix_mdev *matrix_mdev,
				    struct vfio_ap_queue *q)
{
	if (!q || vfio_ap_mdev_get_queue(matrix_mdev, q->apqn))
		return;

	q->matrix_mdev = matrix_mdev;
	hash_add(matrix_mdev->qtable.queues, &q->mdev_qnode, q->apqn);
}
static void vfio_ap_mdev_link_apqn(struct ap_matrix_mdev *matrix_mdev, int apqn)
......@@ -1118,20 +1119,29 @@ static void vfio_ap_mdev_unlink_adapter(struct ap_matrix_mdev *matrix_mdev,
}
}
static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev,
unsigned long apid)
static void vfio_ap_mdev_hot_unplug_adapters(struct ap_matrix_mdev *matrix_mdev,
unsigned long *apids)
{
struct vfio_ap_queue *q, *tmpq;
struct list_head qlist;
unsigned long apid;
bool apcb_update = false;
INIT_LIST_HEAD(&qlist);
vfio_ap_mdev_unlink_adapter(matrix_mdev, apid, &qlist);
if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm)) {
clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm);
vfio_ap_mdev_update_guest_apcb(matrix_mdev);
for_each_set_bit_inv(apid, apids, AP_DEVICES) {
vfio_ap_mdev_unlink_adapter(matrix_mdev, apid, &qlist);
if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm)) {
clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm);
apcb_update = true;
}
}
/* Only update apcb if needed to avoid impacting guest */
if (apcb_update)
vfio_ap_mdev_update_guest_apcb(matrix_mdev);
vfio_ap_mdev_reset_qlist(&qlist);
list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) {
......@@ -1140,6 +1150,16 @@ static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev,
}
}
static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev,
unsigned long apid)
{
DECLARE_BITMAP(apids, AP_DEVICES);
bitmap_zero(apids, AP_DEVICES);
set_bit_inv(apid, apids);
vfio_ap_mdev_hot_unplug_adapters(matrix_mdev, apids);
}
/**
* unassign_adapter_store - parses the APID from @buf and clears the
* corresponding bit in the mediated matrix device's APM
......@@ -1300,20 +1320,29 @@ static void vfio_ap_mdev_unlink_domain(struct ap_matrix_mdev *matrix_mdev,
}
}
static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev,
unsigned long apqi)
static void vfio_ap_mdev_hot_unplug_domains(struct ap_matrix_mdev *matrix_mdev,
unsigned long *apqis)
{
struct vfio_ap_queue *q, *tmpq;
struct list_head qlist;
unsigned long apqi;
bool apcb_update = false;
INIT_LIST_HEAD(&qlist);
vfio_ap_mdev_unlink_domain(matrix_mdev, apqi, &qlist);
if (test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) {
clear_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm);
vfio_ap_mdev_update_guest_apcb(matrix_mdev);
for_each_set_bit_inv(apqi, apqis, AP_DOMAINS) {
vfio_ap_mdev_unlink_domain(matrix_mdev, apqi, &qlist);
if (test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) {
clear_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm);
apcb_update = true;
}
}
/* Only update apcb if needed to avoid impacting guest */
if (apcb_update)
vfio_ap_mdev_update_guest_apcb(matrix_mdev);
vfio_ap_mdev_reset_qlist(&qlist);
list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) {
......@@ -1322,6 +1351,16 @@ static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev,
}
}
/*
 * Single-domain convenience wrapper: build a bitmap with only @apqi set and
 * pass it to vfio_ap_mdev_hot_unplug_domains().
 */
static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev,
					   unsigned long apqi)
{
	DECLARE_BITMAP(apqis, AP_DOMAINS);

	/* Clear with the same size the bitmap was declared with. */
	bitmap_zero(apqis, AP_DOMAINS);
	set_bit_inv(apqi, apqis);
	vfio_ap_mdev_hot_unplug_domains(matrix_mdev, apqis);
}
/**
* unassign_domain_store - parses the APQI from @buf and clears the
* corresponding bit in the mediated matrix device's AQM
......@@ -1570,6 +1609,158 @@ static ssize_t guest_matrix_show(struct device *dev,
}
static DEVICE_ATTR_RO(guest_matrix);
/*
 * Emit the first four unsigned longs of @bitmap into @buf at @offset as one
 * "0x"-prefixed hex string (16 hex digits per long), followed by @sep.
 * Returns the result of sysfs_emit_at().
 */
static ssize_t write_ap_bitmap(unsigned long *bitmap, char *buf, int offset, char sep)
{
return sysfs_emit_at(buf, offset, "0x%016lx%016lx%016lx%016lx%c",
bitmap[0], bitmap[1], bitmap[2], bitmap[3], sep);
}
/*
 * sysfs show handler: print the mdev's apm, aqm and adm masks on one line,
 * separated by commas and terminated by a newline.
 */
static ssize_t ap_config_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);
	int len;

	len = write_ap_bitmap(matrix_mdev->matrix.apm, buf, 0, ',');
	len += write_ap_bitmap(matrix_mdev->matrix.aqm, buf, len, ',');
	len += write_ap_bitmap(matrix_mdev->matrix.adm, buf, len, '\n');

	return len;
}
/* Number of characters needed for a complete hex mask representing the bits in .. */
#define AP_DEVICES_STRLEN (AP_DEVICES / 4 + 3)
#define AP_DOMAINS_STRLEN (AP_DOMAINS / 4 + 3)
#define AP_CONFIG_STRLEN (AP_DEVICES_STRLEN + 2 * AP_DOMAINS_STRLEN)
/*
 * Take the next ','- or '\n'-delimited token from *strbufptr and convert it
 * into @bitmap (@nbits bits) with ap_hex2bitmap(). The bitmap is cleared
 * before the conversion.
 *
 * Returns -EINVAL if no token is left, otherwise the result of
 * ap_hex2bitmap().
 */
static int parse_bitmap(char **strbufptr, unsigned long *bitmap, int nbits)
{
	char *token = strsep(strbufptr, ",\n");

	if (token) {
		bitmap_clear(bitmap, 0, nbits);
		return ap_hex2bitmap(token, bitmap, nbits);
	}

	return -EINVAL;
}
static int ap_matrix_overflow_check(struct ap_matrix_mdev *matrix_mdev)
{
unsigned long bit;
for_each_set_bit_inv(bit, matrix_mdev->matrix.apm, AP_DEVICES) {
if (bit > matrix_mdev->matrix.apm_max)
return -ENODEV;
}
for_each_set_bit_inv(bit, matrix_mdev->matrix.aqm, AP_DOMAINS) {
if (bit > matrix_mdev->matrix.aqm_max)
return -ENODEV;
}
for_each_set_bit_inv(bit, matrix_mdev->matrix.adm, AP_DOMAINS) {
if (bit > matrix_mdev->matrix.adm_max)
return -ENODEV;
}
return 0;
}
/*
 * Copy the apm, aqm and adm bitmaps from @src to @dst. The *_max fields are
 * not copied. A NULL @src makes this a no-op.
 */
static void ap_matrix_copy(struct ap_matrix *dst, struct ap_matrix *src)
{
	/* This check works around false positive gcc -Wstringop-overread */
	if (src) {
		bitmap_copy(dst->apm, src->apm, AP_DEVICES);
		bitmap_copy(dst->aqm, src->aqm, AP_DOMAINS);
		bitmap_copy(dst->adm, src->adm, AP_DOMAINS);
	}
}
/*
 * sysfs store handler for the "ap_config" attribute (DEVICE_ATTR_RW below).
 *
 * Parses three ','/'\n'-separated hex masks from @buf, in the order apm, aqm,
 * adm, and makes them the mdev's new matrix. The new masks are validated with
 * vfio_ap_mdev_validate_masks() and ap_matrix_overflow_check(); if either
 * fails, the previous matrix is restored and the error is returned. On
 * success, adapters/domains no longer present are hot-unplugged, newly added
 * ones are linked, and the guest APCB is updated if filtering reports a
 * change.
 *
 * Runs with ap_perms_mutex and the mdev update locks held.
 *
 * Returns @count on success, -ENOMEM if the input cannot be duplicated,
 * -EINVAL if a mask cannot be parsed, or the validation error.
 */
static ssize_t ap_config_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);
struct ap_matrix m_new, m_old, m_added, m_removed;
DECLARE_BITMAP(apm_filtered, AP_DEVICES);
unsigned long newbit;
char *newbuf, *rest;
int rc = count;
bool do_update;
/* Work on a private copy: parse_bitmap() consumes it via strsep(). */
newbuf = kstrndup(buf, AP_CONFIG_STRLEN, GFP_KERNEL);
if (!newbuf)
return -ENOMEM;
rest = newbuf;
mutex_lock(&ap_perms_mutex);
get_update_locks_for_mdev(matrix_mdev);
/* Save old state */
ap_matrix_copy(&m_old, &matrix_mdev->matrix);
if (parse_bitmap(&rest, m_new.apm, AP_DEVICES) ||
parse_bitmap(&rest, m_new.aqm, AP_DOMAINS) ||
parse_bitmap(&rest, m_new.adm, AP_DOMAINS)) {
rc = -EINVAL;
goto out;
}
/* removed = old & ~new, added = new & ~old */
bitmap_andnot(m_removed.apm, m_old.apm, m_new.apm, AP_DEVICES);
bitmap_andnot(m_removed.aqm, m_old.aqm, m_new.aqm, AP_DOMAINS);
bitmap_andnot(m_added.apm, m_new.apm, m_old.apm, AP_DEVICES);
bitmap_andnot(m_added.aqm, m_new.aqm, m_old.aqm, AP_DOMAINS);
/* Need new bitmaps in matrix_mdev for validation */
ap_matrix_copy(&matrix_mdev->matrix, &m_new);
/* Ensure new state is valid, else undo new state */
rc = vfio_ap_mdev_validate_masks(matrix_mdev);
if (rc) {
ap_matrix_copy(&matrix_mdev->matrix, &m_old);
goto out;
}
rc = ap_matrix_overflow_check(matrix_mdev);
if (rc) {
ap_matrix_copy(&matrix_mdev->matrix, &m_old);
goto out;
}
/* Validation passed: from here on the call reports success. */
rc = count;
/* Need old bitmaps in matrix_mdev for unplug/unlink */
ap_matrix_copy(&matrix_mdev->matrix, &m_old);
/* Unlink removed adapters/domains */
vfio_ap_mdev_hot_unplug_adapters(matrix_mdev, m_removed.apm);
vfio_ap_mdev_hot_unplug_domains(matrix_mdev, m_removed.aqm);
/* Need new bitmaps in matrix_mdev for linking new adapters/domains */
ap_matrix_copy(&matrix_mdev->matrix, &m_new);
/* Link newly added adapters */
for_each_set_bit_inv(newbit, m_added.apm, AP_DEVICES)
vfio_ap_mdev_link_adapter(matrix_mdev, newbit);
/* Link newly added domains */
for_each_set_bit_inv(newbit, m_added.aqm, AP_DOMAINS)
vfio_ap_mdev_link_domain(matrix_mdev, newbit);
/* filter resources not bound to vfio-ap */
do_update = vfio_ap_mdev_filter_matrix(matrix_mdev, apm_filtered);
do_update |= vfio_ap_mdev_filter_cdoms(matrix_mdev);
/* Apply changes to shadow apcb if things changed */
if (do_update) {
vfio_ap_mdev_update_guest_apcb(matrix_mdev);
reset_queues_for_apids(matrix_mdev, apm_filtered);
}
out:
release_update_locks_for_mdev(matrix_mdev);
mutex_unlock(&ap_perms_mutex);
kfree(newbuf);
return rc;
}
static DEVICE_ATTR_RW(ap_config);
static struct attribute *vfio_ap_mdev_attrs[] = {
&dev_attr_assign_adapter.attr,
&dev_attr_unassign_adapter.attr,
......@@ -1577,6 +1768,7 @@ static struct attribute *vfio_ap_mdev_attrs[] = {
&dev_attr_unassign_domain.attr,
&dev_attr_assign_control_domain.attr,
&dev_attr_unassign_control_domain.attr,
&dev_attr_ap_config.attr,
&dev_attr_control_domains.attr,
&dev_attr_matrix.attr,
&dev_attr_guest_matrix.attr,
......
......@@ -75,11 +75,11 @@ extern struct ap_matrix_dev *matrix_dev;
*/
struct ap_matrix {
	unsigned long apm_max;
	/* sized with AP_DEVICES, matching the bitmap helpers that walk apm */
	DECLARE_BITMAP(apm, AP_DEVICES);
	unsigned long aqm_max;
	/* sized with AP_DOMAINS, matching the bitmap helpers that walk aqm */
	DECLARE_BITMAP(aqm, AP_DOMAINS);
	unsigned long adm_max;
	/* sized with AP_DOMAINS, matching the bitmap helpers that walk adm */
	DECLARE_BITMAP(adm, AP_DOMAINS);
};
/**
......
......@@ -316,6 +316,38 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
goto out;
}
/*
 * Map a freshly allocated, zeroed folio at @dst_addr in @dst_vma.
 *
 * Returns 0 on success, -ENOMEM if the folio cannot be allocated or charged
 * to the memory cgroup, or the error from mfill_atomic_install_pte(). The
 * folio reference is dropped on every failure after allocation.
 */
static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
					 struct vm_area_struct *dst_vma,
					 unsigned long dst_addr)
{
	struct folio *folio;
	int ret;

	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
	if (!folio)
		return -ENOMEM;

	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) {
		folio_put(folio);
		return -ENOMEM;
	}

	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * zeroing out the folio become visible before mapping the page
	 * using set_pte_at(). See do_anonymous_page().
	 */
	__folio_mark_uptodate(folio);

	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, 0);
	if (ret)
		folio_put(folio);

	return ret;
}
static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr)
......@@ -324,6 +356,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
spinlock_t *ptl;
int ret;
if (mm_forbids_zeropage(dst_vma->vm_mm))
return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
ret = -EAGAIN;
......
......@@ -601,11 +601,6 @@ static int ignore_undef_symbol(struct elf_info *info, const char *symname)
strstarts(symname, "_savevr_") ||
strcmp(symname, ".TOC.") == 0)
return 1;
if (info->hdr->e_machine == EM_S390)
/* Expoline thunks are linked on all kernel modules during final link of .ko */
if (strstarts(symname, "__s390_indirect_jump_r"))
return 1;
/* Do not ignore this symbol */
return 0;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment