Commit 98896d87 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'x86_cc_for_v6.11_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 confidential computing updates from Borislav Petkov:
 "Unrelated x86/cc changes queued here to avoid ugly cross-merges and
  conflicts:

   - Carve out CPU hotplug function declarations into a separate header
     with the goal to be able to use the lockdep assertions in a more
     flexible manner

   - As a result, refactor cacheinfo code after carving out a function
     to return the cache ID associated with a given cache level

   - Cleanups

  Add support to be able to kexec TDX guests:

   - Expand ACPI MADT CPU offlining support

   - Add machinery to prepare CoCo guests memory before kexec-ing into a
     new kernel

   - Cleanup, readjust and massage related code"

* tag 'x86_cc_for_v6.11_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  ACPI: tables: Print MULTIPROC_WAKEUP when MADT is parsed
  x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method
  x86/mm: Introduce kernel_ident_mapping_free()
  x86/smp: Add smp_ops.stop_this_cpu() callback
  x86/acpi: Do not attempt to bring up secondary CPUs in the kexec case
  x86/acpi: Rename fields in the acpi_madt_multiproc_wakeup structure
  x86/mm: Do not zap page table entries mapping unaccepted memory table during kdump
  x86/mm: Make e820__end_ram_pfn() cover E820_TYPE_ACPI ranges
  x86/tdx: Convert shared memory back to private on kexec
  x86/mm: Add callbacks to prepare encrypted memory for kexec
  x86/tdx: Account shared memory
  x86/mm: Return correct level from lookup_address() if pte is none
  x86/mm: Make x86_platform.guest.enc_status_change_*() return an error
  x86/kexec: Keep CR4.MCE set during kexec for TDX guest
  x86/relocate_kernel: Use named labels for less confusion
  cpu/hotplug, x86/acpi: Disable CPU offlining for ACPI MADT wakeup
  cpu/hotplug: Add support for declaring CPU offlining not supported
  x86/apic: Mark acpi_mp_wake_* variables as __ro_after_init
  x86/acpi: Extract ACPI MADT wakeup code into a separate file
  x86/kexec: Remove spurious unconditional JMP from from identity_mapped()
  ...
parents 181a984b 16df3594
......@@ -1118,6 +1118,13 @@ config X86_LOCAL_APIC
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
select IRQ_DOMAIN_HIERARCHY
config ACPI_MADT_WAKEUP
def_bool y
depends on X86_64
depends on ACPI
depends on SMP
depends on X86_LOCAL_APIC
config X86_IO_APIC
def_bool y
depends on X86_LOCAL_APIC || X86_UP_IOAPIC
......
......@@ -29,7 +29,6 @@ static bool noinstr intel_cc_platform_has(enum cc_attr attr)
{
switch (attr) {
case CC_ATTR_GUEST_UNROLL_STRING_IO:
case CC_ATTR_HOTPLUG_DISABLED:
case CC_ATTR_GUEST_MEM_ENCRYPT:
case CC_ATTR_MEM_ENCRYPT:
return true;
......
......@@ -7,6 +7,7 @@
#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
......@@ -14,6 +15,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
/* MMIO direction */
#define EPT_READ 0
......@@ -38,6 +40,8 @@
#define TDREPORT_SUBTYPE_0 0
static atomic_long_t nr_shared;
/* Called from __tdx_hypercall() for unrecoverable failure */
noinstr void __noreturn __tdx_hypercall_failed(void)
{
......@@ -798,28 +802,124 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
return true;
}
static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
bool enc)
{
/*
* Only handle shared->private conversion here.
* See the comment in tdx_early_init().
*/
if (enc)
return tdx_enc_status_changed(vaddr, numpages, enc);
return true;
if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
return -EIO;
return 0;
}
static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
bool enc)
{
/*
* Only handle private->shared conversion here.
* See the comment in tdx_early_init().
*/
if (!enc)
return tdx_enc_status_changed(vaddr, numpages, enc);
return true;
if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
return -EIO;
if (enc)
atomic_long_sub(numpages, &nr_shared);
else
atomic_long_add(numpages, &nr_shared);
return 0;
}
/* Stop new private<->shared conversions */
static void tdx_kexec_begin(void)
{
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
return;
/*
* Crash kernel reaches here with interrupts disabled: can't wait for
* conversions to finish.
*
* If race happened, just report and proceed.
*/
if (!set_memory_enc_stop_conversion())
pr_warn("Failed to stop shared<->private conversions\n");
}
/*
* Walk direct mapping and convert all shared memory back to private.
*
* Scans [PAGE_OFFSET, PAGE_OFFSET + get_max_mapped()) one mapping at a time.
* Every PTE that pte_decrypted() reports as shared is cleared and the range
* it covered is handed to tdx_enc_status_changed() to be made private again.
* Conversion failures are only logged; the walk always runs to completion.
* At the end the number of pages found is cross-checked against nr_shared.
*
* Does nothing when CONFIG_KEXEC_CORE is disabled.
*/
static void tdx_kexec_finish(void)
{
unsigned long addr, end;
long found = 0, shared;
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
return;
/* Callers are expected to have interrupts disabled. */
lockdep_assert_irqs_disabled();
addr = PAGE_OFFSET;
end = PAGE_OFFSET + get_max_mapped();
while (addr < end) {
unsigned long size;
unsigned int level;
pte_t *pte;
/*
* The step size is taken from the level reported by lookup_address(),
* including when no PTE is returned.
*/
pte = lookup_address(addr, &level);
size = page_level_size(level);
if (pte && pte_decrypted(*pte)) {
int pages = size / PAGE_SIZE;
/*
* Touching memory with shared bit set triggers implicit
* conversion to shared.
*
* Make sure nobody touches the shared range from
* now on.
*/
set_pte(pte, __pte(0));
/*
* Memory encryption state persists across kexec.
* If tdx_enc_status_changed() fails in the first
* kernel, it leaves memory in an unknown state.
*
* If that memory remains shared, accessing it in the
* *next* kernel through a private mapping will result
* in an unrecoverable guest shutdown.
*
* The kdump kernel boot is not impacted as it uses
* a pre-reserved memory range that is always private.
* However, gathering crash information could lead to
* a crash if it accesses unconverted memory through
* a private mapping which is possible when accessing
* that memory through /proc/vmcore, for example.
*
* In all cases, print error info in order to leave
* enough bread crumbs for debugging.
*/
if (!tdx_enc_status_changed(addr, pages, true)) {
pr_err("Failed to unshare range %#lx-%#lx\n",
addr, addr + size);
}
/* Counted even if the conversion above failed. */
found += pages;
}
addr += size;
}
/* PTEs were cleared above; flush the TLB. */
__flush_tlb_all();
/* Sanity check: the walk should have found exactly the accounted pages. */
shared = atomic_long_read(&nr_shared);
if (shared != found) {
pr_err("shared page accounting is off\n");
pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
}
}
void __init tdx_early_init(void)
......@@ -881,6 +981,9 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
......
......@@ -523,9 +523,9 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
* transition is complete, hv_vtom_set_host_visibility() marks the pages
* as "present" again.
*/
static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
static int hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
{
return !set_memory_np(kbuffer, pagecount);
return set_memory_np(kbuffer, pagecount);
}
/*
......@@ -536,20 +536,19 @@ static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc
* with host. This function works as wrap of hv_mark_gpa_visibility()
* with memory base and size.
*/
static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
static int hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
{
enum hv_mem_host_visibility visibility = enc ?
VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE;
u64 *pfn_array;
phys_addr_t paddr;
int i, pfn, err;
void *vaddr;
int ret = 0;
bool result = true;
int i, pfn;
pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!pfn_array) {
result = false;
ret = -ENOMEM;
goto err_set_memory_p;
}
......@@ -568,10 +567,8 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo
if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
ret = hv_mark_gpa_visibility(pfn, pfn_array,
visibility);
if (ret) {
result = false;
if (ret)
goto err_free_pfn_array;
}
pfn = 0;
}
}
......@@ -586,10 +583,11 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo
* order to avoid leaving the memory range in a "broken" state. Setting
* the PRESENT bits shouldn't fail, but return an error if it does.
*/
if (set_memory_p(kbuffer, pagecount))
result = false;
err = set_memory_p(kbuffer, pagecount);
if (err && !ret)
ret = err;
return result;
return ret;
}
static bool hv_vtom_tlb_flush_required(bool private)
......
......@@ -78,6 +78,13 @@ static inline bool acpi_skip_set_wakeup_address(void)
#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address
union acpi_subtable_headers;
int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
const unsigned long end);
void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
/*
* Check if the CPU can handle C2 and deeper
*/
......
......@@ -6,6 +6,7 @@
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
void (*free_pgt_page)(void *, void *); /* free buf for page table */
void *context; /* context for alloc_pgt_page */
unsigned long page_flag; /* page flag for PMD or PUD entry */
unsigned long offset; /* ident mapping offset */
......@@ -16,4 +17,6 @@ struct x86_mapping_info {
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long pstart, unsigned long pend);
void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd);
#endif /* _ASM_X86_INIT_H */
......@@ -140,6 +140,11 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
/*
* Return true when applying cc_mkdec() to the PTE value leaves it unchanged,
* i.e. the PTE already has the form cc_mkdec() produces.
* NOTE(review): presumably this means "mapped as shared/decrypted" on
* confidential-computing guests; confirm against the cc_mkdec() definition.
*/
static inline bool pte_decrypted(pte_t pte)
{
return cc_mkdec(pte_val(pte)) == pte_val(pte);
}
#define pmd_dirty pmd_dirty
static inline bool pmd_dirty(pmd_t pmd)
{
......
......@@ -549,6 +549,7 @@ enum pg_level {
PG_LEVEL_2M,
PG_LEVEL_1G,
PG_LEVEL_512G,
PG_LEVEL_256T,
PG_LEVEL_NUM
};
......
......@@ -49,8 +49,11 @@ int set_memory_wb(unsigned long addr, int numpages);
int set_memory_np(unsigned long addr, int numpages);
int set_memory_p(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
bool set_memory_enc_stop_conversion(void);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
int set_memory_np_noalias(unsigned long addr, int numpages);
int set_memory_nonglobal(unsigned long addr, int numpages);
int set_memory_global(unsigned long addr, int numpages);
......
......@@ -35,6 +35,7 @@ struct smp_ops {
int (*cpu_disable)(void);
void (*cpu_die)(unsigned int cpu);
void (*play_dead)(void);
void (*stop_this_cpu)(void);
void (*send_call_func_ipi)(const struct cpumask *mask);
void (*send_call_func_single_ipi)(int cpu);
......
......@@ -149,12 +149,22 @@ struct x86_init_acpi {
* @enc_status_change_finish Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
* @enc_kexec_begin Begin the two-step process of converting shared memory back
* to private. It stops the new conversions from being started
* and waits for in-flight conversions to finish, if possible.
* @enc_kexec_finish Finish the two-step process of converting shared memory to
* private. All memory is private when
* the function returns.
* It is called on only one CPU while the others are shut down
* and with interrupts disabled.
*/
struct x86_guest {
bool (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
bool (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
void (*enc_kexec_begin)(void);
void (*enc_kexec_finish)(void);
};
/**
......
......@@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
......
......@@ -67,13 +67,6 @@ static bool has_lapic_cpus __initdata;
static bool acpi_support_online_capable;
#endif
#ifdef CONFIG_X86_64
/* Physical address of the Multiprocessor Wakeup Structure mailbox */
static u64 acpi_mp_wake_mailbox_paddr;
/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
#endif
#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
......@@ -341,60 +334,6 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
return 0;
}
#ifdef CONFIG_X86_64
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
/*
* Remap mailbox memory only for the first call to acpi_wakeup_cpu().
*
* Wakeup of secondary CPUs is fully serialized in the core code.
* No need to protect acpi_mp_wake_mailbox from concurrent accesses.
*/
if (!acpi_mp_wake_mailbox) {
acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
sizeof(*acpi_mp_wake_mailbox),
MEMREMAP_WB);
}
/*
* Mailbox memory is shared between the firmware and OS. Firmware will
* listen on mailbox command address, and once it receives the wakeup
* command, the CPU associated with the given apicid will be booted.
*
* The value of 'apic_id' and 'wakeup_vector' must be visible to the
* firmware before the wakeup command is visible. smp_store_release()
* ensures ordering and visibility.
*/
acpi_mp_wake_mailbox->apic_id = apicid;
acpi_mp_wake_mailbox->wakeup_vector = start_ip;
smp_store_release(&acpi_mp_wake_mailbox->command,
ACPI_MP_WAKE_COMMAND_WAKEUP);
/*
* Wait for the CPU to wake up.
*
* The CPU being woken up is essentially in a spin loop waiting to be
* woken up. It should not take long for it wake up and acknowledge by
* zeroing out ->command.
*
* ACPI specification doesn't provide any guidance on how long kernel
* has to wait for a wake up acknowledgement. It also doesn't provide
* a way to cancel a wake up request if it takes too long.
*
* In TDX environment, the VMM has control over how long it takes to
* wake up secondary. It can postpone scheduling secondary vCPU
* indefinitely. Giving up on wake up request and reporting error opens
* possible attack vector for VMM: it can wake up a secondary CPU when
* kernel doesn't expect it. Wait until positive result of the wake up
* request.
*/
while (READ_ONCE(acpi_mp_wake_mailbox->command))
cpu_relax();
return 0;
}
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
......@@ -1124,29 +1063,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
}
return 0;
}
#ifdef CONFIG_X86_64
static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
const unsigned long end)
{
struct acpi_madt_multiproc_wakeup *mp_wake;
if (!IS_ENABLED(CONFIG_SMP))
return -ENODEV;
mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
if (BAD_MADT_ENTRY(mp_wake, end))
return -EINVAL;
acpi_table_print_madt_entry(&header->common);
acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
return 0;
}
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
......@@ -1343,7 +1259,7 @@ static void __init acpi_process_madt(void)
smp_found_config = 1;
}
#ifdef CONFIG_X86_64
#ifdef CONFIG_ACPI_MADT_WAKEUP
/*
* Parse MADT MP Wake entry.
*/
......
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/nospec-branch.h>
#include <asm/page_types.h>
#include <asm/processor-flags.h>
.text
.align PAGE_SIZE
/*
* asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
*
* rdi: Address of the ACPI MADT MPWK ResetVector
* rsi: PGD of the identity mapping (loaded into CR3 as-is)
*
* Clobbers rdx. Does not return: the function ends with an indirect jump
* to the address in rdi.
*/
SYM_FUNC_START(asm_acpi_mp_play_dead)
/* Turn off global entries. Following CR3 write will flush them. */
movq %cr4, %rdx
andq $~(X86_CR4_PGE), %rdx
movq %rdx, %cr4
/* Switch to identity mapping */
movq %rsi, %cr3
/* Jump to reset vector */
ANNOTATE_RETPOLINE_SAFE
jmp *%rdi
SYM_FUNC_END(asm_acpi_mp_play_dead)
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
#include <linux/pgtable.h>
#include <linux/sched/hotplug.h>
#include <asm/apic.h>
#include <asm/barrier.h>
#include <asm/init.h>
#include <asm/intel_pt.h>
#include <asm/nmi.h>
#include <asm/processor.h>
#include <asm/reboot.h>
/* Physical address of the Multiprocessor Wakeup Structure mailbox */
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;

/*
 * Virtual address of the Multiprocessor Wakeup Structure mailbox.
 *
 * Deliberately NOT __ro_after_init: acpi_wakeup_cpu() maps the mailbox
 * lazily on the first wakeup request and stores the result here. That first
 * request can arrive after init has completed (e.g. booting with "maxcpus=1"
 * and onlining the other CPUs from user space later), at which point a
 * write to __ro_after_init data would fault.
 */
static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;

/* Physical address of the identity-mapping PGD handed to asm_acpi_mp_play_dead() */
static u64 acpi_mp_pgd __ro_after_init;

/* Physical address of the reset vector handed to asm_acpi_mp_play_dead() */
static u64 acpi_mp_reset_vector_paddr __ro_after_init;
/*
* smp_ops.stop_this_cpu callback: jump straight to the reset vector through
* the identity mapping, using the values recorded by acpi_mp_setup_reset().
*/
static void acpi_mp_stop_this_cpu(void)
{
asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}
/*
* smp_ops.play_dead callback: run play_dead_common() first, then jump to
* the reset vector through the identity mapping.
*/
static void acpi_mp_play_dead(void)
{
play_dead_common();
asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}
/*
 * smp_ops.cpu_die callback: confirm that the BIOS has taken over @cpu.
 *
 * Posts a TEST command for the CPU's APIC ID in the wakeup mailbox and polls
 * for up to one second for the 'command' field to be cleared. A timeout is
 * only reported; nothing else is done about it.
 */
static void acpi_mp_cpu_die(unsigned int cpu)
{
	/* Don't wait longer than a second. */
	unsigned long usecs_left = USEC_PER_SEC;
	u32 target_apicid = per_cpu(x86_cpu_to_apicid, cpu);

	/*
	 * Use TEST mailbox command to prove that BIOS got control over
	 * the CPU before declaring it dead.
	 *
	 * BIOS has to clear 'command' field of the mailbox.
	 */
	acpi_mp_wake_mailbox->apic_id = target_apicid;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_TEST);

	for (;;) {
		/* Acknowledged: the budget is left untouched. */
		if (!READ_ONCE(acpi_mp_wake_mailbox->command))
			break;
		if (!--usecs_left)
			break;
		udelay(1);
	}

	if (usecs_left == 0)
		pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
}
/*
* Allocate one page-aligned page from memblock for use as a page table.
*
* The argument is required to match type of x86_mapping_info::alloc_pgt_page
* and is otherwise unused. Returns whatever memblock_alloc() returns; callers
* in this file treat NULL as allocation failure.
*/
static void __init *alloc_pgt_page(void *dummy)
{
return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
}
/*
 * Give one page-table page back to memblock.
 *
 * The second argument is unused; it is only there to match the type of
 * x86_mapping_info::free_pgt_page.
 */
static void __init free_pgt_page(void *pgt, void *dummy)
{
	/* No 'return <expr>;' here: ISO C forbids it in a void function. */
	memblock_free(pgt, PAGE_SIZE);
}
/*
* Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
* the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
* to the identity mapping and the function has to be present at the same spot
* in the virtual address space before and after switching page tables.
*
* @pgd: top-level table of the identity mapping being built.
*
* Returns 0 on success or -ENOMEM if a page-table page cannot be allocated.
* Tables allocated before a failure are not released here; they are already
* linked into @pgd, and the caller tears the whole mapping down.
*/
static int __init init_transition_pgtable(pgd_t *pgd)
{
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
unsigned long vaddr, paddr;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
vaddr = (unsigned long)asm_acpi_mp_play_dead;
/* Walk down to the PTE for vaddr, allocating any missing level. */
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
p4d = (p4d_t *)alloc_pgt_page(NULL);
if (!p4d)
return -ENOMEM;
set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
}
p4d = p4d_offset(pgd, vaddr);
if (!p4d_present(*p4d)) {
pud = (pud_t *)alloc_pgt_page(NULL);
if (!pud)
return -ENOMEM;
set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
pud = pud_offset(p4d, vaddr);
if (!pud_present(*pud)) {
pmd = (pmd_t *)alloc_pgt_page(NULL);
if (!pmd)
return -ENOMEM;
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
pmd = pmd_offset(pud, vaddr);
if (!pmd_present(*pmd)) {
pte = (pte_t *)alloc_pgt_page(NULL);
if (!pte)
return -ENOMEM;
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
pte = pte_offset_kernel(pmd, vaddr);
/* Map the kernel virtual address of the function onto its own physical page. */
paddr = __pa(vaddr);
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
return 0;
}
/*
 * Prepare everything needed to hand a CPU back to the firmware.
 *
 * Builds an identity mapping covering every pfn_mapped[] range plus the page
 * containing @reset_vector, adds the transition mapping for
 * asm_acpi_mp_play_dead(), and only then installs the smp_ops callbacks and
 * records the reset vector and PGD physical addresses.
 *
 * Returns 0 on success. Any failure after the PGD was allocated frees the
 * partially built mapping and returns -ENOMEM.
 */
static int __init acpi_mp_setup_reset(u64 reset_vector)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page = alloc_pgt_page,
		.free_pgt_page	= free_pgt_page,
		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag	= _KERNPG_TABLE_NOENC,
	};
	pgd_t *pgd;
	int idx;

	pgd = alloc_pgt_page(NULL);
	if (!pgd)
		return -ENOMEM;

	/* Identity-map everything the kernel currently has mapped. */
	for (idx = 0; idx < nr_pfn_mapped; idx++) {
		unsigned long mstart = pfn_mapped[idx].start << PAGE_SHIFT;
		unsigned long mend = pfn_mapped[idx].end << PAGE_SHIFT;

		if (kernel_ident_mapping_init(&info, pgd, mstart, mend))
			goto err_free_mapping;
	}

	/* The page holding the reset vector must be reachable as well. */
	if (kernel_ident_mapping_init(&info, pgd,
				      PAGE_ALIGN_DOWN(reset_vector),
				      PAGE_ALIGN(reset_vector + 1)))
		goto err_free_mapping;

	if (init_transition_pgtable(pgd))
		goto err_free_mapping;

	smp_ops.play_dead = acpi_mp_play_dead;
	smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
	smp_ops.cpu_die = acpi_mp_cpu_die;

	acpi_mp_reset_vector_paddr = reset_vector;
	acpi_mp_pgd = __pa(pgd);

	return 0;

err_free_mapping:
	kernel_ident_mapping_free(&info, pgd);
	return -ENOMEM;
}
/*
 * Wake up a secondary CPU through the ACPI MADT wakeup mailbox.
 *
 * @apicid:   APIC ID of the CPU to wake.
 * @start_ip: address the woken CPU starts executing at.
 *
 * Returns 0 once the firmware has acknowledged the command by clearing the
 * mailbox 'command' field, -EOPNOTSUPP if no mailbox address is known, or
 * -ENOMEM if the mailbox cannot be mapped.
 */
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
	if (!acpi_mp_wake_mailbox_paddr) {
		pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
		return -EOPNOTSUPP;
	}

	/*
	 * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
	 *
	 * Wakeup of secondary CPUs is fully serialized in the core code.
	 * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
	 */
	if (!acpi_mp_wake_mailbox) {
		acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
						sizeof(*acpi_mp_wake_mailbox),
						MEMREMAP_WB);
		/*
		 * memremap() returns NULL on failure. Bail out before any
		 * command is posted instead of dereferencing a NULL mailbox;
		 * the next call retries the mapping.
		 */
		if (!acpi_mp_wake_mailbox) {
			pr_err_once("Failed to map MADT wakeup mailbox\n");
			return -ENOMEM;
		}
	}

	/*
	 * Mailbox memory is shared between the firmware and OS. Firmware will
	 * listen on mailbox command address, and once it receives the wakeup
	 * command, the CPU associated with the given apicid will be booted.
	 *
	 * The value of 'apic_id' and 'wakeup_vector' must be visible to the
	 * firmware before the wakeup command is visible. smp_store_release()
	 * ensures ordering and visibility.
	 */
	acpi_mp_wake_mailbox->apic_id = apicid;
	acpi_mp_wake_mailbox->wakeup_vector = start_ip;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_WAKEUP);

	/*
	 * Wait for the CPU to wake up.
	 *
	 * The CPU being woken up is essentially in a spin loop waiting to be
	 * woken up. It should not take long for it to wake up and acknowledge
	 * by zeroing out ->command.
	 *
	 * ACPI specification doesn't provide any guidance on how long kernel
	 * has to wait for a wake up acknowledgment. It also doesn't provide
	 * a way to cancel a wake up request if it takes too long.
	 *
	 * In TDX environment, the VMM has control over how long it takes to
	 * wake up secondary. It can postpone scheduling secondary vCPU
	 * indefinitely. Giving up on wake up request and reporting error opens
	 * possible attack vector for VMM: it can wake up a secondary CPU when
	 * kernel doesn't expect it. Wait until positive result of the wake up
	 * request.
	 */
	while (READ_ONCE(acpi_mp_wake_mailbox->command))
		cpu_relax();

	return 0;
}
/*
* Disable CPU offlining and make the wakeup mailbox unusable for a kexec
* kernel by zeroing the mailbox address in the MADT wakeup structure
* @mp_wake.
*/
static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
{
cpu_hotplug_disable_offlining();
/*
* ACPI MADT doesn't allow a CPU to be offlined after it was onlined.
* This limits kexec: the second kernel won't be able to use more than
* one CPU.
*
* To prevent a kexec kernel from onlining secondary CPUs, invalidate
* the mailbox address in the ACPI MADT wakeup structure, which prevents
* a kexec kernel from using it.
*
* This is safe as the booting kernel has the mailbox address cached
* already and acpi_wakeup_cpu() uses the cached value to bring up the
* secondary CPUs.
*
* Note: This is a Linux specific convention and not covered by the
* ACPI specification.
*/
mp_wake->mailbox_address = 0;
}
/*
 * Parse the MADT Multiprocessor Wakeup entry.
 *
 * Validates the entry against the V0 size, caches the mailbox address and
 * installs acpi_wakeup_cpu() as the secondary-CPU wakeup callback. CPU
 * offlining stays enabled only when the entry is a V1 structure and the
 * reset-vector setup succeeds; otherwise it is disabled.
 *
 * Returns 0 on success, -EINVAL for a missing or too-short entry.
 */
int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
			      const unsigned long end)
{
	struct acpi_madt_multiproc_wakeup *mp_wake =
		(struct acpi_madt_multiproc_wakeup *)header;
	bool has_reset_vector;

	/*
	 * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
	 * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
	 * than the actual size of the MP wakeup entry in ACPI table because the
	 * 'reset_vector' is only available in the V1 MP wakeup structure.
	 */
	if (!mp_wake ||
	    end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0 ||
	    mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
		return -EINVAL;

	acpi_table_print_madt_entry(&header->common);

	acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;

	has_reset_vector = mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
			   mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1;

	if (!has_reset_vector) {
		/*
		 * CPU offlining requires version 1 of the ACPI MADT wakeup
		 * structure.
		 */
		acpi_mp_disable_offlining(mp_wake);
	} else if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
		pr_warn("Failed to setup MADT reset vector\n");
		acpi_mp_disable_offlining(mp_wake);
	}

	apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);

	return 0;
}
......@@ -292,9 +292,8 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
*/
static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
{
struct cpu_cacheinfo *ci;
struct cacheinfo *ci;
int ret;
int i;
/* Pick the first cpu we find that is associated with the cache. */
plr->cpu = cpumask_first(&plr->d->cpu_mask);
......@@ -306,16 +305,12 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
goto out_region;
}
ci = get_cpu_cacheinfo(plr->cpu);
ci = get_cpu_cacheinfo_level(plr->cpu, plr->s->res->cache_level);
if (ci) {
plr->line_size = ci->coherency_line_size;
plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
for (i = 0; i < ci->num_leaves; i++) {
if (ci->info_list[i].level == plr->s->res->cache_level) {
plr->line_size = ci->info_list[i].coherency_line_size;
return 0;
}
}
ret = -1;
rdt_last_cmd_puts("Unable to determine cache line size\n");
......
......@@ -1450,18 +1450,14 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
struct rdt_domain *d, unsigned long cbm)
{
struct cpu_cacheinfo *ci;
unsigned int size = 0;
int num_b, i;
struct cacheinfo *ci;
int num_b;
num_b = bitmap_weight(&cbm, r->cache.cbm_len);
ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
for (i = 0; i < ci->num_leaves; i++) {
if (ci->info_list[i].level == r->cache_level) {
size = ci->info_list[i].size / r->cache.cbm_len * num_b;
break;
}
}
ci = get_cpu_cacheinfo_level(cpumask_any(&d->cpu_mask), r->cache_level);
if (ci)
size = ci->size / r->cache.cbm_len * num_b;
return size;
}
......
......@@ -128,6 +128,18 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
/*
* Non-crash kexec calls enc_kexec_begin() while scheduling is still
* active. This allows the callback to wait until all in-flight
* shared<->private conversions are complete. In a crash scenario,
* enc_kexec_begin() gets called after all but one CPU have been shut
* down and interrupts have been disabled. This allows the callback to
* detect a race with the conversion and report it.
*/
x86_platform.guest.enc_kexec_begin();
x86_platform.guest.enc_kexec_finish();
crash_save_cpu(regs, safe_smp_processor_id());
}
......
......@@ -828,7 +828,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn)
{
int i;
unsigned long last_pfn = 0;
......@@ -839,7 +839,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
unsigned long start_pfn;
unsigned long end_pfn;
if (entry->type != type)
if (entry->type != E820_TYPE_RAM &&
entry->type != E820_TYPE_ACPI)
continue;
start_pfn = entry->addr >> PAGE_SHIFT;
......@@ -865,12 +866,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
unsigned long __init e820__end_of_ram_pfn(void)
{
return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
return e820__end_ram_pfn(MAX_ARCH_PFN);
}
unsigned long __init e820__end_of_low_ram_pfn(void)
{
return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT));
}
static void __init early_panic(char *msg)
......
......@@ -835,6 +835,13 @@ void __noreturn stop_this_cpu(void *dummy)
*/
cpumask_clear_cpu(cpu, &cpus_stop_mask);
#ifdef CONFIG_SMP
if (smp_ops.stop_this_cpu) {
smp_ops.stop_this_cpu();
unreachable();
}
#endif
for (;;) {
/*
* Use native_halt() so that memory contents don't change
......
......@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
......@@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
/*
* Call enc_kexec_begin() while all CPUs are still active and
* interrupts are enabled. This will allow all in-flight memory
* conversions to finish cleanly.
*/
if (kexec_in_progress)
x86_platform.guest.enc_kexec_begin();
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*
......@@ -752,6 +761,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
if (kexec_in_progress)
x86_platform.guest.enc_kexec_finish();
}
static void __machine_emergency_restart(int emergency)
......@@ -868,6 +880,12 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
if (smp_ops.stop_this_cpu) {
smp_ops.stop_this_cpu();
unreachable();
}
/* Assume hlt works */
halt();
for (;;)
......
......@@ -5,6 +5,8 @@
*/
#include <linux/linkage.h>
#include <linux/stringify.h>
#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
......@@ -145,16 +147,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* Set cr4 to a known state:
* - physical address extension enabled
* - 5-level paging, if it was enabled before
* - Machine check exception on TDX guest, if it was enabled before.
* Clearing MCE might not be allowed in TDX guests, depending on setup.
*
* Use R13 that contains the original CR4 value, read in relocate_kernel().
* PAE is always set in the original CR4.
*/
movl $X86_CR4_PAE, %eax
testq $X86_CR4_LA57, %r13
jz 1f
orl $X86_CR4_LA57, %eax
1:
movq %rax, %cr4
jmp 1f
1:
andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
movq %r13, %cr4
/* Flush the TLB (needed?) */
movq %r9, %cr3
......@@ -165,9 +166,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* used by kexec. Flush the caches before copying the kernel.
*/
testq %r12, %r12
jz 1f
jz .Lsme_off
wbinvd
1:
.Lsme_off:
movq %rcx, %r11
call swap_pages
......@@ -187,7 +188,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
*/
testq %r11, %r11
jnz 1f
jnz .Lrelocate
xorl %eax, %eax
xorl %ebx, %ebx
xorl %ecx, %ecx
......@@ -208,7 +209,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
ret
int3
1:
.Lrelocate:
popq %rdx
leaq PAGE_SIZE(%r10), %rsp
ANNOTATE_RETPOLINE_SAFE
......
......@@ -134,10 +134,12 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; }
static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; }
static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
static void enc_kexec_begin_noop(void) {}
static void enc_kexec_finish_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }
struct x86_platform_ops x86_platform __ro_after_init = {
......@@ -161,6 +163,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.enc_status_change_finish = enc_status_change_finish_noop,
.enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
.enc_kexec_begin = enc_kexec_begin_noop,
.enc_kexec_finish = enc_kexec_finish_noop,
},
};
......
......@@ -4,6 +4,79 @@
* included by both the compressed kernel and the regular kernel.
*/
/*
 * Hand the PTE page referenced by @pmd back through the caller-supplied
 * info->free_pgt_page() callback.
 */
static void free_pte(struct x86_mapping_info *info, pmd_t *pmd)
{
	/* Looking up address 0 yields the first entry of the PTE page. */
	info->free_pgt_page(pte_offset_kernel(pmd, 0), info->context);
}
/*
 * Free the PMD page referenced by @pud, after first freeing every PTE page
 * that hangs off one of its entries.
 */
static void free_pmd(struct x86_mapping_info *info, pud_t *pud)
{
	pmd_t *pmd_page = pmd_offset(pud, 0);
	int idx;

	for (idx = 0; idx < PTRS_PER_PMD; idx++) {
		pmd_t *entry = &pmd_page[idx];

		/* Only present, non-leaf entries reference a PTE page. */
		if (pmd_present(*entry) && !pmd_leaf(*entry))
			free_pte(info, entry);
	}

	info->free_pgt_page(pmd_page, info->context);
}
/*
 * Free the PUD page referenced by @p4d, after first freeing every PMD
 * subtree that hangs off one of its entries.
 */
static void free_pud(struct x86_mapping_info *info, p4d_t *p4d)
{
	pud_t *pud_page = pud_offset(p4d, 0);
	int idx;

	for (idx = 0; idx < PTRS_PER_PUD; idx++) {
		pud_t *entry = &pud_page[idx];

		/* Only present, non-leaf entries reference a PMD page. */
		if (pud_present(*entry) && !pud_leaf(*entry))
			free_pmd(info, entry);
	}

	info->free_pgt_page(pud_page, info->context);
}
/*
 * Free every PUD subtree reachable from the P4D table referenced by @pgd,
 * then the P4D page itself when 5-level paging is in use.
 */
static void free_p4d(struct x86_mapping_info *info, pgd_t *pgd)
{
	p4d_t *p4d_page = p4d_offset(pgd, 0);
	int idx;

	for (idx = 0; idx < PTRS_PER_P4D; idx++) {
		if (p4d_present(p4d_page[idx]))
			free_pud(info, &p4d_page[idx]);
	}

	/*
	 * The P4D page is only released with 5-level paging enabled;
	 * presumably the level is folded into the PGD otherwise - confirm.
	 */
	if (pgtable_l5_enabled())
		info->free_pgt_page(p4d_page, info->context);
}
/*
 * Tear down the page-table hierarchy rooted at @pgd: free the tables below
 * every present top-level entry, then free the PGD page itself.  Every page
 * is released through info->free_pgt_page() with info->context.
 */
void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd)
{
	int idx;

	for (idx = 0; idx < PTRS_PER_PGD; idx++) {
		if (pgd_present(pgd[idx]))
			free_p4d(info, &pgd[idx]);
	}

	info->free_pgt_page(pgd, info->context);
}
static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
unsigned long addr, unsigned long end)
{
......
......@@ -469,7 +469,9 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
E820_TYPE_RESERVED_KERN) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_ACPI))
set_pte_init(pte, __pte(0), init);
continue;
}
......@@ -524,7 +526,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
E820_TYPE_RESERVED_KERN) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_ACPI))
set_pmd_init(pmd, __pmd(0), init);
continue;
}
......@@ -611,7 +615,9 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
E820_TYPE_RESERVED_KERN) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_ACPI))
set_pud_init(pud, __pud(0), init);
continue;
}
......@@ -698,7 +704,9 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
E820_TYPE_RESERVED_KERN) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_ACPI))
set_p4d_init(p4d, __p4d(0), init);
continue;
}
......
......@@ -283,7 +283,7 @@ static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc)
#endif
}
static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
static int amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
{
/*
* To maintain the security guarantees of SEV-SNP guests, make sure
......@@ -292,11 +292,11 @@ static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool
if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc)
snp_set_memory_shared(vaddr, npages);
return true;
return 0;
}
/* Return 0 unconditionally: return value doesn't matter for the SEV side */
static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
static int amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
{
/*
* After memory is mapped encrypted in the page table, validate it
......@@ -308,7 +308,7 @@ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool e
if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc);
return true;
return 0;
}
static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
......
......@@ -662,8 +662,9 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
/*
* Lookup the page table entry for a virtual address in a specific pgd.
* Return a pointer to the entry, the level of the mapping, and the effective
* NX and RW bits of all page table levels.
* Return a pointer to the entry (or NULL if the entry does not exist),
* the level of the entry, and the effective NX and RW bits of all
* page table levels.
*/
pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
unsigned int *level, bool *nx, bool *rw)
......@@ -672,13 +673,14 @@ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
pud_t *pud;
pmd_t *pmd;
*level = PG_LEVEL_NONE;
*level = PG_LEVEL_256T;
*nx = false;
*rw = true;
if (pgd_none(*pgd))
return NULL;
*level = PG_LEVEL_512G;
*nx |= pgd_flags(*pgd) & _PAGE_NX;
*rw &= pgd_flags(*pgd) & _PAGE_RW;
......@@ -686,10 +688,10 @@ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
if (p4d_none(*p4d))
return NULL;
*level = PG_LEVEL_512G;
if (p4d_leaf(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
*level = PG_LEVEL_1G;
*nx |= p4d_flags(*p4d) & _PAGE_NX;
*rw &= p4d_flags(*p4d) & _PAGE_RW;
......@@ -697,10 +699,10 @@ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
if (pud_none(*pud))
return NULL;
*level = PG_LEVEL_1G;
if (pud_leaf(*pud) || !pud_present(*pud))
return (pte_t *)pud;
*level = PG_LEVEL_2M;
*nx |= pud_flags(*pud) & _PAGE_NX;
*rw &= pud_flags(*pud) & _PAGE_RW;
......@@ -708,15 +710,13 @@ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
if (pmd_none(*pmd))
return NULL;
*level = PG_LEVEL_2M;
if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
*level = PG_LEVEL_4K;
*nx |= pmd_flags(*pmd) & _PAGE_NX;
*rw &= pmd_flags(*pmd) & _PAGE_RW;
*level = PG_LEVEL_4K;
return pte_offset_kernel(pmd, address);
}
......@@ -736,9 +736,8 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
* Note: We return pud and pmd either when the entry is marked large
* or when the present bit is not set. Otherwise we would return a
* pointer to a nonexisting mapping.
* Note: the function returns p4d, pud or pmd either when the entry is marked
* large or when the present bit is not set. Otherwise it returns NULL.
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
......@@ -2196,7 +2195,8 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
/* Notify hypervisor that we are about to set/clr encryption attribute. */
if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
if (ret)
goto vmm_fail;
ret = __change_page_attr_set_clr(&cpa, 1);
......@@ -2214,24 +2214,61 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
return ret;
/* Notify hypervisor that we have successfully set/clr encryption attribute. */
if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc);
if (ret)
goto vmm_fail;
return 0;
vmm_fail:
WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
(void *)addr, numpages, enc ? "private" : "shared");
WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n",
(void *)addr, numpages, enc ? "private" : "shared", ret);
return ret;
}
/*
* The lock serializes conversions between private and shared memory.
*
* It is taken for read on conversion. A write lock guarantees that no
* concurrent conversions are in progress.
*/
static DECLARE_RWSEM(mem_enc_lock);
/*
* Stop new private<->shared conversions.
*
* Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
* The lock is not released to prevent new conversions from being started.
*/
bool set_memory_enc_stop_conversion(void)
{
	/*
	 * In a crash scenario, sleep is not allowed. Try to take the lock.
	 * Failure indicates that there is a race with the conversion.
	 */
	if (oops_in_progress)
		return down_write_trylock(&mem_enc_lock);

	/* May sleep: waits for in-flight conversions to finish. */
	down_write(&mem_enc_lock);

	/*
	 * The write lock stays held on return so that no new conversion can
	 * start.  Report success as a plain boolean, not an errno value.
	 */
	return true;
}
/*
 * Change the encryption attribute of @numpages pages starting at @addr.
 *
 * Returns 0 without doing anything when the platform lacks
 * CC_ATTR_MEM_ENCRYPT.  Otherwise the page-table update runs while holding
 * mem_enc_lock for read; if the lock cannot be taken immediately (a writer
 * has stopped conversions) the call fails with -EBUSY and the page tables
 * are left untouched.  Any other return value comes from
 * __set_memory_enc_pgtable().
 */
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
	int ret = 0;

	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
		/* Never block here: a held write lock means "no more conversions". */
		if (!down_read_trylock(&mem_enc_lock))
			return -EBUSY;

		ret = __set_memory_enc_pgtable(addr, numpages, enc);

		up_read(&mem_enc_lock);
	}

	return ret;
}
int set_memory_encrypted(unsigned long addr, int numpages)
......
......@@ -198,6 +198,20 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header)
}
break;
case ACPI_MADT_TYPE_MULTIPROC_WAKEUP:
{
struct acpi_madt_multiproc_wakeup *p =
(struct acpi_madt_multiproc_wakeup *)header;
u64 reset_vector = 0;
if (p->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1)
reset_vector = p->reset_vector;
pr_debug("MP Wakeup (version[%d], mailbox[%#llx], reset[%#llx])\n",
p->version, p->mailbox_address, reset_vector);
}
break;
case ACPI_MADT_TYPE_CORE_PIC:
{
struct acpi_madt_core_pic *p = (struct acpi_madt_core_pic *)header;
......
......@@ -1194,11 +1194,23 @@ struct acpi_madt_generic_translator {
struct acpi_madt_multiproc_wakeup {
	struct acpi_subtable_header header;
	u16 version;		/* see enum acpi_madt_multiproc_wakeup_version */
	u32 reserved;		/* reserved - must be zero */
	u64 mailbox_address;
	u64 reset_vector;	/* only read when version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 */
};
/*
 * Values for Version field above.
 *
 * acpi_table_print_madt_entry() only reads the reset_vector field when the
 * version is at least ACPI_MADT_MP_WAKEUP_VERSION_V1.
 */
enum acpi_madt_multiproc_wakeup_version {
ACPI_MADT_MP_WAKEUP_VERSION_NONE = 0,
ACPI_MADT_MP_WAKEUP_VERSION_V1 = 1,
ACPI_MADT_MP_WAKEUP_VERSION_RESERVED = 2, /* 2 and greater are reserved */
};
#define ACPI_MADT_MP_WAKEUP_SIZE_V0 16
#define ACPI_MADT_MP_WAKEUP_SIZE_V1 24
#define ACPI_MULTIPROC_WAKEUP_MB_OS_SIZE 2032
#define ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE 2048
......@@ -1212,6 +1224,7 @@ struct acpi_madt_multiproc_wakeup_mailbox {
};
#define ACPI_MP_WAKE_COMMAND_WAKEUP 1
#define ACPI_MP_WAKE_COMMAND_TEST 2
/* 17: CPU Core Interrupt Controller (ACPI 6.5) */
......
......@@ -3,6 +3,7 @@
#define _LINUX_CACHEINFO_H
#include <linux/bitops.h>
#include <linux/cpuhplock.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
......@@ -113,23 +114,37 @@ int acpi_get_cache_info(unsigned int cpu,
const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf);
/*
* Get the id of the cache associated with @cpu at level @level.
* Get the cacheinfo structure for the cache associated with @cpu at
* level @level.
* cpuhp lock must be held.
*/
static inline int get_cpu_cacheinfo_id(int cpu, int level)
static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
{
struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
int i;
lockdep_assert_cpus_held();
for (i = 0; i < ci->num_leaves; i++) {
if (ci->info_list[i].level == level) {
if (ci->info_list[i].attributes & CACHE_ID)
return ci->info_list[i].id;
return -1;
return &ci->info_list[i];
return NULL;
}
}
return -1;
return NULL;
}
/*
 * Return the id of the cache associated with @cpu at level @level, or -1
 * when get_cpu_cacheinfo_level() finds no matching cacheinfo.
 * cpuhp lock must be held.
 */
static inline int get_cpu_cacheinfo_id(int cpu, int level)
{
	struct cacheinfo *leaf = get_cpu_cacheinfo_level(cpu, level);

	if (!leaf)
		return -1;

	return leaf->id;
}
#ifdef CONFIG_ARM64
......
......@@ -81,16 +81,6 @@ enum cc_attr {
*/
CC_ATTR_GUEST_SEV_SNP,
/**
* @CC_ATTR_HOTPLUG_DISABLED: Hotplug is not supported or disabled.
*
* The platform/OS is running as a guest/virtual machine does not
* support CPU hotplug feature.
*
* Examples include TDX Guest.
*/
CC_ATTR_HOTPLUG_DISABLED,
/**
* @CC_ATTR_HOST_SEV_SNP: AMD SNP enabled on the host.
*
......
......@@ -18,6 +18,7 @@
#include <linux/compiler.h>
#include <linux/cpumask.h>
#include <linux/cpuhotplug.h>
#include <linux/cpuhplock.h>
#include <linux/cpu_smt.h>
struct device;
......@@ -132,38 +133,6 @@ static inline int add_cpu(unsigned int cpu) { return 0;}
#endif /* CONFIG_SMP */
extern const struct bus_type cpu_subsys;
extern int lockdep_is_cpus_held(void);
#ifdef CONFIG_HOTPLUG_CPU
extern void cpus_write_lock(void);
extern void cpus_write_unlock(void);
extern void cpus_read_lock(void);
extern void cpus_read_unlock(void);
extern int cpus_read_trylock(void);
extern void lockdep_assert_cpus_held(void);
extern void cpu_hotplug_disable(void);
extern void cpu_hotplug_enable(void);
void clear_tasks_mm_cpumask(int cpu);
int remove_cpu(unsigned int cpu);
int cpu_device_down(struct device *dev);
extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu);
#else /* CONFIG_HOTPLUG_CPU */
static inline void cpus_write_lock(void) { }
static inline void cpus_write_unlock(void) { }
static inline void cpus_read_lock(void) { }
static inline void cpus_read_unlock(void) { }
static inline int cpus_read_trylock(void) { return true; }
static inline void lockdep_assert_cpus_held(void) { }
static inline void cpu_hotplug_disable(void) { }
static inline void cpu_hotplug_enable(void) { }
static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
#endif /* !CONFIG_HOTPLUG_CPU */
DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
#ifdef CONFIG_PM_SLEEP_SMP
extern int freeze_secondary_cpus(int primary);
extern void thaw_secondary_cpus(void);
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* include/linux/cpuhplock.h - CPU hotplug locking
*
* Locking functions for CPU hotplug.
*/
#ifndef _LINUX_CPUHPLOCK_H_
#define _LINUX_CPUHPLOCK_H_
#include <linux/cleanup.h>
#include <linux/errno.h>
struct device;
extern int lockdep_is_cpus_held(void);
#ifdef CONFIG_HOTPLUG_CPU
/* CPU hotplug is built in: only declarations here, definitions live elsewhere. */
void cpus_write_lock(void);
void cpus_write_unlock(void);
void cpus_read_lock(void);
void cpus_read_unlock(void);
int cpus_read_trylock(void);
void lockdep_assert_cpus_held(void);
void cpu_hotplug_disable_offlining(void);
void cpu_hotplug_disable(void);
void cpu_hotplug_enable(void);
void clear_tasks_mm_cpumask(int cpu);
int remove_cpu(unsigned int cpu);
int cpu_device_down(struct device *dev);
void smp_shutdown_nonboot_cpus(unsigned int primary_cpu);
#else /* CONFIG_HOTPLUG_CPU */
/*
 * No CPU hotplug: the locking and enable/disable helpers are empty stubs,
 * cpus_read_trylock() always reports success and remove_cpu() always fails
 * with -EPERM.  clear_tasks_mm_cpumask() and cpu_device_down() have no stub
 * in this branch.
 */
static inline void cpus_write_lock(void) { }
static inline void cpus_write_unlock(void) { }
static inline void cpus_read_lock(void) { }
static inline void cpus_read_unlock(void) { }
static inline int cpus_read_trylock(void) { return true; }
static inline void lockdep_assert_cpus_held(void) { }
static inline void cpu_hotplug_disable_offlining(void) { }
static inline void cpu_hotplug_disable(void) { }
static inline void cpu_hotplug_enable(void) { }
static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
#endif /* !CONFIG_HOTPLUG_CPU */
DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
#endif /* _LINUX_CPUHPLOCK_H_ */
......@@ -483,6 +483,8 @@ static int cpu_hotplug_disabled;
DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
static bool cpu_hotplug_offline_disabled __ro_after_init;
void cpus_read_lock(void)
{
percpu_down_read(&cpu_hotplug_lock);
......@@ -542,6 +544,14 @@ static void lockdep_release_cpus_lock(void)
rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}
/*
 * Declare CPU offlining not supported.
 *
 * Sets cpu_hotplug_offline_disabled between cpu_maps_update_begin() and
 * cpu_maps_update_done().  Once set, cpu_down_maps_locked() fails with
 * -EOPNOTSUPP instead of attempting to offline a CPU.
 *
 * NOTE(review): the flag is declared __ro_after_init, so this presumably
 * must only be called during boot - confirm against callers.
 */
void cpu_hotplug_disable_offlining(void)
{
cpu_maps_update_begin();
cpu_hotplug_offline_disabled = true;
cpu_maps_update_done();
}
/*
* Wait for currently running CPU hotplug operations to complete (if any) and
* disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
......@@ -1471,7 +1481,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
* If the platform does not support hotplug, report it explicitly to
* differentiate it from a transient offlining failure.
*/
if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
if (cpu_hotplug_offline_disabled)
return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment