Commit 98896d87 authored by Linus Torvalds

Merge tag 'x86_cc_for_v6.11_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 confidential computing updates from Borislav Petkov:
 "Unrelated x86/cc changes queued here to avoid ugly cross-merges and
  conflicts:

   - Carve out CPU hotplug function declarations into a separate header
     with the goal to be able to use the lockdep assertions in a more
     flexible manner

   - As a result, refactor cacheinfo code after carving out a function
     to return the cache ID associated with a given cache level

   - Cleanups

  Add support to be able to kexec TDX guests:

   - Expand ACPI MADT CPU offlining support

   - Add machinery to prepare CoCo guests memory before kexec-ing into a
     new kernel

   - Cleanup, readjust and massage related code"

* tag 'x86_cc_for_v6.11_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  ACPI: tables: Print MULTIPROC_WAKEUP when MADT is parsed
  x86/acpi: Add support for CPU offlining for ACPI MADT wakeup method
  x86/mm: Introduce kernel_ident_mapping_free()
  x86/smp: Add smp_ops.stop_this_cpu() callback
  x86/acpi: Do not attempt to bring up secondary CPUs in the kexec case
  x86/acpi: Rename fields in the acpi_madt_multiproc_wakeup structure
  x86/mm: Do not zap page table entries mapping unaccepted memory table during kdump
  x86/mm: Make e820__end_ram_pfn() cover E820_TYPE_ACPI ranges
  x86/tdx: Convert shared memory back to private on kexec
  x86/mm: Add callbacks to prepare encrypted memory for kexec
  x86/tdx: Account shared memory
  x86/mm: Return correct level from lookup_address() if pte is none
  x86/mm: Make x86_platform.guest.enc_status_change_*() return an error
  x86/kexec: Keep CR4.MCE set during kexec for TDX guest
  x86/relocate_kernel: Use named labels for less confusion
  cpu/hotplug, x86/acpi: Disable CPU offlining for ACPI MADT wakeup
  cpu/hotplug: Add support for declaring CPU offlining not supported
  x86/apic: Mark acpi_mp_wake_* variables as __ro_after_init
  x86/acpi: Extract ACPI MADT wakeup code into a separate file
  x86/kexec: Remove spurious unconditional JMP from identity_mapped()
  ...
parents 181a984b 16df3594
@@ -1118,6 +1118,13 @@ config X86_LOCAL_APIC
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
select IRQ_DOMAIN_HIERARCHY
config ACPI_MADT_WAKEUP
def_bool y
depends on X86_64
depends on ACPI
depends on SMP
depends on X86_LOCAL_APIC
config X86_IO_APIC
def_bool y
depends on X86_LOCAL_APIC || X86_UP_IOAPIC
...
@@ -29,7 +29,6 @@ static bool noinstr intel_cc_platform_has(enum cc_attr attr)
{
switch (attr) {
case CC_ATTR_GUEST_UNROLL_STRING_IO:
-case CC_ATTR_HOTPLUG_DISABLED:
case CC_ATTR_GUEST_MEM_ENCRYPT:
case CC_ATTR_MEM_ENCRYPT:
return true;
...
@@ -7,6 +7,7 @@
#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -14,6 +15,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
/* MMIO direction */
#define EPT_READ 0
@@ -38,6 +40,8 @@
#define TDREPORT_SUBTYPE_0 0
static atomic_long_t nr_shared;
/* Called from __tdx_hypercall() for unrecoverable failure */
noinstr void __noreturn __tdx_hypercall_failed(void)
{
@@ -798,28 +802,124 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
return true;
}
-static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
+static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
bool enc)
{
/*
* Only handle shared->private conversion here.
* See the comment in tdx_early_init().
*/
-if (enc)
-return tdx_enc_status_changed(vaddr, numpages, enc);
-return true;
+if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+return -EIO;
+return 0;
}
-static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
+static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
bool enc)
{
/*
* Only handle private->shared conversion here.
* See the comment in tdx_early_init().
*/
-if (!enc)
-return tdx_enc_status_changed(vaddr, numpages, enc);
-return true;
+if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+return -EIO;
if (enc)
atomic_long_sub(numpages, &nr_shared);
else
atomic_long_add(numpages, &nr_shared);
return 0;
}
/* Stop new private<->shared conversions */
static void tdx_kexec_begin(void)
{
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
return;
/*
* Crash kernel reaches here with interrupts disabled: can't wait for
* conversions to finish.
*
* If race happened, just report and proceed.
*/
if (!set_memory_enc_stop_conversion())
pr_warn("Failed to stop shared<->private conversions\n");
}
/* Walk direct mapping and convert all shared memory back to private */
static void tdx_kexec_finish(void)
{
unsigned long addr, end;
long found = 0, shared;
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
return;
lockdep_assert_irqs_disabled();
addr = PAGE_OFFSET;
end = PAGE_OFFSET + get_max_mapped();
while (addr < end) {
unsigned long size;
unsigned int level;
pte_t *pte;
pte = lookup_address(addr, &level);
size = page_level_size(level);
if (pte && pte_decrypted(*pte)) {
int pages = size / PAGE_SIZE;
/*
* Touching memory with shared bit set triggers implicit
* conversion to shared.
*
* Make sure nobody touches the shared range from
* now on.
*/
set_pte(pte, __pte(0));
/*
* Memory encryption state persists across kexec.
* If tdx_enc_status_changed() fails in the first
* kernel, it leaves memory in an unknown state.
*
* If that memory remains shared, accessing it in the
* *next* kernel through a private mapping will result
* in an unrecoverable guest shutdown.
*
* The kdump kernel boot is not impacted as it uses
* a pre-reserved memory range that is always private.
* However, gathering crash information could lead to
* a crash if it accesses unconverted memory through
* a private mapping which is possible when accessing
* that memory through /proc/vmcore, for example.
*
* In all cases, print error info in order to leave
* enough bread crumbs for debugging.
*/
if (!tdx_enc_status_changed(addr, pages, true)) {
pr_err("Failed to unshare range %#lx-%#lx\n",
addr, addr + size);
}
found += pages;
}
addr += size;
}
__flush_tlb_all();
shared = atomic_long_read(&nr_shared);
if (shared != found) {
pr_err("shared page accounting is off\n");
pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
}
}
void __init tdx_early_init(void)
@@ -881,6 +981,9 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
...
@@ -523,9 +523,9 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
* transition is complete, hv_vtom_set_host_visibility() marks the pages
* as "present" again.
*/
-static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
+static int hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
{
-return !set_memory_np(kbuffer, pagecount);
+return set_memory_np(kbuffer, pagecount);
}
/*
@@ -536,20 +536,19 @@ static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc
* with host. This function works as wrap of hv_mark_gpa_visibility()
* with memory base and size.
*/
-static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
+static int hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
{
enum hv_mem_host_visibility visibility = enc ?
VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE;
u64 *pfn_array;
phys_addr_t paddr;
+int i, pfn, err;
void *vaddr;
int ret = 0;
-bool result = true;
-int i, pfn;
pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!pfn_array) {
-result = false;
+ret = -ENOMEM;
goto err_set_memory_p;
}
@@ -568,10 +567,8 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo
if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
ret = hv_mark_gpa_visibility(pfn, pfn_array,
visibility);
-if (ret) {
-result = false;
+if (ret)
goto err_free_pfn_array;
-}
pfn = 0;
}
}
@@ -586,10 +583,11 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo
* order to avoid leaving the memory range in a "broken" state. Setting
* the PRESENT bits shouldn't fail, but return an error if it does.
*/
-if (set_memory_p(kbuffer, pagecount))
-result = false;
+err = set_memory_p(kbuffer, pagecount);
+if (err && !ret)
+ret = err;
-return result;
+return ret;
}
static bool hv_vtom_tlb_flush_required(bool private)
...
@@ -78,6 +78,13 @@ static inline bool acpi_skip_set_wakeup_address(void)
#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address
union acpi_subtable_headers;
int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
const unsigned long end);
void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
/*
* Check if the CPU can handle C2 and deeper
*/
...
@@ -6,6 +6,7 @@
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
void (*free_pgt_page)(void *, void *); /* free buf for page table */
void *context; /* context for alloc_pgt_page */
unsigned long page_flag; /* page flag for PMD or PUD entry */
unsigned long offset; /* ident mapping offset */
@@ -16,4 +17,6 @@ struct x86_mapping_info {
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long pstart, unsigned long pend);
void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd);
#endif /* _ASM_X86_INIT_H */
@@ -140,6 +140,11 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
static inline bool pte_decrypted(pte_t pte)
{
return cc_mkdec(pte_val(pte)) == pte_val(pte);
}
#define pmd_dirty pmd_dirty
static inline bool pmd_dirty(pmd_t pmd)
{
...
@@ -549,6 +549,7 @@ enum pg_level {
PG_LEVEL_2M,
PG_LEVEL_1G,
PG_LEVEL_512G,
PG_LEVEL_256T,
PG_LEVEL_NUM
};
...
@@ -49,8 +49,11 @@ int set_memory_wb(unsigned long addr, int numpages);
int set_memory_np(unsigned long addr, int numpages);
int set_memory_p(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
bool set_memory_enc_stop_conversion(void);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
int set_memory_np_noalias(unsigned long addr, int numpages);
int set_memory_nonglobal(unsigned long addr, int numpages);
int set_memory_global(unsigned long addr, int numpages);
...
@@ -35,6 +35,7 @@ struct smp_ops {
int (*cpu_disable)(void);
void (*cpu_die)(unsigned int cpu);
void (*play_dead)(void);
void (*stop_this_cpu)(void);
void (*send_call_func_ipi)(const struct cpumask *mask);
void (*send_call_func_single_ipi)(int cpu);
...
@@ -149,12 +149,22 @@ struct x86_init_acpi {
* @enc_status_change_finish Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
* @enc_kexec_begin Begin the two-step process of converting shared memory back
* to private. It stops the new conversions from being started
* and waits in-flight conversions to finish, if possible.
* @enc_kexec_finish Finish the two-step process of converting shared memory to
* private. All memory is private after the call when
* the function returns.
* It is called on only one CPU while the others are shut down
* and with interrupts disabled.
*/
struct x86_guest {
-bool (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
-bool (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
+int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
+int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
void (*enc_kexec_begin)(void);
void (*enc_kexec_finish)(void);
};
/**
...
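The kernel-doc above defines the contract for the two new kexec hooks. As an illustration only (the function names below are made up; the real wiring is done later in this diff by tdx_early_init()), a CoCo guest platform would register them roughly like this:

#include <linux/init.h>
#include <asm/x86_init.h>

/* Hypothetical example, not part of this patch. */
static void example_guest_kexec_begin(void)
{
	/* Stop new private<->shared conversions; wait for in-flight ones. */
}

static void example_guest_kexec_finish(void)
{
	/* Convert all remaining shared memory back to private. */
}

void __init example_guest_early_init(void)
{
	x86_platform.guest.enc_kexec_begin  = example_guest_kexec_begin;
	x86_platform.guest.enc_kexec_finish = example_guest_kexec_finish;
}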
@@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
...
@@ -67,13 +67,6 @@ static bool has_lapic_cpus __initdata;
static bool acpi_support_online_capable;
#endif
-#ifdef CONFIG_X86_64
-/* Physical address of the Multiprocessor Wakeup Structure mailbox */
-static u64 acpi_mp_wake_mailbox_paddr;
-/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
-static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
-#endif
#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
@@ -341,60 +334,6 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
return 0;
}
#ifdef CONFIG_X86_64
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
/*
* Remap mailbox memory only for the first call to acpi_wakeup_cpu().
*
* Wakeup of secondary CPUs is fully serialized in the core code.
* No need to protect acpi_mp_wake_mailbox from concurrent accesses.
*/
if (!acpi_mp_wake_mailbox) {
acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
sizeof(*acpi_mp_wake_mailbox),
MEMREMAP_WB);
}
/*
* Mailbox memory is shared between the firmware and OS. Firmware will
* listen on mailbox command address, and once it receives the wakeup
* command, the CPU associated with the given apicid will be booted.
*
* The value of 'apic_id' and 'wakeup_vector' must be visible to the
* firmware before the wakeup command is visible. smp_store_release()
* ensures ordering and visibility.
*/
acpi_mp_wake_mailbox->apic_id = apicid;
acpi_mp_wake_mailbox->wakeup_vector = start_ip;
smp_store_release(&acpi_mp_wake_mailbox->command,
ACPI_MP_WAKE_COMMAND_WAKEUP);
/*
* Wait for the CPU to wake up.
*
* The CPU being woken up is essentially in a spin loop waiting to be
* woken up. It should not take long for it wake up and acknowledge by
* zeroing out ->command.
*
* ACPI specification doesn't provide any guidance on how long kernel
* has to wait for a wake up acknowledgement. It also doesn't provide
* a way to cancel a wake up request if it takes too long.
*
* In TDX environment, the VMM has control over how long it takes to
* wake up secondary. It can postpone scheduling secondary vCPU
* indefinitely. Giving up on wake up request and reporting error opens
* possible attack vector for VMM: it can wake up a secondary CPU when
* kernel doesn't expect it. Wait until positive result of the wake up
* request.
*/
while (READ_ONCE(acpi_mp_wake_mailbox->command))
cpu_relax();
return 0;
}
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1124,29 +1063,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
}
return 0;
}
#ifdef CONFIG_X86_64
static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
const unsigned long end)
{
struct acpi_madt_multiproc_wakeup *mp_wake;
if (!IS_ENABLED(CONFIG_SMP))
return -ENODEV;
mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
if (BAD_MADT_ENTRY(mp_wake, end))
return -EINVAL;
acpi_table_print_madt_entry(&header->common);
acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
return 0;
}
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1343,7 +1259,7 @@ static void __init acpi_process_madt(void)
smp_found_config = 1;
}
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_ACPI_MADT_WAKEUP
/*
* Parse MADT MP Wake entry.
*/
...
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/nospec-branch.h>
#include <asm/page_types.h>
#include <asm/processor-flags.h>
.text
.align PAGE_SIZE
/*
* asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
*
* rdi: Address of the ACPI MADT MPWK ResetVector
* rsi: PGD of the identity mapping
*/
SYM_FUNC_START(asm_acpi_mp_play_dead)
/* Turn off global entries. Following CR3 write will flush them. */
movq %cr4, %rdx
andq $~(X86_CR4_PGE), %rdx
movq %rdx, %cr4
/* Switch to identity mapping */
movq %rsi, %cr3
/* Jump to reset vector */
ANNOTATE_RETPOLINE_SAFE
jmp *%rdi
SYM_FUNC_END(asm_acpi_mp_play_dead)
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
#include <linux/pgtable.h>
#include <linux/sched/hotplug.h>
#include <asm/apic.h>
#include <asm/barrier.h>
#include <asm/init.h>
#include <asm/intel_pt.h>
#include <asm/nmi.h>
#include <asm/processor.h>
#include <asm/reboot.h>
/* Physical address of the Multiprocessor Wakeup Structure mailbox */
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;
static u64 acpi_mp_pgd __ro_after_init;
static u64 acpi_mp_reset_vector_paddr __ro_after_init;
static void acpi_mp_stop_this_cpu(void)
{
asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}
static void acpi_mp_play_dead(void)
{
play_dead_common();
asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}
static void acpi_mp_cpu_die(unsigned int cpu)
{
u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
unsigned long timeout;
/*
* Use TEST mailbox command to prove that BIOS got control over
* the CPU before declaring it dead.
*
* BIOS has to clear 'command' field of the mailbox.
*/
acpi_mp_wake_mailbox->apic_id = apicid;
smp_store_release(&acpi_mp_wake_mailbox->command,
ACPI_MP_WAKE_COMMAND_TEST);
/* Don't wait longer than a second. */
timeout = USEC_PER_SEC;
while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
udelay(1);
if (!timeout)
pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
}
/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
static void __init *alloc_pgt_page(void *dummy)
{
return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
}
static void __init free_pgt_page(void *pgt, void *dummy)
{
return memblock_free(pgt, PAGE_SIZE);
}
/*
* Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
* the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
* to the identity mapping and the function has be present at the same spot in
* the virtual address space before and after switching page tables.
*/
static int __init init_transition_pgtable(pgd_t *pgd)
{
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
unsigned long vaddr, paddr;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
vaddr = (unsigned long)asm_acpi_mp_play_dead;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
p4d = (p4d_t *)alloc_pgt_page(NULL);
if (!p4d)
return -ENOMEM;
set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
}
p4d = p4d_offset(pgd, vaddr);
if (!p4d_present(*p4d)) {
pud = (pud_t *)alloc_pgt_page(NULL);
if (!pud)
return -ENOMEM;
set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
pud = pud_offset(p4d, vaddr);
if (!pud_present(*pud)) {
pmd = (pmd_t *)alloc_pgt_page(NULL);
if (!pmd)
return -ENOMEM;
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
pmd = pmd_offset(pud, vaddr);
if (!pmd_present(*pmd)) {
pte = (pte_t *)alloc_pgt_page(NULL);
if (!pte)
return -ENOMEM;
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
pte = pte_offset_kernel(pmd, vaddr);
paddr = __pa(vaddr);
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
return 0;
}
static int __init acpi_mp_setup_reset(u64 reset_vector)
{
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
.free_pgt_page = free_pgt_page,
.page_flag = __PAGE_KERNEL_LARGE_EXEC,
.kernpg_flag = _KERNPG_TABLE_NOENC,
};
pgd_t *pgd;
pgd = alloc_pgt_page(NULL);
if (!pgd)
return -ENOMEM;
for (int i = 0; i < nr_pfn_mapped; i++) {
unsigned long mstart, mend;
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
}
if (kernel_ident_mapping_init(&info, pgd,
PAGE_ALIGN_DOWN(reset_vector),
PAGE_ALIGN(reset_vector + 1))) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
if (init_transition_pgtable(pgd)) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
smp_ops.play_dead = acpi_mp_play_dead;
smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
smp_ops.cpu_die = acpi_mp_cpu_die;
acpi_mp_reset_vector_paddr = reset_vector;
acpi_mp_pgd = __pa(pgd);
return 0;
}
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
if (!acpi_mp_wake_mailbox_paddr) {
pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
return -EOPNOTSUPP;
}
/*
* Remap mailbox memory only for the first call to acpi_wakeup_cpu().
*
* Wakeup of secondary CPUs is fully serialized in the core code.
* No need to protect acpi_mp_wake_mailbox from concurrent accesses.
*/
if (!acpi_mp_wake_mailbox) {
acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
sizeof(*acpi_mp_wake_mailbox),
MEMREMAP_WB);
}
/*
* Mailbox memory is shared between the firmware and OS. Firmware will
* listen on mailbox command address, and once it receives the wakeup
* command, the CPU associated with the given apicid will be booted.
*
* The value of 'apic_id' and 'wakeup_vector' must be visible to the
* firmware before the wakeup command is visible. smp_store_release()
* ensures ordering and visibility.
*/
acpi_mp_wake_mailbox->apic_id = apicid;
acpi_mp_wake_mailbox->wakeup_vector = start_ip;
smp_store_release(&acpi_mp_wake_mailbox->command,
ACPI_MP_WAKE_COMMAND_WAKEUP);
/*
* Wait for the CPU to wake up.
*
* The CPU being woken up is essentially in a spin loop waiting to be
* woken up. It should not take long for it wake up and acknowledge by
* zeroing out ->command.
*
* ACPI specification doesn't provide any guidance on how long kernel
* has to wait for a wake up acknowledgment. It also doesn't provide
* a way to cancel a wake up request if it takes too long.
*
* In TDX environment, the VMM has control over how long it takes to
* wake up secondary. It can postpone scheduling secondary vCPU
* indefinitely. Giving up on wake up request and reporting error opens
* possible attack vector for VMM: it can wake up a secondary CPU when
* kernel doesn't expect it. Wait until positive result of the wake up
* request.
*/
while (READ_ONCE(acpi_mp_wake_mailbox->command))
cpu_relax();
return 0;
}
static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
{
cpu_hotplug_disable_offlining();
/*
* ACPI MADT doesn't allow to offline a CPU after it was onlined. This
* limits kexec: the second kernel won't be able to use more than one CPU.
*
* To prevent a kexec kernel from onlining secondary CPUs invalidate the
* mailbox address in the ACPI MADT wakeup structure which prevents a
* kexec kernel to use it.
*
* This is safe as the booting kernel has the mailbox address cached
* already and acpi_wakeup_cpu() uses the cached value to bring up the
* secondary CPUs.
*
* Note: This is a Linux specific convention and not covered by the
* ACPI specification.
*/
mp_wake->mailbox_address = 0;
}
int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
const unsigned long end)
{
struct acpi_madt_multiproc_wakeup *mp_wake;
mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
/*
* Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
* entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
* than the actual size of the MP wakeup entry in ACPI table because the
* 'reset_vector' is only available in the V1 MP wakeup structure.
*/
if (!mp_wake)
return -EINVAL;
if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
return -EINVAL;
if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
return -EINVAL;
acpi_table_print_madt_entry(&header->common);
acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
pr_warn("Failed to setup MADT reset vector\n");
acpi_mp_disable_offlining(mp_wake);
}
} else {
/*
* CPU offlining requires version 1 of the ACPI MADT wakeup
* structure.
*/
acpi_mp_disable_offlining(mp_wake);
}
apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
return 0;
}
@@ -292,9 +292,8 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
*/
static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
{
-struct cpu_cacheinfo *ci;
+struct cacheinfo *ci;
int ret;
-int i;
/* Pick the first cpu we find that is associated with the cache. */
plr->cpu = cpumask_first(&plr->d->cpu_mask);
@@ -306,16 +305,12 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
goto out_region;
}
-ci = get_cpu_cacheinfo(plr->cpu);
+ci = get_cpu_cacheinfo_level(plr->cpu, plr->s->res->cache_level);
+if (ci) {
+plr->line_size = ci->coherency_line_size;
plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
-for (i = 0; i < ci->num_leaves; i++) {
-if (ci->info_list[i].level == plr->s->res->cache_level) {
-plr->line_size = ci->info_list[i].coherency_line_size;
return 0;
}
-}
ret = -1;
rdt_last_cmd_puts("Unable to determine cache line size\n");
...
@@ -1450,18 +1450,14 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
struct rdt_domain *d, unsigned long cbm)
{
-struct cpu_cacheinfo *ci;
unsigned int size = 0;
-int num_b, i;
+struct cacheinfo *ci;
+int num_b;
num_b = bitmap_weight(&cbm, r->cache.cbm_len);
-ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
-for (i = 0; i < ci->num_leaves; i++) {
-if (ci->info_list[i].level == r->cache_level) {
-size = ci->info_list[i].size / r->cache.cbm_len * num_b;
-break;
-}
-}
+ci = get_cpu_cacheinfo_level(cpumask_any(&d->cpu_mask), r->cache_level);
+if (ci)
+size = ci->size / r->cache.cbm_len * num_b;
return size;
}
...
@@ -128,6 +128,18 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
/*
* Non-crash kexec calls enc_kexec_begin() while scheduling is still
* active. This allows the callback to wait until all in-flight
* shared<->private conversions are complete. In a crash scenario,
* enc_kexec_begin() gets called after all but one CPU have been shut
* down and interrupts have been disabled. This allows the callback to
* detect a race with the conversion and report it.
*/
x86_platform.guest.enc_kexec_begin();
x86_platform.guest.enc_kexec_finish();
crash_save_cpu(regs, safe_smp_processor_id());
}
...
@@ -828,7 +828,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
+static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn)
{
int i;
unsigned long last_pfn = 0;
@@ -839,7 +839,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
unsigned long start_pfn;
unsigned long end_pfn;
-if (entry->type != type)
+if (entry->type != E820_TYPE_RAM &&
+entry->type != E820_TYPE_ACPI)
continue;
start_pfn = entry->addr >> PAGE_SHIFT;
@@ -865,12 +866,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
unsigned long __init e820__end_of_ram_pfn(void)
{
-return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
+return e820__end_ram_pfn(MAX_ARCH_PFN);
}
unsigned long __init e820__end_of_low_ram_pfn(void)
{
-return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
+return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT));
}
static void __init early_panic(char *msg)
...
@@ -835,6 +835,13 @@ void __noreturn stop_this_cpu(void *dummy)
*/
cpumask_clear_cpu(cpu, &cpus_stop_mask);
#ifdef CONFIG_SMP
if (smp_ops.stop_this_cpu) {
smp_ops.stop_this_cpu();
unreachable();
}
#endif
for (;;) {
/*
* Use native_halt() so that memory contents don't change
...
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
/*
* Call enc_kexec_begin() while all CPUs are still active and
* interrupts are enabled. This will allow all in-flight memory
* conversions to finish cleanly.
*/
if (kexec_in_progress)
x86_platform.guest.enc_kexec_begin();
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*
@@ -752,6 +761,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
if (kexec_in_progress)
x86_platform.guest.enc_kexec_finish();
}
static void __machine_emergency_restart(int emergency)
@@ -868,6 +880,12 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
if (smp_ops.stop_this_cpu) {
smp_ops.stop_this_cpu();
unreachable();
}
/* Assume hlt works */
halt();
for (;;)
...
@@ -5,6 +5,8 @@
*/
#include <linux/linkage.h>
#include <linux/stringify.h>
#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
@@ -145,16 +147,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* Set cr4 to a known state:
* - physical address extension enabled
* - 5-level paging, if it was enabled before
+* - Machine check exception on TDX guest, if it was enabled before.
+*   Clearing MCE might not be allowed in TDX guests, depending on setup.
+*
+* Use R13 that contains the original CR4 value, read in relocate_kernel().
+* PAE is always set in the original CR4.
*/
-movl $X86_CR4_PAE, %eax
-testq $X86_CR4_LA57, %r13
-jz 1f
-orl $X86_CR4_LA57, %eax
-1:
-movq %rax, %cr4
-jmp 1f
-1:
+andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
+ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
+movq %r13, %cr4
/* Flush the TLB (needed?) */
movq %r9, %cr3
@@ -165,9 +166,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* used by kexec. Flush the caches before copying the kernel.
*/
testq %r12, %r12
-jz 1f
+jz .Lsme_off
wbinvd
-1:
+.Lsme_off:
movq %rcx, %r11
call swap_pages
@@ -187,7 +188,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
*/
testq %r11, %r11
-jnz 1f
+jnz .Lrelocate
xorl %eax, %eax
xorl %ebx, %ebx
xorl %ecx, %ecx
@@ -208,7 +209,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
ret
int3
-1:
+.Lrelocate:
popq %rdx
leaq PAGE_SIZE(%r10), %rsp
ANNOTATE_RETPOLINE_SAFE
...
@@ -134,10 +134,12 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
-static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; }
-static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; }
+static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
+static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_begin_noop(void) {}
+static void enc_kexec_finish_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }
struct x86_platform_ops x86_platform __ro_after_init = {
@@ -161,6 +163,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.enc_status_change_finish = enc_status_change_finish_noop,
.enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
+.enc_kexec_begin = enc_kexec_begin_noop,
+.enc_kexec_finish = enc_kexec_finish_noop,
},
};
...
@@ -4,6 +4,79 @@
* included by both the compressed kernel and the regular kernel.
*/
static void free_pte(struct x86_mapping_info *info, pmd_t *pmd)
{
pte_t *pte = pte_offset_kernel(pmd, 0);
info->free_pgt_page(pte, info->context);
}
static void free_pmd(struct x86_mapping_info *info, pud_t *pud)
{
pmd_t *pmd = pmd_offset(pud, 0);
int i;
for (i = 0; i < PTRS_PER_PMD; i++) {
if (!pmd_present(pmd[i]))
continue;
if (pmd_leaf(pmd[i]))
continue;
free_pte(info, &pmd[i]);
}
info->free_pgt_page(pmd, info->context);
}
static void free_pud(struct x86_mapping_info *info, p4d_t *p4d)
{
pud_t *pud = pud_offset(p4d, 0);
int i;
for (i = 0; i < PTRS_PER_PUD; i++) {
if (!pud_present(pud[i]))
continue;
if (pud_leaf(pud[i]))
continue;
free_pmd(info, &pud[i]);
}
info->free_pgt_page(pud, info->context);
}
static void free_p4d(struct x86_mapping_info *info, pgd_t *pgd)
{
p4d_t *p4d = p4d_offset(pgd, 0);
int i;
for (i = 0; i < PTRS_PER_P4D; i++) {
if (!p4d_present(p4d[i]))
continue;
free_pud(info, &p4d[i]);
}
if (pgtable_l5_enabled())
info->free_pgt_page(p4d, info->context);
}
void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd)
{
int i;
for (i = 0; i < PTRS_PER_PGD; i++) {
if (!pgd_present(pgd[i]))
continue;
free_p4d(info, &pgd[i]);
}
info->free_pgt_page(pgd, info->context);
}
static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
unsigned long addr, unsigned long end)
{
...
@@ -469,7 +469,9 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
-E820_TYPE_RESERVED_KERN))
+E820_TYPE_RESERVED_KERN) &&
+!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
+E820_TYPE_ACPI))
set_pte_init(pte, __pte(0), init);
continue;
}
@@ -524,7 +526,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
-E820_TYPE_RESERVED_KERN))
+E820_TYPE_RESERVED_KERN) &&
+!e820__mapped_any(paddr & PMD_MASK, paddr_next,
+E820_TYPE_ACPI))
set_pmd_init(pmd, __pmd(0), init);
continue;
}
@@ -611,7 +615,9 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
-E820_TYPE_RESERVED_KERN))
+E820_TYPE_RESERVED_KERN) &&
+!e820__mapped_any(paddr & PUD_MASK, paddr_next,
+E820_TYPE_ACPI))
set_pud_init(pud, __pud(0), init);
continue;
}
@@ -698,7 +704,9 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
-E820_TYPE_RESERVED_KERN))
+E820_TYPE_RESERVED_KERN) &&
+!e820__mapped_any(paddr & P4D_MASK, paddr_next,
+E820_TYPE_ACPI))
set_p4d_init(p4d, __p4d(0), init);
continue;
}
...
@@ -283,7 +283,7 @@ static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc)
#endif
}
-static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
+static int amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
{
/*
* To maintain the security guarantees of SEV-SNP guests, make sure
@@ -292,11 +292,11 @@ static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool
if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc)
snp_set_memory_shared(vaddr, npages);
-return true;
+return 0;
}
/* Return true unconditionally: return value doesn't matter for the SEV side */
-static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
+static int amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
{
/*
* After memory is mapped encrypted in the page table, validate it
@@ -308,7 +308,7 @@ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool e
if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc);
-return true;
+return 0;
}
static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
...
@@ -662,8 +662,9 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
/*
* Lookup the page table entry for a virtual address in a specific pgd.
-* Return a pointer to the entry, the level of the mapping, and the effective
-* NX and RW bits of all page table levels.
+* Return a pointer to the entry (or NULL if the entry does not exist),
+* the level of the entry, and the effective NX and RW bits of all
+* page table levels.
*/
pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
unsigned int *level, bool *nx, bool *rw)
@@ -672,13 +673,14 @@ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
pud_t *pud;
pmd_t *pmd;
-*level = PG_LEVEL_NONE;
+*level = PG_LEVEL_256T;
*nx = false;
*rw = true;
if (pgd_none(*pgd))
return NULL;
+*level = PG_LEVEL_512G;
*nx |= pgd_flags(*pgd) & _PAGE_NX;
*rw &= pgd_flags(*pgd) & _PAGE_RW;
@@ -686,10 +688,10 @@ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
if (p4d_none(*p4d))
return NULL;
-*level = PG_LEVEL_512G;
if (p4d_leaf(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
+*level = PG_LEVEL_1G;
*nx |= p4d_flags(*p4d) & _PAGE_NX;
*rw &= p4d_flags(*p4d) & _PAGE_RW;
@@ -697,10 +699,10 @@ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
if (pud_none(*pud))
return NULL;
-*level = PG_LEVEL_1G;
if (pud_leaf(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+*level = PG_LEVEL_2M;
*nx |= pud_flags(*pud) & _PAGE_NX;
*rw &= pud_flags(*pud) & _PAGE_RW;
@@ -708,15 +710,13 @@ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
if (pmd_none(*pmd))
return NULL;
-*level = PG_LEVEL_2M;
if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
+*level = PG_LEVEL_4K;
*nx |= pmd_flags(*pmd) & _PAGE_NX;
*rw &= pmd_flags(*pmd) & _PAGE_RW;
-*level = PG_LEVEL_4K;
return pte_offset_kernel(pmd, address);
}
@@ -736,9 +736,8 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
-* Note: We return pud and pmd either when the entry is marked large
-* or when the present bit is not set. Otherwise we would return a
-* pointer to a nonexisting mapping.
+* Note: the function returns p4d, pud or pmd either when the entry is marked
+* large or when the present bit is not set. Otherwise it returns NULL.
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
@@ -2196,7 +2195,8 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
/* Notify hypervisor that we are about to set/clr encryption attribute. */
-if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
+ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
+if (ret)
goto vmm_fail;
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -2214,24 +2214,61 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
return ret;
/* Notify hypervisor that we have successfully set/clr encryption attribute. */
-if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
+ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc);
+if (ret)
goto vmm_fail;
return 0;
vmm_fail:
-WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
-(void *)addr, numpages, enc ? "private" : "shared");
-return -EIO;
+WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n",
+(void *)addr, numpages, enc ? "private" : "shared", ret);
+return ret;
}
/*
* The lock serializes conversions between private and shared memory.
*
* It is taken for read on conversion. A write lock guarantees that no
* concurrent conversions are in progress.
*/
static DECLARE_RWSEM(mem_enc_lock);
/*
* Stop new private<->shared conversions.
*
* Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
* The lock is not released to prevent new conversions from being started.
*/
bool set_memory_enc_stop_conversion(void)
{
/*
* In a crash scenario, sleep is not allowed. Try to take the lock.
* Failure indicates that there is a race with the conversion.
*/
if (oops_in_progress)
return down_write_trylock(&mem_enc_lock);
down_write(&mem_enc_lock);
return true;
}
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
-if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
-return __set_memory_enc_pgtable(addr, numpages, enc);
-return 0;
+int ret = 0;
+if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+if (!down_read_trylock(&mem_enc_lock))
+return -EBUSY;
+ret = __set_memory_enc_pgtable(addr, numpages, enc);
+up_read(&mem_enc_lock);
+}
+return ret;
}
int set_memory_encrypted(unsigned long addr, int numpages)
...
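For context, a caller-side sketch (illustrative only, not part of this patch; the helper name is made up): with conversions now serialized by mem_enc_lock, users of the set_memory_*crypted() helpers declared earlier in this series can see -EBUSY once kexec has stopped conversions, in addition to the existing VMM failure paths, and must handle the error:

#include <linux/set_memory.h>

static int example_share_buffer(unsigned long addr, int numpages)
{
	int ret;

	/* May fail with e.g. -EBUSY after enc_kexec_begin(), or an error from the VMM. */
	ret = set_memory_decrypted(addr, numpages);
	if (ret)
		return ret;

	return 0;
}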
@@ -198,6 +198,20 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header)
}
break;
case ACPI_MADT_TYPE_MULTIPROC_WAKEUP:
{
struct acpi_madt_multiproc_wakeup *p =
(struct acpi_madt_multiproc_wakeup *)header;
u64 reset_vector = 0;
if (p->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1)
reset_vector = p->reset_vector;
pr_debug("MP Wakeup (version[%d], mailbox[%#llx], reset[%#llx])\n",
p->version, p->mailbox_address, reset_vector);
}
break;
case ACPI_MADT_TYPE_CORE_PIC:
{
struct acpi_madt_core_pic *p = (struct acpi_madt_core_pic *)header;
...
@@ -1194,11 +1194,23 @@ struct acpi_madt_generic_translator {
struct acpi_madt_multiproc_wakeup {
struct acpi_subtable_header header;
-u16 mailbox_version;
+u16 version;
u32 reserved; /* reserved - must be zero */
-u64 base_address;
+u64 mailbox_address;
+u64 reset_vector;
};
/* Values for Version field above */
enum acpi_madt_multiproc_wakeup_version {
ACPI_MADT_MP_WAKEUP_VERSION_NONE = 0,
ACPI_MADT_MP_WAKEUP_VERSION_V1 = 1,
ACPI_MADT_MP_WAKEUP_VERSION_RESERVED = 2, /* 2 and greater are reserved */
};
#define ACPI_MADT_MP_WAKEUP_SIZE_V0 16
#define ACPI_MADT_MP_WAKEUP_SIZE_V1 24
#define ACPI_MULTIPROC_WAKEUP_MB_OS_SIZE 2032
#define ACPI_MULTIPROC_WAKEUP_MB_FIRMWARE_SIZE 2048
@@ -1212,6 +1224,7 @@ struct acpi_madt_multiproc_wakeup_mailbox {
};
#define ACPI_MP_WAKE_COMMAND_WAKEUP 1
#define ACPI_MP_WAKE_COMMAND_TEST 2
/* 17: CPU Core Interrupt Controller (ACPI 6.5) */ /* 17: CPU Core Interrupt Controller (ACPI 6.5) */
......
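After this change the MP Wakeup entry has two layouts: a 16-byte V0 entry carrying only the mailbox address, and a 24-byte V1 entry that appends a reset vector, which is exactly what the MADT printout above checks the version for before touching p->reset_vector. The standalone sketch below parses a raw entry with that rule; the struct, helper and sample bytes are made up for illustration and are not the ACPICA definitions.

/* Standalone sketch: distinguishing V0 and V1 MP Wakeup entries. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MP_WAKEUP_VERSION_V1	1
#define MP_WAKEUP_SIZE_V0	16	/* header + version + reserved + mailbox */
#define MP_WAKEUP_SIZE_V1	24	/* V0 layout plus an 8-byte reset vector */

struct mp_wakeup_entry {
	uint8_t  type;			/* subtable type (16 for MP Wakeup) */
	uint8_t  length;		/* 16 (V0) or 24 (V1) */
	uint16_t version;
	uint32_t reserved;
	uint64_t mailbox_address;
	uint64_t reset_vector;		/* only meaningful for V1 entries */
} __attribute__((packed));

static void parse_mp_wakeup(const uint8_t *raw, size_t len)
{
	struct mp_wakeup_entry e = { 0 };
	uint64_t reset_vector = 0;

	memcpy(&e, raw, len < sizeof(e) ? len : sizeof(e));

	/* Same rule as the kernel printout: only V1+ carries a reset vector. */
	if (e.version >= MP_WAKEUP_VERSION_V1 && e.length >= MP_WAKEUP_SIZE_V1)
		reset_vector = e.reset_vector;

	printf("MP Wakeup: version=%u mailbox=%#llx reset=%#llx\n",
	       e.version, (unsigned long long)e.mailbox_address,
	       (unsigned long long)reset_vector);
}

int main(void)
{
	/* Hypothetical V1 entry: length 24, version 1, mailbox 0x7f000000, reset 0x80000. */
	uint8_t v1[MP_WAKEUP_SIZE_V1] = {
		0x10, 0x18, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
	};

	parse_mp_wakeup(v1, sizeof(v1));
	return 0;
}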
@@ -3,6 +3,7 @@
 #define _LINUX_CACHEINFO_H

 #include <linux/bitops.h>
+#include <linux/cpuhplock.h>
 #include <linux/cpumask.h>
 #include <linux/smp.h>

@@ -113,23 +114,37 @@ int acpi_get_cache_info(unsigned int cpu,
 const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf);

 /*
- * Get the id of the cache associated with @cpu at level @level.
+ * Get the cacheinfo structure for the cache associated with @cpu at
+ * level @level.
  * cpuhp lock must be held.
  */
-static inline int get_cpu_cacheinfo_id(int cpu, int level)
+static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
 {
 	struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
 	int i;

+	lockdep_assert_cpus_held();
+
 	for (i = 0; i < ci->num_leaves; i++) {
 		if (ci->info_list[i].level == level) {
 			if (ci->info_list[i].attributes & CACHE_ID)
-				return ci->info_list[i].id;
-			return -1;
+				return &ci->info_list[i];
+			return NULL;
 		}
 	}

-	return -1;
+	return NULL;
+}
+
+/*
+ * Get the id of the cache associated with @cpu at level @level.
+ * cpuhp lock must be held.
+ */
+static inline int get_cpu_cacheinfo_id(int cpu, int level)
+{
+	struct cacheinfo *ci = get_cpu_cacheinfo_level(cpu, level);
+
+	return ci ? ci->id : -1;
 }

 #ifdef CONFIG_ARM64
...
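The refactor splits "find the cache leaf for this level" out of "return its id": callers that need the whole struct cacheinfo can now reuse the lookup, the id accessor shrinks to a one-liner, and the cpuhp-lock requirement is enforced by a single lockdep assertion. A standalone model of that shape is sketched below; the types and sample data are made up and only mirror the structure of the header change, not the kernel's cacheinfo code.

/* Standalone model of the "return the leaf, then derive the id" refactor. */
#include <stdbool.h>
#include <stdio.h>

struct cacheinfo {
	int level;
	int id;
	bool has_id;
};

static struct cacheinfo leaves[] = {
	{ .level = 1, .id = 0, .has_id = true },
	{ .level = 2, .id = 0, .has_id = true },
	{ .level = 3, .id = 7, .has_id = true },
};

/* Analogue of get_cpu_cacheinfo_level(): hand back the whole leaf, or NULL. */
static struct cacheinfo *get_cacheinfo_level(int level)
{
	for (size_t i = 0; i < sizeof(leaves) / sizeof(leaves[0]); i++) {
		if (leaves[i].level == level)
			return leaves[i].has_id ? &leaves[i] : NULL;
	}
	return NULL;
}

/* Analogue of get_cpu_cacheinfo_id(): now a thin wrapper over the helper. */
static int get_cacheinfo_id(int level)
{
	struct cacheinfo *ci = get_cacheinfo_level(level);

	return ci ? ci->id : -1;
}

int main(void)
{
	printf("L3 id: %d\n", get_cacheinfo_id(3));	/* 7 */
	printf("L4 id: %d\n", get_cacheinfo_id(4));	/* -1, no such level */
	return 0;
}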
@@ -81,16 +81,6 @@ enum cc_attr {
 	 */
 	CC_ATTR_GUEST_SEV_SNP,

-	/**
-	 * @CC_ATTR_HOTPLUG_DISABLED: Hotplug is not supported or disabled.
-	 *
-	 * The platform/OS is running as a guest/virtual machine does not
-	 * support CPU hotplug feature.
-	 *
-	 * Examples include TDX Guest.
-	 */
-	CC_ATTR_HOTPLUG_DISABLED,
-
 	/**
 	 * @CC_ATTR_HOST_SEV_SNP: AMD SNP enabled on the host.
 	 *
...
@@ -18,6 +18,7 @@
 #include <linux/compiler.h>
 #include <linux/cpumask.h>
 #include <linux/cpuhotplug.h>
+#include <linux/cpuhplock.h>
 #include <linux/cpu_smt.h>

 struct device;

@@ -132,38 +133,6 @@ static inline int add_cpu(unsigned int cpu) { return 0;}
 #endif /* CONFIG_SMP */

 extern const struct bus_type cpu_subsys;

-extern int lockdep_is_cpus_held(void);
-
-#ifdef CONFIG_HOTPLUG_CPU
-extern void cpus_write_lock(void);
-extern void cpus_write_unlock(void);
-extern void cpus_read_lock(void);
-extern void cpus_read_unlock(void);
-extern int cpus_read_trylock(void);
-extern void lockdep_assert_cpus_held(void);
-extern void cpu_hotplug_disable(void);
-extern void cpu_hotplug_enable(void);
-void clear_tasks_mm_cpumask(int cpu);
-int remove_cpu(unsigned int cpu);
-int cpu_device_down(struct device *dev);
-extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu);
-
-#else /* CONFIG_HOTPLUG_CPU */
-
-static inline void cpus_write_lock(void) { }
-static inline void cpus_write_unlock(void) { }
-static inline void cpus_read_lock(void) { }
-static inline void cpus_read_unlock(void) { }
-static inline int cpus_read_trylock(void) { return true; }
-static inline void lockdep_assert_cpus_held(void) { }
-static inline void cpu_hotplug_disable(void) { }
-static inline void cpu_hotplug_enable(void) { }
-static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
-static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
-#endif /* !CONFIG_HOTPLUG_CPU */
-
-DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
-
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
 extern void thaw_secondary_cpus(void);
...
/* SPDX-License-Identifier: GPL-2.0 */
/*
* include/linux/cpuhplock.h - CPU hotplug locking
*
* Locking functions for CPU hotplug.
*/
#ifndef _LINUX_CPUHPLOCK_H_
#define _LINUX_CPUHPLOCK_H_
#include <linux/cleanup.h>
#include <linux/errno.h>
struct device;
extern int lockdep_is_cpus_held(void);
#ifdef CONFIG_HOTPLUG_CPU
void cpus_write_lock(void);
void cpus_write_unlock(void);
void cpus_read_lock(void);
void cpus_read_unlock(void);
int cpus_read_trylock(void);
void lockdep_assert_cpus_held(void);
void cpu_hotplug_disable_offlining(void);
void cpu_hotplug_disable(void);
void cpu_hotplug_enable(void);
void clear_tasks_mm_cpumask(int cpu);
int remove_cpu(unsigned int cpu);
int cpu_device_down(struct device *dev);
void smp_shutdown_nonboot_cpus(unsigned int primary_cpu);
#else /* CONFIG_HOTPLUG_CPU */
static inline void cpus_write_lock(void) { }
static inline void cpus_write_unlock(void) { }
static inline void cpus_read_lock(void) { }
static inline void cpus_read_unlock(void) { }
static inline int cpus_read_trylock(void) { return true; }
static inline void lockdep_assert_cpus_held(void) { }
static inline void cpu_hotplug_disable_offlining(void) { }
static inline void cpu_hotplug_disable(void) { }
static inline void cpu_hotplug_enable(void) { }
static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
#endif /* !CONFIG_HOTPLUG_CPU */
DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
#endif /* _LINUX_CPUHPLOCK_H_ */
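The new header also carries the DEFINE_LOCK_GUARD_0(cpus_read_lock, ...) definition, so code that includes only <linux/cpuhplock.h> can still write guard(cpus_read_lock)(); and have the read lock released automatically at end of scope. The userspace sketch below imitates that scope-guard idea with the compiler's cleanup attribute; every name in it is illustrative, and it is not how the kernel's guard macros are implemented.

/* Userspace model of a scope-bound read lock (illustration only). */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;

struct read_guard { int unused; };

static struct read_guard read_guard_acquire(void)
{
	pthread_rwlock_rdlock(&hotplug_lock);
	return (struct read_guard){ 0 };
}

static void read_guard_release(struct read_guard *g)
{
	(void)g;
	pthread_rwlock_unlock(&hotplug_lock);
}

/* Rough stand-in for guard(cpus_read_lock)(); in kernel code. */
#define READ_GUARD() \
	struct read_guard __guard __attribute__((cleanup(read_guard_release))) = \
		read_guard_acquire()

static void query_topology(void)
{
	READ_GUARD();	/* lock held for the rest of this scope */
	printf("topology is stable while this scope runs\n");
}	/* read_guard_release() runs automatically here */

int main(void)
{
	query_topology();
	return 0;
}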
@@ -483,6 +483,8 @@ static int cpu_hotplug_disabled;

 DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);

+static bool cpu_hotplug_offline_disabled __ro_after_init;
+
 void cpus_read_lock(void)
 {
 	percpu_down_read(&cpu_hotplug_lock);
@@ -542,6 +544,14 @@ static void lockdep_release_cpus_lock(void)
 	rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
 }

+/* Declare CPU offlining not supported */
+void cpu_hotplug_disable_offlining(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_offline_disabled = true;
+	cpu_maps_update_done();
+}
+
 /*
  * Wait for currently running CPU hotplug operations to complete (if any) and
  * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
@@ -1471,7 +1481,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
 	 * If the platform does not support hotplug, report it explicitly to
 	 * differentiate it from a transient offlining failure.
 	 */
-	if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
+	if (cpu_hotplug_offline_disabled)
 		return -EOPNOTSUPP;
 	if (cpu_hotplug_disabled)
 		return -EBUSY;
...
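With this flag, "offlining is never supported on this platform" is declared by whichever platform code calls cpu_hotplug_disable_offlining() during boot (per the merge log, the ACPI MADT wakeup path used by TDX guests) instead of by a CoCo attribute, and cpu_down_maps_locked() still distinguishes it from a transient -EBUSY. The standalone sketch below mimics only that control flow; none of these helpers are the kernel functions.

/* Standalone model of the permanent vs. transient hotplug-disable checks. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool hotplug_offline_disabled;	/* set once during early boot */
static int  hotplug_disabled;		/* transient, e.g. around suspend */

/* Platform init code declares that CPU offlining is not supported at all. */
static void disable_offlining(void)
{
	hotplug_offline_disabled = true;
}

static int try_offline_cpu(int cpu)
{
	/* A permanent lack of support is reported explicitly ... */
	if (hotplug_offline_disabled)
		return -EOPNOTSUPP;
	/* ... to differentiate it from a transient refusal. */
	if (hotplug_disabled)
		return -EBUSY;

	printf("offlining CPU %d\n", cpu);
	return 0;
}

int main(void)
{
	printf("before: %d\n", try_offline_cpu(1));	/* 0 */
	disable_offlining();
	printf("after:  %d\n", try_offline_cpu(1));	/* -EOPNOTSUPP */
	return 0;
}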