Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
index 667df55..0bca7ef 100644
--- a/arch/x86/kernel/cpu/.gitignore
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 890f600..93792b4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,9 +13,11 @@
KCOV_INSTRUMENT_common.o := n
KCOV_INSTRUMENT_perf_event.o := n
+# As above, instrumenting secondary CPU boot code causes boot hangs.
+KCSAN_SANITIZE_common.o := n
+
# Make sure load_percpu_segment has no stackprotector
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_common.o := $(nostackp)
+CFLAGS_common.o := -fno-stack-protector
obj-y := cacheinfo.o scattered.o topology.o
obj-y += common.o
@@ -29,6 +31,7 @@
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
ifdef CONFIG_CPU_SUP_INTEL
obj-y += intel.o intel_pconfig.o tsx.o
obj-$(CONFIG_PM) += intel_epb.o
@@ -53,11 +56,12 @@
ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
cpufeature = $(src)/../../include/asm/cpufeatures.h
+vmxfeature = $(src)/../../include/asm/vmxfeatures.h
-$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
+$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
endif
targets += capflags.c
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 676022e..0b2c039 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -10,36 +10,32 @@
*/
#include <linux/interrupt.h>
-#include <asm/acrn.h>
#include <asm/apic.h>
+#include <asm/cpufeatures.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
+#include <asm/idtentry.h>
#include <asm/irq_regs.h>
-static uint32_t __init acrn_detect(void)
+static u32 __init acrn_detect(void)
{
- return hypervisor_cpuid_base("ACRNACRNACRN\0\0", 0);
+ return hypervisor_cpuid_base("ACRNACRNACRN", 0);
}
static void __init acrn_init_platform(void)
{
/* Setup the IDT for ACRN hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector);
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback);
}
static bool acrn_x2apic_available(void)
{
- /*
- * x2apic is not supported for now. Future enablement will have to check
- * X86_FEATURE_X2APIC to determine whether x2apic is supported in the
- * guest.
- */
- return false;
+ return boot_cpu_has(X86_FEATURE_X2APIC);
}
static void (*acrn_intr_handler)(void);
-__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -50,13 +46,12 @@
* will block the interrupt whose vector is lower than
* HYPERVISOR_CALLBACK_VECTOR.
*/
- entering_ack_irq();
+ ack_APIC_irq();
inc_irq_stat(irq_hv_callback_count);
if (acrn_intr_handler)
acrn_intr_handler();
- exiting_irq();
set_irq_regs(old_regs);
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 753f3df..acea05e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -15,9 +15,11 @@
#include <asm/cpu.h>
#include <asm/spec-ctrl.h>
#include <asm/smp.h>
+#include <asm/numa.h>
#include <asm/pci-direct.h>
#include <asm/delay.h>
#include <asm/debugreg.h>
+#include <asm/resctrl.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
@@ -320,13 +322,6 @@
c->cpu_core_id %= cus_per_node;
}
-
-static void amd_get_topology_early(struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_TOPOEXT))
- smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
-}
-
/*
* Fixup core topology information for
* (1) AMD multi-node processors
@@ -400,6 +395,35 @@
per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
}
+static void amd_detect_ppin(struct cpuinfo_x86 *c)
+{
+ unsigned long long val;
+
+ if (!cpu_has(c, X86_FEATURE_AMD_PPIN))
+ return;
+
+ /* When PPIN is defined in CPUID, still need to check PPIN_CTL MSR */
+ if (rdmsrl_safe(MSR_AMD_PPIN_CTL, &val))
+ goto clear_ppin;
+
+ /* PPIN is locked in disabled mode, clear feature bit */
+ if ((val & 3UL) == 1UL)
+ goto clear_ppin;
+
+ /* If PPIN is disabled, try to enable it */
+ if (!(val & 2UL)) {
+ wrmsrl_safe(MSR_AMD_PPIN_CTL, val | 2UL);
+ rdmsrl_safe(MSR_AMD_PPIN_CTL, &val);
+ }
+
+ /* If PPIN_EN bit is 1, return from here; otherwise fall through */
+ if (val & 2UL)
+ return;
+
+clear_ppin:
+ clear_cpu_cap(c, X86_FEATURE_AMD_PPIN);
+}
+
u16 amd_get_nb_id(int cpu)
{
return per_cpu(cpu_llc_id, cpu);
@@ -574,6 +598,8 @@
x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
}
}
+
+ resctrl_cpu_detect(c);
}
static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -587,7 +613,7 @@
* If BIOS has not enabled SME then don't advertise the
* SME feature (set in scattered.c).
* For SEV: If BIOS has not enabled SEV then don't advertise the
- * SEV feature (set in scattered.c).
+ * SEV and SEV_ES feature (set in scattered.c).
*
* In all cases, since support for SME and SEV requires long mode,
* don't advertise the feature under CONFIG_X86_32.
@@ -618,6 +644,7 @@
setup_clear_cpu_cap(X86_FEATURE_SME);
clear_sev:
setup_clear_cpu_cap(X86_FEATURE_SEV);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_ES);
}
}
@@ -717,7 +744,8 @@
}
}
- amd_get_topology_early(c);
+ if (cpu_has(c, X86_FEATURE_TOPOEXT))
+ smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -931,7 +959,8 @@
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
case 0x16: init_amd_jg(c); break;
- case 0x17: init_amd_zn(c); break;
+ case 0x17: fallthrough;
+ case 0x19: init_amd_zn(c); break;
}
/*
@@ -946,6 +975,7 @@
amd_detect_cmp(c);
amd_get_topology(c);
srat_detect_node(c);
+ amd_detect_ppin(c);
init_amd_cacheinfo(c);
@@ -987,6 +1017,8 @@
if (cpu_has(c, X86_FEATURE_IRPERF) &&
!cpu_has_amd_erratum(c, amd_erratum_1054))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index fcc4238..78b9514 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -15,6 +15,8 @@
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched/smt.h>
+#include <linux/pgtable.h>
+#include <linux/bpf.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
@@ -26,11 +28,11 @@
#include <asm/vmx.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
-#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/intel-family.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
#include "cpu.h"
@@ -288,6 +290,13 @@
#undef pr_fmt
#define pr_fmt(fmt) "TAA: " fmt
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
/* Default mitigation for TAA-affected CPUs */
static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
static bool taa_nosmt __ro_after_init;
@@ -536,14 +545,12 @@
* If FSGSBASE is enabled, the user can put a kernel address in
* GS, in which case SMAP provides no protection.
*
- * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the
- * FSGSBASE enablement patches have been merged. ]
- *
* If FSGSBASE is disabled, the user can only put a user space
* address in GS. That makes an attack harder, but still
* possible if there's no SMAP protection.
*/
- if (!smap_works_speculatively()) {
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ !smap_works_speculatively()) {
/*
* Mitigation can be provided from SWAPGS itself or
* PTI as the CR3 write in the Meltdown mitigation
@@ -607,6 +614,32 @@
static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
+#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
+#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
+
+#ifdef CONFIG_BPF_SYSCALL
+void unpriv_ebpf_notify(int new_state)
+{
+ if (new_state)
+ return;
+
+ /* Unprivileged eBPF is enabled */
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_EIBRS:
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+ break;
+ case SPECTRE_V2_EIBRS_LFENCE:
+ if (sched_smt_active())
+ pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+ break;
+ default:
+ break;
+ }
+}
+#endif
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt);
@@ -621,7 +654,10 @@
SPECTRE_V2_CMD_FORCE,
SPECTRE_V2_CMD_RETPOLINE,
SPECTRE_V2_CMD_RETPOLINE_GENERIC,
- SPECTRE_V2_CMD_RETPOLINE_AMD,
+ SPECTRE_V2_CMD_RETPOLINE_LFENCE,
+ SPECTRE_V2_CMD_EIBRS,
+ SPECTRE_V2_CMD_EIBRS_RETPOLINE,
+ SPECTRE_V2_CMD_EIBRS_LFENCE,
};
enum spectre_v2_user_cmd {
@@ -694,6 +730,13 @@
return SPECTRE_V2_USER_CMD_AUTO;
}
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return (mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE);
+}
+
static void __init
spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
{
@@ -756,10 +799,12 @@
}
/*
- * If enhanced IBRS is enabled or SMT impossible, STIBP is not
+ * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not
* required.
*/
- if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !smt_possible ||
+ spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return;
/*
@@ -771,12 +816,6 @@
boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
mode = SPECTRE_V2_USER_STRICT_PREFERRED;
- /*
- * If STIBP is not available, clear the STIBP mode.
- */
- if (!boot_cpu_has(X86_FEATURE_STIBP))
- mode = SPECTRE_V2_USER_NONE;
-
spectre_v2_user_stibp = mode;
set_mode:
@@ -785,9 +824,11 @@
static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
- [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
- [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
- [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
+ [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
+ [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
};
static const struct {
@@ -798,8 +839,12 @@
{ "off", SPECTRE_V2_CMD_NONE, false },
{ "on", SPECTRE_V2_CMD_FORCE, true },
{ "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
- { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
+ { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
{ "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+ { "eibrs", SPECTRE_V2_CMD_EIBRS, false },
+ { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false },
+ { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false },
{ "auto", SPECTRE_V2_CMD_AUTO, false },
};
@@ -836,17 +881,30 @@
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
- cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
+ cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
!IS_ENABLED(CONFIG_RETPOLINE)) {
- pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option);
+ pr_err("%s selected but not compiled in. Switching to AUTO select\n",
+ mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON &&
- boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
+ if ((cmd == SPECTRE_V2_CMD_EIBRS ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
+ !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+ pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n",
+ mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
@@ -855,6 +913,16 @@
return cmd;
}
+static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
+{
+ if (!IS_ENABLED(CONFIG_RETPOLINE)) {
+ pr_err("Kernel not compiled with retpoline; no mitigation available!");
+ return SPECTRE_V2_NONE;
+ }
+
+ return SPECTRE_V2_RETPOLINE;
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -875,49 +943,64 @@
case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- mode = SPECTRE_V2_IBRS_ENHANCED;
- /* Force it so VMEXIT will restore correctly */
- x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
- goto specv2_set_mode;
+ mode = SPECTRE_V2_EIBRS;
+ break;
}
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_auto;
+
+ mode = spectre_v2_select_retpoline();
break;
- case SPECTRE_V2_CMD_RETPOLINE_AMD:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_amd;
+
+ case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
+ pr_err(SPECTRE_V2_LFENCE_MSG);
+ mode = SPECTRE_V2_LFENCE;
break;
+
case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_generic;
+ mode = SPECTRE_V2_RETPOLINE;
break;
+
case SPECTRE_V2_CMD_RETPOLINE:
- if (IS_ENABLED(CONFIG_RETPOLINE))
- goto retpoline_auto;
+ mode = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS:
+ mode = SPECTRE_V2_EIBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_LFENCE:
+ mode = SPECTRE_V2_EIBRS_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
+ mode = SPECTRE_V2_EIBRS_RETPOLINE;
break;
}
- pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!");
- return;
-retpoline_auto:
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- retpoline_amd:
- if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
- pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
- goto retpoline_generic;
- }
- mode = SPECTRE_V2_RETPOLINE_AMD;
- setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
- setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
- } else {
- retpoline_generic:
- mode = SPECTRE_V2_RETPOLINE_GENERIC;
- setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+
+ if (spectre_v2_in_eibrs_mode(mode)) {
+ /* Force it so VMEXIT will restore correctly */
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
}
-specv2_set_mode:
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ case SPECTRE_V2_EIBRS:
+ break;
+
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
+ fallthrough;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ break;
+ }
+
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
@@ -943,7 +1026,7 @@
* the CPU supports Enhanced IBRS, kernel might un-intentionally not
* enable IBRS around firmware calls.
*/
- if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) {
+ if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
@@ -1013,6 +1096,10 @@
{
mutex_lock(&spec_ctrl_mutex);
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
break;
@@ -1563,7 +1650,12 @@
static ssize_t itlb_multihit_show_state(char *buf)
{
- if (itlb_multihit_kvm_mitigation)
+ if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !boot_cpu_has(X86_FEATURE_VMX))
+ return sprintf(buf, "KVM: Mitigation: VMX unsupported\n");
+ else if (!(cr4_read_shadow() & X86_CR4_VMXE))
+ return sprintf(buf, "KVM: Mitigation: VMX disabled\n");
+ else if (itlb_multihit_kvm_mitigation)
return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
else
return sprintf(buf, "KVM: Vulnerable\n");
@@ -1614,7 +1706,7 @@
static char *stibp_state(void)
{
- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return "";
switch (spectre_v2_user_stibp) {
@@ -1644,6 +1736,27 @@
return "";
}
+static ssize_t spectre_v2_show_state(char *buf)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+ return sprintf(buf, "Vulnerable: LFENCE\n");
+
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+
+ return sprintf(buf, "%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ spectre_v2_module_string());
+}
+
static ssize_t srbds_show_state(char *buf)
{
return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
@@ -1669,12 +1782,7 @@
return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
case X86_BUG_SPECTRE_V2:
- return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
- ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
- stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
- spectre_v2_module_string());
+ return spectre_v2_show_state(buf);
case X86_BUG_SPEC_STORE_BYPASS:
return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 30f33b7..b458b0f 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -248,7 +248,7 @@
switch (leaf) {
case 1:
l1 = &l1i;
- /* fall through */
+ fallthrough;
case 0:
if (!l1->val)
return;
@@ -985,7 +985,7 @@
this_leaf->priv = base->nb;
}
-static int __init_cache_level(unsigned int cpu)
+int init_cache_level(unsigned int cpu)
{
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
@@ -1014,7 +1014,7 @@
id4_regs->id = c->apicid >> index_msb;
}
-static int __populate_cache_leaves(unsigned int cpu)
+int populate_cache_leaves(unsigned int cpu)
{
unsigned int idx, ret;
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
@@ -1033,6 +1033,3 @@
return 0;
}
-
-DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
-DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 14433ff..345f7d9 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -3,6 +3,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
@@ -18,13 +19,6 @@
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -72,7 +66,8 @@
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
- cpu_detect_cache_sizes(c);
+ if (c->x86 >= 7)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
enum {
@@ -98,18 +93,15 @@
static void early_init_centaur(struct cpuinfo_x86 *c)
{
- switch (c->x86) {
#ifdef CONFIG_X86_32
- case 5:
- /* Emulate MTRRs using Centaur's MCR. */
+ /* Emulate MTRRs using Centaur's MCR. */
+ if (c->x86 == 5)
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
- break;
#endif
- case 6:
- if (c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- break;
- }
+ if ((c->x86 == 6 && c->x86_model >= 0xf) ||
+ (c->x86 >= 7))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
@@ -119,31 +111,6 @@
}
}
-static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
-
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
@@ -178,9 +145,8 @@
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
- switch (c->x86) {
#ifdef CONFIG_X86_32
- case 5:
+ if (c->x86 == 5) {
switch (c->x86_model) {
case 4:
name = "C6";
@@ -240,18 +206,15 @@
c->x86_cache_size = (cc>>24)+(dd>>24);
}
sprintf(c->x86_model_id, "WinChip %s", name);
- break;
-#endif
- case 6:
- init_c3(c);
- break;
}
+#endif
+ if (c->x86 == 6 || c->x86 >= 7)
+ init_c3(c);
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
- if (cpu_has(c, X86_FEATURE_VMX))
- centaur_detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f961a56..9c8fc6f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -14,16 +14,20 @@
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/io.h>
#include <linux/syscore_ops.h>
+#include <linux/pgtable.h>
+#include <asm/cmdline.h>
#include <asm/stackprotector.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
+#include <asm/doublefault.h>
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -33,7 +37,6 @@
#include <asm/vsyscall.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
-#include <asm/pgtable.h>
#include <linux/atomic.h>
#include <asm/proto.h>
#include <asm/setup.h>
@@ -43,20 +46,18 @@
#include <asm/mtrr.h>
#include <asm/hwcap2.h>
#include <linux/numa.h>
+#include <asm/numa.h>
#include <asm/asm.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
-#endif
#include "cpu.h"
@@ -165,22 +166,6 @@
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
-static int __init x86_mpx_setup(char *s)
-{
- /* require an exact match without trailing characters */
- if (strlen(s))
- return 0;
-
- /* do not emit a message if the feature is not present */
- if (!boot_cpu_has(X86_FEATURE_MPX))
- return 1;
-
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
- return 1;
-}
-__setup("nompx", x86_mpx_setup);
-
#ifdef CONFIG_X86_64
static int __init x86_nopcid_setup(char *s)
{
@@ -307,8 +292,6 @@
static __init int setup_disable_smep(char *arg)
{
setup_clear_cpu_cap(X86_FEATURE_SMEP);
- /* Check for things that depend on SMEP being enabled: */
- check_mpx_erratum(&boot_cpu_data);
return 1;
}
__setup("nosmep", setup_disable_smep);
@@ -337,6 +320,7 @@
#ifdef CONFIG_X86_SMAP
cr4_set_bits(X86_CR4_SMAP);
#else
+ clear_cpu_cap(c, X86_FEATURE_SMAP);
cr4_clear_bits(X86_CR4_SMAP);
#endif
}
@@ -409,7 +393,30 @@
bits_changed);
}
}
-EXPORT_SYMBOL(native_write_cr4);
+#if IS_MODULE(CONFIG_LKDTM)
+EXPORT_SYMBOL_GPL(native_write_cr4);
+#endif
+
+void cr4_update_irqsoff(unsigned long set, unsigned long clear)
+{
+ unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
+
+ lockdep_assert_irqs_disabled();
+
+ newval = (cr4 & ~clear) | set;
+ if (newval != cr4) {
+ this_cpu_write(cpu_tlbstate.cr4, newval);
+ __write_cr4(newval);
+ }
+}
+EXPORT_SYMBOL(cr4_update_irqsoff);
+
+/* Read the CR4 shadow. */
+unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
+}
+EXPORT_SYMBOL_GPL(cr4_read_shadow);
void cr4_init(void)
{
@@ -437,6 +444,22 @@
static_key_enable(&cr_pinning.key);
}
+static __init int x86_nofsgsbase_setup(char *arg)
+{
+ /* Require an exact match without trailing characters. */
+ if (strlen(arg))
+ return 0;
+
+ /* Do not emit a message if the feature is not present. */
+ if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
+ pr_info("FSGSBASE disabled via kernel command line\n");
+ return 1;
+}
+__setup("nofsgsbase", x86_nofsgsbase_setup);
+
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -565,8 +588,9 @@
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
-__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
+/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
void load_percpu_segment(int cpu)
{
@@ -872,30 +896,6 @@
}
}
-static void init_cqm(struct cpuinfo_x86 *c)
-{
- if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
- c->x86_cache_max_rmid = -1;
- c->x86_cache_occ_scale = -1;
- return;
- }
-
- /* will be overridden if occupancy monitoring exists */
- c->x86_cache_max_rmid = cpuid_ebx(0xf);
-
- if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
- cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
- cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
- u32 eax, ebx, ecx, edx;
-
- /* QoS sub-leaf, EAX=0Fh, ECX=1 */
- cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
-
- c->x86_cache_max_rmid = ecx;
- c->x86_cache_occ_scale = ebx;
- }
-}
-
void get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
@@ -963,7 +963,6 @@
init_scattered_cpuid_features(c);
init_speculation_control(c);
- init_cqm(c);
/*
* Clear/Set all flags overridden by options, after probe.
@@ -1026,8 +1025,8 @@
#define NO_ITLB_MULTIHIT BIT(7)
#define NO_SPECTRE_V2 BIT(8)
-#define VULNWL(_vendor, _family, _model, _whitelist) \
- { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
+#define VULNWL(vendor, family, model, whitelist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
#define VULNWL_INTEL(model, whitelist) \
VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
@@ -1088,8 +1087,8 @@
VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
/* Zhaoxin Family 7 */
- VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2),
- VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2),
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
{}
};
@@ -1224,6 +1223,59 @@
}
/*
+ * We parse cpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init cpu_parse_early_param(void)
+{
+ char arg[128];
+ char *argptr = arg;
+ int arglen, res, bit;
+
+#ifdef CONFIG_X86_32
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+#ifdef CONFIG_MATH_EMULATION
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+#else
+ pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
+ if (arglen <= 0)
+ return;
+
+ pr_info("Clearing CPUID bits:");
+ do {
+ res = get_option(&argptr, &bit);
+ if (res == 0 || res == 3)
+ break;
+
+ /* If the argument was too long, the last bit may be cut off */
+ if (res == 1 && arglen >= sizeof(arg))
+ break;
+
+ if (bit >= 0 && bit < NCAPINTS * 32) {
+ pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ setup_clear_cpu_cap(bit);
+ }
+ } while (res == 2);
+ pr_cont("\n");
+}
+
+/*
* Do minimum CPU detection early.
* Fields really needed: vendor, cpuid_level, family, model, mask,
* cache alignment.
@@ -1258,6 +1310,7 @@
get_cpu_cap(c);
get_cpu_address_sizes(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
+ cpu_parse_early_param();
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
@@ -1275,6 +1328,8 @@
cpu_set_bug_bits(c);
+ cpu_set_core_cap_bits(c);
+
fpu__init_system(c);
#ifdef CONFIG_X86_32
@@ -1336,9 +1391,8 @@
early_identify_cpu(&boot_cpu_data);
}
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_64
/*
* Empirically, writing zero to a segment selector on AMD does
* not clear the base, whereas writing zero to a segment
@@ -1359,10 +1413,43 @@
wrmsrl(MSR_FS_BASE, 1);
loadsegment(fs, 0);
rdmsrl(MSR_FS_BASE, tmp);
- if (tmp != 0)
- set_cpu_bug(c, X86_BUG_NULL_SEG);
wrmsrl(MSR_FS_BASE, old_base);
-#endif
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
static void generic_identify(struct cpuinfo_x86 *c)
@@ -1398,8 +1485,6 @@
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c);
-
/*
* ESPFIX is a strange bug. All real CPUs have it. Paravirt
* systems that run Linux at CPL > 0 may or may not have the
@@ -1414,32 +1499,10 @@
* ESPFIX issue, we can change this.
*/
#ifdef CONFIG_X86_32
-# ifdef CONFIG_PARAVIRT_XXL
- do {
- extern void native_iret(void);
- if (pv_ops.cpu.iret == native_iret)
- set_cpu_bug(c, X86_BUG_ESPFIX);
- } while (0);
-# else
set_cpu_bug(c, X86_BUG_ESPFIX);
-# endif
#endif
}
-static void x86_init_cache_qos(struct cpuinfo_x86 *c)
-{
- /*
- * The heavy lifting of max_rmid and cache_occ_scale are handled
- * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu
- * in case CQM bits really aren't there in this CPU.
- */
- if (c != &boot_cpu_data) {
- boot_cpu_data.x86_cache_max_rmid =
- min(boot_cpu_data.x86_cache_max_rmid,
- c->x86_cache_max_rmid);
- }
-}
-
/*
* Validate that ACPI/mptables have the same information about the
* effective APIC id and update the package map.
@@ -1490,6 +1553,9 @@
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ memset(&c->vmx_capability, 0, sizeof(c->vmx_capability));
+#endif
generic_identify(c);
@@ -1524,6 +1590,12 @@
setup_smap(c);
setup_umip(c);
+ /* Enable FSGSBASE instructions if available. */
+ if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
+ cr4_set_bits(X86_CR4_FSGSBASE);
+ elf_hwcap2 |= HWCAP2_FSGSBASE;
+ }
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -1549,7 +1621,6 @@
#endif
x86_init_rdrand(c);
- x86_init_cache_qos(c);
setup_pku(c);
/*
@@ -1736,25 +1807,6 @@
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
-DEFINE_PER_CPU(int, debug_stack_usage);
-DEFINE_PER_CPU(u32, debug_idt_ctr);
-
-void debug_stack_set_zero(void)
-{
- this_cpu_inc(debug_idt_ctr);
- load_current_idt();
-}
-NOKPROBE_SYMBOL(debug_stack_set_zero);
-
-void debug_stack_reset(void)
-{
- if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
- return;
- if (this_cpu_dec_return(debug_idt_ctr) == 0)
- load_current_idt();
-}
-NOKPROBE_SYMBOL(debug_stack_reset);
-
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1821,7 +1873,7 @@
}
#ifdef CONFIG_X86_64
-static void setup_getcpu(int cpu)
+static inline void setup_getcpu(int cpu)
{
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
struct desc_struct d = { };
@@ -1841,7 +1893,75 @@
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
}
+
+static inline void ucode_cpu_init(int cpu)
+{
+ if (cpu)
+ load_ucode_ap();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss)
+{
+ /* Set up the per-CPU TSS IST stacks */
+ tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
+ tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
+ tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
+ tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ /* Only mapped when SEV-ES is active */
+ tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
+}
+
+#else /* CONFIG_X86_64 */
+
+static inline void setup_getcpu(int cpu) { }
+
+static inline void ucode_cpu_init(int cpu)
+{
+ show_ucode_info_early();
+}
+
+static inline void tss_setup_ist(struct tss_struct *tss) { }
+
+#endif /* !CONFIG_X86_64 */
+
+static inline void tss_setup_io_bitmap(struct tss_struct *tss)
+{
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+ tss->io_bitmap.prev_max = 0;
+ tss->io_bitmap.prev_sequence = 0;
+ memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap));
+ /*
+ * Invalidate the extra array entry past the end of the all
+ * permission bitmap as required by the hardware.
+ */
+ tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL;
#endif
+}
+
+/*
+ * Setup everything needed to handle exceptions from the IDT, including the IST
+ * exceptions which use paranoid_entry().
+ */
+void cpu_init_exception_handling(void)
+{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ int cpu = raw_smp_processor_id();
+
+ /* paranoid_entry() gets the CPU number from the GDT */
+ setup_getcpu(cpu);
+
+ /* IST vectors need TSS to be set up. */
+ tss_setup_ist(tss);
+ tss_setup_io_bitmap(tss);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
+ load_TR_desc();
+
+ /* Finally load the IDT */
+ load_current_idt();
+}
/*
* cpu_init() initializes state that is per-CPU. Some data is already
@@ -1849,21 +1969,15 @@
* and IDT. We reload them nevertheless, this function acts as a
* 'CPU state barrier', nothing should get across.
*/
-#ifdef CONFIG_X86_64
-
void cpu_init(void)
{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
- struct task_struct *me;
- struct tss_struct *t;
- int i;
wait_for_master_cpu(cpu);
- if (cpu)
- load_ucode_ap();
-
- t = &per_cpu(cpu_tss_rw, cpu);
+ ucode_cpu_init(cpu);
#ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 &&
@@ -1872,63 +1986,47 @@
#endif
setup_getcpu(cpu);
- me = current;
-
pr_debug("Initializing CPU#%d\n", cpu);
- cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
+ boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
* and set up the GDT descriptor:
*/
-
switch_to_new_gdt(cpu);
- loadsegment(fs, 0);
-
load_current_idt();
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ loadsegment(fs, 0);
+ memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
- x86_configure_nx();
- x2apic_setup();
-
- /*
- * set up and load the per-CPU TSS
- */
- if (!t->x86_tss.ist[0]) {
- t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
- t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
- t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
- t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ x2apic_setup();
}
- t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
-
mmgrab(&init_mm);
- me->active_mm = &init_mm;
- BUG_ON(me->mm);
+ cur->active_mm = &init_mm;
+ BUG_ON(cur->mm);
initialize_tlbstate_and_flush();
- enter_lazy_tlb(&init_mm, me);
+ enter_lazy_tlb(&init_mm, cur);
- /*
- * Initialize the TSS. sp0 points to the entry trampoline stack
- * regardless of what task is running.
- */
+ /* Initialize the TSS. */
+ tss_setup_ist(tss);
+ tss_setup_io_bitmap(tss);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
load_TR_desc();
+ /*
+ * sp0 points to the entry trampoline stack regardless of what task
+ * is running.
+ */
load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm);
@@ -1936,6 +2034,8 @@
clear_all_debug_regs();
dbg_restore_debug_regs();
+ doublefault_init_cpu_tss();
+
fpu__init_cpu();
if (is_uv_system())
@@ -1944,63 +2044,6 @@
load_fixmap_gdt(cpu);
}
-#else
-
-void cpu_init(void)
-{
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
-
- wait_for_master_cpu(cpu);
-
- show_ucode_info_early();
-
- pr_info("Initializing CPU#%d\n", cpu);
-
- if (cpu_feature_enabled(X86_FEATURE_VME) ||
- boot_cpu_has(X86_FEATURE_TSC) ||
- boot_cpu_has(X86_FEATURE_DE))
- cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
- load_current_idt();
- switch_to_new_gdt(cpu);
-
- /*
- * Set up and load the per-CPU TSS and LDT
- */
- mmgrab(&init_mm);
- curr->active_mm = &init_mm;
- BUG_ON(curr->mm);
- initialize_tlbstate_and_flush();
- enter_lazy_tlb(&init_mm, curr);
-
- /*
- * Initialize the TSS. sp0 points to the entry trampoline stack
- * regardless of what task is running.
- */
- set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
- load_TR_desc();
- load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
-
- load_mm_ldt(&init_mm);
-
- t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-
-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
-
- clear_all_debug_regs();
- dbg_restore_debug_regs();
-
- fpu__init_cpu();
-
- load_fixmap_gdt(cpu);
-}
-#endif
-
/*
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 9d03369..093f5fc 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -38,7 +38,7 @@
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
- __attribute__((__section__(".x86_cpu_dev.init"))) = \
+ __section(".x86_cpu_dev.init") = \
&cpu_devX;
extern const struct cpu_dev *const __x86_cpu_dev_start[],
@@ -73,6 +73,7 @@
extern int detect_extended_topology(struct cpuinfo_x86 *c);
extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 3cbe24c..d502241 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -69,6 +69,8 @@
{ X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
{}
};
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
new file mode 100644
index 0000000..29a3bed
--- /dev/null
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/tboot.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
+#include <asm/processor.h>
+#include <asm/vmx.h>
+#include "cpu.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/cpu: " fmt
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+enum vmx_feature_leafs {
+ MISC_FEATURES = 0,
+ PRIMARY_CTLS,
+ SECONDARY_CTLS,
+ NR_VMX_FEATURE_WORDS,
+};
+
+#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f)
+
+static void init_vmx_capabilities(struct cpuinfo_x86 *c)
+{
+ u32 supported, funcs, ept, vpid, ign;
+
+ BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
+
+ /*
+ * The high bits contain the allowed-1 settings, i.e. features that can
+ * be turned on. The low bits contain the allowed-0 settings, i.e.
+ * features that can be turned off. Ignore the allowed-0 settings,
+ * if a feature can be turned on then it's supported.
+ *
+ * Use raw rdmsr() for primary processor controls and pin controls MSRs
+ * as they exist on any CPU that supports VMX, i.e. we want the WARN if
+ * the RDMSR faults.
+ */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported);
+ c->vmx_capability[PRIMARY_CTLS] = supported;
+
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
+ c->vmx_capability[SECONDARY_CTLS] = supported;
+
+ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
+ rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
+
+ /*
+ * Except for EPT+VPID, which enumerates support for both in a single
+ * MSR, low for EPT, high for VPID.
+ */
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid);
+
+ /* Pin, EPT, VPID and VM-Func are merged into a single word. */
+ WARN_ON_ONCE(supported >> 16);
+ WARN_ON_ONCE(funcs >> 4);
+ c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) |
+ ((vpid & 0x1) << 16) |
+ ((funcs & 0xf) << 28);
+
+ /* EPT bits are full on scattered and must be manually handled. */
+ if (ept & VMX_EPT_EXECUTE_ONLY_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY);
+ if (ept & VMX_EPT_AD_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
+ if (ept & VMX_EPT_1GB_PAGE_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+
+ /* Synthetic APIC features that are aggregates of multiple features. */
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY);
+
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) &&
+ (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV);
+
+ /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */
+ if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR))
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS))
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT))
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD))
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID))
+ set_cpu_cap(c, X86_FEATURE_VPID);
+}
+#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
+
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
+{
+ bool tboot = tboot_enabled();
+ u64 msr;
+
+ if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ return;
+ }
+
+ if (msr & FEAT_CTL_LOCKED)
+ goto update_caps;
+
+ /*
+ * Ignore whatever value BIOS left in the MSR to avoid enabling random
+ * features or faulting on the WRMSR.
+ */
+ msr = FEAT_CTL_LOCKED;
+
+ /*
+ * Enable VMX if and only if the kernel may do VMXON at some point,
+ * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
+ * for the kernel, e.g. using VMX to hide malicious code.
+ */
+ if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+
+ if (tboot)
+ msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
+ }
+
+ wrmsrl(MSR_IA32_FEAT_CTL, msr);
+
+update_caps:
+ set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
+
+ if (!cpu_has(c, X86_FEATURE_VMX))
+ return;
+
+ if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
+ (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
+ if (IS_ENABLED(CONFIG_KVM_INTEL))
+ pr_err_once("VMX (%s TXT) disabled by BIOS\n",
+ tboot ? "inside" : "outside");
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ } else {
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ init_vmx_capabilities(c);
+#endif
+ }
+}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 62e9a98..b78c471 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -10,6 +10,7 @@
#include <asm/cpu.h>
#include <asm/smp.h>
+#include <asm/numa.h>
#include <asm/cacheinfo.h>
#include <asm/spec-ctrl.h>
#include <asm/delay.h>
@@ -350,6 +351,8 @@
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 11d5c59..816fdbe 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/pgtable.h>
#include <linux/string.h>
#include <linux/bitops.h>
@@ -11,7 +12,6 @@
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
-#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
@@ -19,6 +19,11 @@
#include <asm/microcode_intel.h>
#include <asm/hwcap2.h>
#include <asm/elf.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cmdline.h>
+#include <asm/traps.h>
+#include <asm/resctrl.h>
+#include <asm/numa.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -31,40 +36,26 @@
#include <asm/apic.h>
#endif
+enum split_lock_detect_state {
+ sld_off = 0,
+ sld_warn,
+ sld_fatal,
+};
+
/*
- * Just in case our CPU detection goes bad, or you have a weird system,
- * allow a way to override the automatic disabling of MPX.
+ * Default to sld_off because most systems do not support split lock detection
+ * split_lock_setup() will switch this to sld_warn on systems that support
+ * split lock detect, unless there is a command line override.
*/
-static int forcempx;
+static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
+static u64 msr_test_ctrl_cache __ro_after_init;
-static int __init forcempx_setup(char *__unused)
-{
- forcempx = 1;
-
- return 1;
-}
-__setup("intel-skd-046-workaround=disable", forcempx_setup);
-
-void check_mpx_erratum(struct cpuinfo_x86 *c)
-{
- if (forcempx)
- return;
- /*
- * Turn off the MPX feature on CPUs where SMEP is not
- * available or disabled.
- *
- * Works around Intel Erratum SKD046: "Branch Instructions
- * May Initialize MPX Bound Registers Incorrectly".
- *
- * This might falsely disable MPX on systems without
- * SMEP, like Atom processors without SMEP. But there
- * is no such hardware known at the moment.
- */
- if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) {
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- pr_warn("x86/mpx: Disabling MPX since SMEP not present\n");
- }
-}
+/*
+ * With a name like MSR_TEST_CTL it should go without saying, but don't touch
+ * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it
+ * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
+ */
+static bool cpu_model_supports_sld __ro_after_init;
/*
* Processors which have self-snooping capability can handle conflicting
@@ -330,7 +321,6 @@
c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
}
- check_mpx_erratum(c);
check_memory_type_self_snoop_errata(c);
/*
@@ -341,6 +331,11 @@
detect_ht_early(c);
}
+static void bsp_init_intel(struct cpuinfo_x86 *c)
+{
+ resctrl_cpu_detect(c);
+}
+
#ifdef CONFIG_X86_32
/*
* Early probe support logic for ppro memory erratum #50
@@ -494,52 +489,6 @@
#endif
}
-static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- /* Intel VMX MSR indicated features */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000
-
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
- u32 msr_vpid_cap, msr_ept_cap;
-
- clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- clear_cpu_cap(c, X86_FEATURE_VNMI);
- clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- clear_cpu_cap(c, X86_FEATURE_EPT);
- clear_cpu_cap(c, X86_FEATURE_VPID);
- clear_cpu_cap(c, X86_FEATURE_EPT_AD);
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) {
- set_cpu_cap(c, X86_FEATURE_EPT);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- msr_ept_cap, msr_vpid_cap);
- if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD)
- set_cpu_cap(c, X86_FEATURE_EPT_AD);
- }
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
#define MSR_IA32_TME_ACTIVATE 0x982
/* Helpers to access TME_ACTIVATE MSR */
@@ -652,6 +601,8 @@
wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
}
+static void split_lock_init(void);
+
static void init_intel(struct cpuinfo_x86 *c)
{
early_init_intel(c);
@@ -755,8 +706,7 @@
/* Work around errata */
srat_detect_node(c);
- if (cpu_has(c, X86_FEATURE_VMX))
- detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
if (cpu_has(c, X86_FEATURE_TME))
detect_tme(c);
@@ -767,6 +717,8 @@
tsx_enable();
if (tsx_ctrl_state == TSX_CTRL_DISABLE)
tsx_disable();
+
+ split_lock_init();
}
#ifdef CONFIG_X86_32
@@ -819,7 +771,7 @@
{ 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
- { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
+ { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
{ 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
@@ -847,7 +799,7 @@
{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
- { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
+ { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
{ 0x00, 0, 0 }
};
@@ -859,8 +811,8 @@
return;
/* look up this descriptor in the table */
- for (k = 0; intel_tlb_table[k].descriptor != desc && \
- intel_tlb_table[k].descriptor != 0; k++)
+ for (k = 0; intel_tlb_table[k].descriptor != desc &&
+ intel_tlb_table[k].descriptor != 0; k++)
;
if (intel_tlb_table[k].tlb_type == 0)
@@ -1023,8 +975,220 @@
#endif
.c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
+ .c_bsp_init = bsp_init_intel,
.c_init = init_intel,
.c_x86_vendor = X86_VENDOR_INTEL,
};
cpu_dev_register(intel_cpu_dev);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/split lock detection: " fmt
+
+static const struct {
+ const char *option;
+ enum split_lock_detect_state state;
+} sld_options[] __initconst = {
+ { "off", sld_off },
+ { "warn", sld_warn },
+ { "fatal", sld_fatal },
+};
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+ int len = strlen(opt);
+
+ return len == arglen && !strncmp(arg, opt, len);
+}
+
+static bool split_lock_verify_msr(bool on)
+{
+ u64 ctrl, tmp;
+
+ if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl))
+ return false;
+ if (on)
+ ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ else
+ ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ if (wrmsrl_safe(MSR_TEST_CTRL, ctrl))
+ return false;
+ rdmsrl(MSR_TEST_CTRL, tmp);
+ return ctrl == tmp;
+}
+
+static void __init split_lock_setup(void)
+{
+ enum split_lock_detect_state state = sld_warn;
+ char arg[20];
+ int i, ret;
+
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ ret = cmdline_find_option(boot_command_line, "split_lock_detect",
+ arg, sizeof(arg));
+ if (ret >= 0) {
+ for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
+ if (match_option(arg, ret, sld_options[i].option)) {
+ state = sld_options[i].state;
+ break;
+ }
+ }
+ }
+
+ switch (state) {
+ case sld_off:
+ pr_info("disabled\n");
+ return;
+ case sld_warn:
+ pr_info("warning about user-space split_locks\n");
+ break;
+ case sld_fatal:
+ pr_info("sending SIGBUS on user-space split_locks\n");
+ break;
+ }
+
+ rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ if (!split_lock_verify_msr(true)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ sld_state = state;
+ setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
+}
+
+/*
+ * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
+ * is not implemented as one thread could undo the setting of the other
+ * thread immediately after dropping the lock anyway.
+ */
+static void sld_update_msr(bool on)
+{
+ u64 test_ctrl_val = msr_test_ctrl_cache;
+
+ if (on)
+ test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+
+ wrmsrl(MSR_TEST_CTRL, test_ctrl_val);
+}
+
+static void split_lock_init(void)
+{
+ if (cpu_model_supports_sld)
+ split_lock_verify_msr(sld_state != sld_off);
+}
+
+static void split_lock_warn(unsigned long ip)
+{
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+
+ /*
+ * Disable the split lock detection for this task so it can make
+ * progress and set TIF_SLD so the detection is re-enabled via
+ * switch_to_sld() when the task is scheduled out.
+ */
+ sld_update_msr(false);
+ set_tsk_thread_flag(current, TIF_SLD);
+}
+
+bool handle_guest_split_lock(unsigned long ip)
+{
+ if (sld_state == sld_warn) {
+ split_lock_warn(ip);
+ return true;
+ }
+
+ pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid,
+ sld_state == sld_fatal ? "fatal" : "bogus", ip);
+
+ current->thread.error_code = 0;
+ current->thread.trap_nr = X86_TRAP_AC;
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ return false;
+}
+EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+
+bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+ if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
+ return false;
+ split_lock_warn(regs->ip);
+ return true;
+}
+
+/*
+ * This function is called only when switching between tasks with
+ * different split-lock detection modes. It sets the MSR for the
+ * mode of the new task. This is right most of the time, but since
+ * the MSR is shared by hyperthreads on a physical core there can
+ * be glitches when the two threads need different modes.
+ */
+void switch_to_sld(unsigned long tifn)
+{
+ sld_update_msr(!(tifn & _TIF_SLD));
+}
+
+/*
+ * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should
+ * only be trusted if it is confirmed that a CPU model implements a
+ * specific feature at a particular bit position.
+ *
+ * The possible driver data field values:
+ *
+ * - 0: CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
+ *
+ * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use
+ * bit 5 to enumerate the per-core split-lock detection feature.
+ */
+static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1),
+ {}
+};
+
+void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
+{
+ const struct x86_cpu_id *m;
+ u64 ia32_core_caps;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ m = x86_match_cpu(split_lock_cpu_ids);
+ if (!m)
+ return;
+
+ switch (m->driver_data) {
+ case 0:
+ break;
+ case 1:
+ if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
+ return;
+ rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT))
+ return;
+ break;
+ default:
+ return;
+ }
+
+ cpu_model_supports_sld = true;
+ split_lock_setup();
+}
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 2f163e6..ad67760 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -16,12 +16,17 @@
* respective wildcard entries.
*
* A typical table entry would be to match a specific CPU
- * { X86_VENDOR_INTEL, 6, 0x12 }
- * or to match a specific CPU feature
- * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
+ * X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
- * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
+ *
+ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts
+ * for various common selections. The above can be shortened to:
+ *
+ * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1cf34fc..f73f118 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -78,6 +78,7 @@
static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_LS_V2] = { "load_store", "Load Store Unit" },
[SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
[SMCA_DE] = { "decode_unit", "Decode Unit" },
@@ -131,48 +132,49 @@
}
static struct smca_hwid smca_hwid_mcatypes[] = {
- /* { bank_type, hwid_mcatype, xec_bitmap } */
+ /* { bank_type, hwid_mcatype } */
/* Reserved type */
- { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 },
+ { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) },
/* ZN Core (HWID=0xB0) MCA types */
- { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF },
- { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
- { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
- { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) },
/* HWID 0xB0 MCATYPE 0x4 is Reserved */
- { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0xFFF },
- { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
- { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) },
/* Data Fabric MCA types */
- { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
- { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F },
- { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF },
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
/* Unified Memory Controller MCA type */
- { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF },
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
/* Parameter Block MCA type */
- { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
/* Platform Security Processor MCA type */
- { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
- { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF },
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) },
/* System Management Unit MCA type */
- { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
- { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF },
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) },
/* Microprocessor 5 Unit MCA type */
- { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF },
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
/* Northbridge IO Unit MCA type */
- { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F },
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
/* PCI Express Unit MCA type */
- { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F },
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
};
struct smca_bank smca_banks[MAX_NR_BANKS];
@@ -190,7 +192,12 @@
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
+
+/*
+ * A list of the banks enabled on each logical CPU. Controls which respective
+ * descriptors to initialize later in mce_threshold_create_device().
+ */
+static DEFINE_PER_CPU(unsigned int, bank_map);
/* Map of banks that have more than MCA_MISC0 available. */
static DEFINE_PER_CPU(u32, smca_misc_banks_map);
@@ -379,6 +386,10 @@
struct thresh_restart *tr = _tr;
u32 hi, lo;
+ /* sysfs write might race against an offline operation */
+ if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off)
+ return;
+
rdmsr(tr->b->address, lo, hi);
if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
@@ -566,14 +577,19 @@
{
enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;
- u8 xec = (m->status >> 16) & 0x3F;
/* See Family 17h Models 10h-2Fh Erratum #1114. */
if (c->x86 == 0x17 &&
c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
- bank_type == SMCA_IF && xec == 10)
+ bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10)
return true;
+ /* NB GART TLB error reporting is disabled by default. */
+ if (c->x86 < 0x17) {
+ if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5)
+ return true;
+ }
+
return false;
}
@@ -583,7 +599,7 @@
* - Prevent possible spurious interrupts from the IF bank on Family 0x17
* Models 0x10-0x2F due to Erratum #1114.
*/
-void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
+static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{
int i, num_msrs;
u64 hwcr;
@@ -905,14 +921,13 @@
mce_log(&m);
}
-asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
{
- entering_irq();
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
inc_irq_stat(irq_deferred_error_count);
deferred_error_int_vector();
trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
- exiting_ack_irq();
+ ack_APIC_irq();
}
/*
@@ -1014,13 +1029,22 @@
static void amd_threshold_interrupt(void)
{
struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
unsigned int bank, cpu = smp_processor_id();
+ /*
+ * Validate that the threshold bank has been initialized already. The
+ * handler is installed at boot time, but on a hotplug event the
+ * interrupt might fire before the data has been initialized.
+ */
+ if (!bp)
+ return;
+
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
- first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
+ first_block = bp[bank]->blocks;
if (!first_block)
continue;
@@ -1069,7 +1093,8 @@
memset(&tr, 0, sizeof(tr));
tr.b = b;
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+ if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ return -ENODEV;
return size;
}
@@ -1093,7 +1118,8 @@
b->threshold_limit = new;
tr.b = b;
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+ if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+ return -ENODEV;
return size;
}
@@ -1102,7 +1128,9 @@
{
u32 lo, hi;
- rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
+ /* CPU might be offline by now */
+ if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi))
+ return -ENODEV;
return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
(THRESHOLD_MAX - b->threshold_limit)));
@@ -1207,10 +1235,10 @@
u32 low, high;
int err;
- if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
+ if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS))
return 0;
- if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
+ if (rdmsr_safe(address, &low, &high))
return 0;
if (!(high & MASK_VALID_HI)) {
@@ -1245,6 +1273,7 @@
INIT_LIST_HEAD(&b->miscj);
+ /* This is safe as @tb is not visible yet */
if (tb->blocks)
list_add(&b->miscj, &tb->blocks->miscj);
else
@@ -1265,13 +1294,12 @@
if (b)
kobject_uevent(&b->kobj, KOBJ_ADD);
- return err;
+ return 0;
out_free:
if (b) {
- kobject_put(&b->kobj);
list_del(&b->miscj);
- kfree(b);
+ kobject_put(&b->kobj);
}
return err;
}
@@ -1300,9 +1328,10 @@
return err;
}
-static int threshold_create_bank(unsigned int cpu, unsigned int bank)
+static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
+ unsigned int bank)
{
- struct device *dev = per_cpu(mce_device, cpu);
+ struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = get_name(bank, NULL);
@@ -1322,7 +1351,7 @@
if (err)
goto out;
- per_cpu(threshold_banks, cpu)[bank] = b;
+ bp[bank] = b;
refcount_inc(&b->cpus);
err = __threshold_add_blocks(b);
@@ -1337,6 +1366,7 @@
goto out;
}
+ /* Associate the bank with the per-CPU MCE device */
b->kobj = kobject_create_and_add(name, &dev->kobj);
if (!b->kobj) {
err = -EINVAL;
@@ -1344,6 +1374,7 @@
}
if (is_shared_bank(bank)) {
+ b->shared = 1;
refcount_set(&b->cpus, 1);
/* nb is already initialized, see above */
@@ -1355,16 +1386,16 @@
err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
if (err)
- goto out_free;
+ goto out_kobj;
- per_cpu(threshold_banks, cpu)[bank] = b;
-
+ bp[bank] = b;
return 0;
- out_free:
+out_kobj:
+ kobject_put(b->kobj);
+out_free:
kfree(b);
-
- out:
+out:
return err;
}
@@ -1373,21 +1404,16 @@
kfree(to_block(kobj));
}
-static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
+static void deallocate_threshold_blocks(struct threshold_bank *bank)
{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
+ struct threshold_block *pos, *tmp;
- if (!head)
- return;
-
- list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+ list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) {
list_del(&pos->miscj);
kobject_put(&pos->kobj);
}
- kobject_put(&head->blocks->kobj);
+ kobject_put(&bank->blocks->kobj);
}
static void __threshold_remove_blocks(struct threshold_bank *b)
@@ -1401,122 +1427,102 @@
kobject_del(&pos->kobj);
}
-static void threshold_remove_bank(unsigned int cpu, int bank)
+static void threshold_remove_bank(struct threshold_bank *bank)
{
struct amd_northbridge *nb;
- struct threshold_bank *b;
- b = per_cpu(threshold_banks, cpu)[bank];
- if (!b)
+ if (!bank->blocks)
+ goto out_free;
+
+ if (!bank->shared)
+ goto out_dealloc;
+
+ if (!refcount_dec_and_test(&bank->cpus)) {
+ __threshold_remove_blocks(bank);
return;
-
- if (!b->blocks)
- goto free_out;
-
- if (is_shared_bank(bank)) {
- if (!refcount_dec_and_test(&b->cpus)) {
- __threshold_remove_blocks(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
- return;
- } else {
- /*
- * the last CPU on this node using the shared bank is
- * going away, remove that bank now.
- */
- nb = node_to_amd_nb(amd_get_nb_id(cpu));
- nb->bank4 = NULL;
- }
+ } else {
+ /*
+ * The last CPU on this node using the shared bank is going
+ * away, remove that bank now.
+ */
+ nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id()));
+ nb->bank4 = NULL;
}
- deallocate_threshold_block(cpu, bank);
+out_dealloc:
+ deallocate_threshold_blocks(bank);
-free_out:
- kobject_del(b->kobj);
- kobject_put(b->kobj);
- kfree(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
+out_free:
+ kobject_put(bank->kobj);
+ kfree(bank);
}
int mce_threshold_remove_device(unsigned int cpu)
{
- unsigned int bank;
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
- for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- threshold_remove_bank(cpu, bank);
+ if (!bp)
+ return 0;
+
+ /*
+ * Clear the pointer before cleaning up, so that the interrupt won't
+ * touch anything of this.
+ */
+ this_cpu_write(threshold_banks, NULL);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (bp[bank]) {
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
}
- kfree(per_cpu(threshold_banks, cpu));
- per_cpu(threshold_banks, cpu) = NULL;
+ kfree(bp);
return 0;
}
-/* create dir/files for all valid threshold banks */
+/**
+ * mce_threshold_create_device - Create the per-CPU MCE threshold device
+ * @cpu: The plugged in CPU
+ *
+ * Create directories and files for all valid threshold banks.
+ *
+ * This is invoked from the CPU hotplug callback which was installed in
+ * mcheck_init_device(). The invocation happens in context of the hotplug
+ * thread running on @cpu. The callback is invoked on all CPUs which are
+ * online when the callback is installed or during a real hotplug event.
+ */
int mce_threshold_create_device(unsigned int cpu)
{
- unsigned int bank;
+ unsigned int numbanks, bank;
struct threshold_bank **bp;
- int err = 0;
+ int err;
- bp = per_cpu(threshold_banks, cpu);
+ if (!mce_flags.amd_threshold)
+ return 0;
+
+ bp = this_cpu_read(threshold_banks);
if (bp)
return 0;
- bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *),
- GFP_KERNEL);
+ numbanks = this_cpu_read(mce_num_banks);
+ bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL);
if (!bp)
return -ENOMEM;
- per_cpu(threshold_banks, cpu) = bp;
-
- for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ for (bank = 0; bank < numbanks; ++bank) {
+ if (!(this_cpu_read(bank_map) & (1 << bank)))
continue;
- err = threshold_create_bank(cpu, bank);
+ err = threshold_create_bank(bp, cpu, bank);
if (err)
- goto err;
+ goto out_err;
}
- return err;
-err:
- mce_threshold_remove_device(cpu);
- return err;
-}
-
-static __init int threshold_init_device(void)
-{
- unsigned lcpu = 0;
-
- /* to hit CPUs online before the notifier is up */
- for_each_online_cpu(lcpu) {
- int err = mce_threshold_create_device(lcpu);
-
- if (err)
- return err;
- }
+ this_cpu_write(threshold_banks, bp);
if (thresholding_irq_en)
mce_threshold_vector = amd_threshold_interrupt;
-
return 0;
+out_err:
+ mce_threshold_remove_device(cpu);
+ return err;
}
-/*
- * there are 3 funcs which need to be _initcalled in a logic sequence:
- * 1. xen_late_init_mcelog
- * 2. mcheck_init_device
- * 3. threshold_init_device
- *
- * xen_late_init_mcelog must register xen_mce_chrdev_device before
- * native mce_chrdev_device registration if running under xen platform;
- *
- * mcheck_init_device should be inited before threshold_init_device to
- * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
- *
- * so we use following _initcalls
- * 1. device_initcall(xen_late_init_mcelog);
- * 2. device_initcall_sync(mcheck_init_device);
- * 3. late_initcall(threshold_init_device);
- *
- * when running under xen, the initcall order is 1,2,3;
- * on baremetal, we skip 1 and we do only 2 and 3.
- */
-late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index c2a9762..5cf1a02 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -40,8 +40,10 @@
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
-#include <linux/jump_label.h>
#include <linux/set_memory.h>
+#include <linux/sync_core.h>
+#include <linux/task_work.h>
+#include <linux/hardirq.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -53,8 +55,6 @@
#include "internal.h"
-static DEFINE_MUTEX(mce_log_mutex);
-
/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);
@@ -130,7 +130,7 @@
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
+noinstr void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
@@ -140,10 +140,12 @@
m->cpuid = cpuid_eax(1);
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
- rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+ m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
- rdmsrl(MSR_PPIN, m->ppin);
+ m->ppin = __rdmsr(MSR_PPIN);
+ else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
+ m->ppin = __rdmsr(MSR_AMD_PPIN);
m->microcode = boot_cpu_data.microcode;
}
@@ -156,40 +158,20 @@
if (!mce_gen_pool_add(m))
irq_work_queue(&mce_irq_work);
}
-
-void mce_inject_log(struct mce *m)
-{
- mutex_lock(&mce_log_mutex);
- mce_log(m);
- mutex_unlock(&mce_log_mutex);
-}
-EXPORT_SYMBOL_GPL(mce_inject_log);
-
-static struct notifier_block mce_srao_nb;
-
-/*
- * We run the default notifier if we have only the SRAO, the first and the
- * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
- * notifiers registered on the chain.
- */
-#define NUM_DEFAULT_NOTIFIERS 3
-static atomic_t num_notifiers;
+EXPORT_SYMBOL_GPL(mce_log);
void mce_register_decode_chain(struct notifier_block *nb)
{
- if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
+ if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
+ nb->priority > MCE_PRIO_HIGHEST))
return;
- atomic_inc(&num_notifiers);
-
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
void mce_unregister_decode_chain(struct notifier_block *nb)
{
- atomic_dec(&num_notifiers);
-
blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
@@ -263,6 +245,8 @@
pr_cont("ADDR %llx ", m->addr);
if (m->misc)
pr_cont("MISC %llx ", m->misc);
+ if (m->ppin)
+ pr_cont("PPIN %llx ", m->ppin);
if (mce_flags.smca) {
if (m->synd)
@@ -272,6 +256,7 @@
}
pr_cont("\n");
+
/*
* Note this output is parsed by external tools and old fields
* should not be changed.
@@ -310,11 +295,17 @@
panic("Panicing machine check CPU died");
}
-static void mce_panic(const char *msg, struct mce *final, char *exp)
+static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
- int apei_err = 0;
struct llist_node *pending;
struct mce_evt_llist *l;
+ int apei_err = 0;
+
+ /*
+ * Allow instrumentation around external facilities usage. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
if (!fake_panic) {
/*
@@ -329,7 +320,7 @@
} else {
/* Don't log too much for fake panic */
if (atomic_inc_return(&mce_fake_panicked) > 1)
- return;
+ goto out;
}
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
@@ -367,6 +358,9 @@
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
}
/* Support code for software error injection */
@@ -407,16 +401,25 @@
}
/* MSR access wrappers used for error injection */
-static u64 mce_rdmsrl(u32 msr)
+static noinstr u64 mce_rdmsrl(u32 msr)
{
DECLARE_ARGS(val, low, high);
if (__this_cpu_read(injectm.finished)) {
- int offset = msr_to_offset(msr);
+ int offset;
+ u64 ret;
+ instrumentation_begin();
+
+ offset = msr_to_offset(msr);
if (offset < 0)
- return 0;
- return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
+ ret = 0;
+ else
+ ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
+
+ instrumentation_end();
+
+ return ret;
}
/*
@@ -452,15 +455,21 @@
return true;
}
-static void mce_wrmsrl(u32 msr, u64 v)
+static noinstr void mce_wrmsrl(u32 msr, u64 v)
{
u32 low, high;
if (__this_cpu_read(injectm.finished)) {
- int offset = msr_to_offset(msr);
+ int offset;
+ instrumentation_begin();
+
+ offset = msr_to_offset(msr);
if (offset >= 0)
*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
+
+ instrumentation_end();
+
return;
}
@@ -536,8 +545,9 @@
if (!(m->status & MCI_STATUS_ADDRV))
return 0;
- /* Checks after this one are Intel-specific: */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ /* Checks after this one are Intel/Zhaoxin-specific: */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 1;
if (!(m->status & MCI_STATUS_MISCV))
@@ -555,10 +565,13 @@
bool mce_is_memory_error(struct mce *m)
{
- if (m->cpuvendor == X86_VENDOR_AMD ||
- m->cpuvendor == X86_VENDOR_HYGON) {
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
return amd_mce_is_memory_error(m);
- } else if (m->cpuvendor == X86_VENDOR_INTEL) {
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
@@ -575,9 +588,10 @@
return (m->status & 0xef80) == BIT(7) ||
(m->status & 0xef00) == BIT(8) ||
(m->status & 0xeffc) == 0xc;
- }
- return false;
+ default:
+ return false;
+ }
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
@@ -585,6 +599,7 @@
{
if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
return true;
+
return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
}
@@ -603,22 +618,7 @@
}
EXPORT_SYMBOL_GPL(mce_is_correctable);
-static bool cec_add_mce(struct mce *m)
-{
- if (!m)
- return false;
-
- /* We eat only correctable DRAM errors with usable addresses. */
- if (mce_is_memory_error(m) &&
- mce_is_correctable(m) &&
- mce_usable_address(m))
- if (!cec_add_elem(m->addr >> PAGE_SHIFT))
- return true;
-
- return false;
-}
-
-static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
@@ -626,9 +626,6 @@
if (!m)
return NOTIFY_DONE;
- if (cec_add_mce(m))
- return NOTIFY_STOP;
-
/* Emit the trace record: */
trace_mce_record(m);
@@ -639,31 +636,36 @@
return NOTIFY_DONE;
}
-static struct notifier_block first_nb = {
- .notifier_call = mce_first_notifier,
- .priority = MCE_PRIO_FIRST,
+static struct notifier_block early_nb = {
+ .notifier_call = mce_early_notifier,
+ .priority = MCE_PRIO_EARLY,
};
-static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
+static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
{
struct mce *mce = (struct mce *)data;
unsigned long pfn;
- if (!mce)
+ if (!mce || !mce_usable_address(mce))
return NOTIFY_DONE;
- if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
- pfn = mce->addr >> PAGE_SHIFT;
- if (!memory_failure(pfn, 0))
- set_mce_nospec(pfn, whole_page(mce));
+ if (mce->severity != MCE_AO_SEVERITY &&
+ mce->severity != MCE_DEFERRED_SEVERITY)
+ return NOTIFY_DONE;
+
+ pfn = mce->addr >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0)) {
+ set_mce_nospec(pfn, whole_page(mce));
+ mce->kflags |= MCE_HANDLED_UC;
}
return NOTIFY_OK;
}
-static struct notifier_block mce_srao_nb = {
- .notifier_call = srao_decode_notifier,
- .priority = MCE_PRIO_SRAO,
+
+static struct notifier_block mce_uc_nb = {
+ .notifier_call = uc_decode_notifier,
+ .priority = MCE_PRIO_UC,
};
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
@@ -674,10 +676,8 @@
if (!m)
return NOTIFY_DONE;
- if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
- return NOTIFY_DONE;
-
- __print_mce(m);
+ if (mca_cfg.print_all || !m->kflags)
+ __print_mce(m);
return NOTIFY_DONE;
}
@@ -691,7 +691,7 @@
/*
* Read ADDR and MISC registers.
*/
-static void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i));
@@ -813,26 +813,25 @@
log_it:
error_seen = true;
+ if (flags & MCP_DONTLOG)
+ goto clear_it;
+
mce_read_aux(&m, i);
-
- m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
-
+ m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false);
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
- mce_log(&m);
- else if (mce_usable_address(&m)) {
- /*
- * Although we skipped logging this, we still want
- * to take action. Add to the pool so the registered
- * notifiers will see it.
- */
- if (!mce_gen_pool_add(&m))
- mce_schedule_work();
- }
+ if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
+ goto clear_it;
+
+ if (flags & MCP_QUEUE_LOG)
+ mce_gen_pool_add(&m);
+ else
+ mce_log(&m);
+
+clear_it:
/*
* Clear state for this bank.
*/
@@ -857,7 +856,7 @@
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
- char *tmp;
+ char *tmp = *msg;
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
@@ -870,7 +869,7 @@
quirk_no_way_out(i, m, regs);
m->bank = i;
- if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
mce_read_aux(m, i);
*msg = tmp;
return 1;
@@ -948,7 +947,6 @@
struct mce *m = NULL;
int global_worst = 0;
char *msg = NULL;
- char *nmsg = NULL;
/*
* This CPU is the Monarch and the other CPUs have run
@@ -956,12 +954,10 @@
* Grade the severity of the errors of all the CPUs.
*/
for_each_possible_cpu(cpu) {
- int severity = mce_severity(&per_cpu(mces_seen, cpu),
- mca_cfg.tolerant,
- &nmsg, true);
- if (severity > global_worst) {
- msg = nmsg;
- global_worst = severity;
+ struct mce *mtmp = &per_cpu(mces_seen, cpu);
+
+ if (mtmp->severity > global_worst) {
+ global_worst = mtmp->severity;
m = &per_cpu(mces_seen, cpu);
}
}
@@ -971,8 +967,11 @@
* This dumps all the mces in the log buffer and stops the
* other CPUs.
*/
- if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
+ /* call mce_severity() to get "msg" for panic */
+ mce_severity(m, NULL, mca_cfg.tolerant, &msg, true);
mce_panic("Fatal machine check", m, msg);
+ }
/*
* For UC somewhere we let the CPU who detects it handle it.
@@ -1071,10 +1070,13 @@
* Synchronize between CPUs after main scanning loop.
* This invokes the bulk of the Monarch processing.
*/
-static int mce_end(int order)
+static noinstr int mce_end(int order)
{
- int ret = -1;
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
if (!timeout)
goto reset;
@@ -1118,7 +1120,8 @@
/*
* Don't reset anything. That's done by the Monarch.
*/
- return 0;
+ ret = 0;
+ goto out;
}
/*
@@ -1133,6 +1136,10 @@
* Let others run again.
*/
atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
return ret;
}
@@ -1146,23 +1153,6 @@
}
}
-static int do_memory_failure(struct mce *m)
-{
- int flags = MF_ACTION_REQUIRED;
- int ret;
-
- pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
- if (!(m->mcgstatus & MCG_STATUS_RIPV))
- flags |= MF_MUST_KILL;
- ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
- if (ret)
- pr_err("Memory error not recovered");
- else
- set_mce_nospec(m->addr >> PAGE_SHIFT, whole_page(m));
- return ret;
-}
-
-
/*
* Cases where we avoid rendezvous handler timeout:
* 1) If this CPU is offline.
@@ -1175,22 +1165,30 @@
* kdump kernel establishing a new #MC handler where a broadcasted MCE
* might not get handled properly.
*/
-static bool __mc_check_crashing_cpu(int cpu)
+static noinstr bool mce_check_crashing_cpu(void)
{
- if (cpu_is_offline(cpu) ||
+ unsigned int cpu = smp_processor_id();
+
+ if (arch_cpu_is_offline(cpu) ||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
- mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+ if (mcgstatus & MCG_STATUS_LMCES)
+ return false;
+ }
+
if (mcgstatus & MCG_STATUS_RIPV) {
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
return true;
}
}
return false;
}
-static void __mc_scan_banks(struct mce *m, struct mce *final,
+static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
unsigned long *toclear, unsigned long *valid_banks,
int no_way_out, int *worst)
{
@@ -1225,7 +1223,7 @@
/* Set taint even when machine check was not enabled. */
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
- severity = mce_severity(m, cfg->tolerant, NULL, true);
+ severity = mce_severity(m, regs, cfg->tolerant, NULL, true);
/*
* When machine check was for corrected/deferred handler don't
@@ -1258,6 +1256,72 @@
*m = *final;
}
+static void kill_me_now(struct callback_head *ch)
+{
+ struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
+
+ p->mce_count = 0;
+ force_sig(SIGBUS);
+}
+
+static void kill_me_maybe(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ int flags = MF_ACTION_REQUIRED;
+
+ p->mce_count = 0;
+ pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+
+ if (!p->mce_ripv)
+ flags |= MF_MUST_KILL;
+
+ if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
+ !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+ sync_core();
+ return;
+ }
+
+ if (p->mce_vaddr != (void __user *)-1l) {
+ force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
+ } else {
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
+ }
+}
+
+static void queue_task_work(struct mce *m, char *msg, int kill_current_task)
+{
+ int count = ++current->mce_count;
+
+ /* First call, save all the details */
+ if (count == 1) {
+ current->mce_addr = m->addr;
+ current->mce_kflags = m->kflags;
+ current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(m);
+
+ if (kill_current_task)
+ current->mce_kill_me.func = kill_me_now;
+ else
+ current->mce_kill_me.func = kill_me_maybe;
+ }
+
+ /* Ten is likely overkill. Don't expect more than two faults before task_work() */
+ if (count > 10)
+ mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
+
+ /* Second or later call, make sure page address matches the one from first call */
+ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
+ mce_panic("Consecutive machine checks to different user pages", m, msg);
+
+ /* Do not call task_work_add() more than once */
+ if (count > 1)
+ return;
+
+ task_work_add(current, ¤t->mce_kill_me, TWA_RESUME);
+}
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@@ -1269,15 +1333,20 @@
* On Intel systems this is entered on all CPUs in parallel through
* MCE broadcast. However some CPUs might be broken beyond repair,
* so be always careful when synchronizing with others.
+ *
+ * Tracing and kprobes are disabled: if we interrupted a kernel context
+ * with IF=1, we need to minimize stack usage. There are also recursion
+ * issues: if the machine check was due to a failure of the memory
+ * backing the user stack, tracing that reads the user stack will cause
+ * potentially infinite recursion.
*/
-void do_machine_check(struct pt_regs *regs, long error_code)
+noinstr void do_machine_check(struct pt_regs *regs)
{
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
- int cpu = smp_processor_id();
- char *msg = "Unknown";
struct mce m, *final;
+ char *msg = NULL;
int worst = 0;
/*
@@ -1304,11 +1373,6 @@
*/
int lmce = 1;
- if (__mc_check_crashing_cpu(cpu))
- return;
-
- ist_enter(regs);
-
this_cpu_inc(mce_exception_count);
mce_gather_info(&m, regs);
@@ -1332,9 +1396,10 @@
/*
* Check if this MCE is signaled to only this logical processor,
- * on Intel only.
+ * on Intel, Zhaoxin only.
*/
- if (m.cpuvendor == X86_VENDOR_INTEL)
+ if (m.cpuvendor == X86_VENDOR_INTEL ||
+ m.cpuvendor == X86_VENDOR_ZHAOXIN)
lmce = m.mcgstatus & MCG_STATUS_LMCES;
/*
@@ -1351,7 +1416,7 @@
order = mce_start(&no_way_out);
}
- __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
+ __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
if (!no_way_out)
mce_clear_state(toclear);
@@ -1375,7 +1440,7 @@
* make sure we have the right "msg".
*/
if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
- mce_severity(&m, cfg->tolerant, &msg, true);
+ mce_severity(&m, regs, cfg->tolerant, &msg, true);
mce_panic("Local fatal machine check!", &m, msg);
}
}
@@ -1392,29 +1457,47 @@
if (worst > 0)
irq_work_queue(&mce_irq_work);
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-
- sync_core();
-
if (worst != MCE_AR_SEVERITY && !kill_it)
- goto out_ist;
+ goto out;
+
+ /*
+ * Enable instrumentation around the external facilities like
+ * task_work_add() (via queue_task_work()), fixup_exception() etc.
+ * For now, that is. Fixing this properly would need a lot more involved
+ * reorganization.
+ */
+ instrumentation_begin();
/* Fault was in user mode and we need to take some action */
if ((m.cs & 3) == 3) {
- ist_begin_non_atomic(regs);
- local_irq_enable();
+ /* If this triggers there is no way to recover. Die hard. */
+ BUG_ON(!on_thread_stack() || !user_mode(regs));
- if (kill_it || do_memory_failure(&m))
- force_sig(SIGBUS);
- local_irq_disable();
- ist_end_non_atomic();
+ queue_task_work(&m, msg, kill_it);
+
} else {
- if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
- mce_panic("Failed kernel mode recovery", &m, NULL);
+ /*
+ * Handle an MCE which has happened in kernel space but from
+ * which the kernel can recover: ex_has_fault_handler() has
+ * already verified that the rIP at which the error happened is
+ * a rIP from which the kernel can recover (by jumping to
+ * recovery code specified in _ASM_EXTABLE_FAULT()) and the
+ * corresponding exception handler which would do that is the
+ * proper one.
+ */
+ if (m.kflags & MCE_IN_KERNEL_RECOV) {
+ if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
+ mce_panic("Failed kernel mode recovery", &m, msg);
+ }
+
+ if (m.kflags & MCE_IN_KERNEL_COPYIN)
+ queue_task_work(&m, msg, kill_it);
}
-out_ist:
- ist_exit(regs);
+ instrumentation_end();
+
+out:
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1597,10 +1680,12 @@
m_fl = MCP_DONTLOG;
/*
- * Log the machine checks left over from the previous reset.
+ * Log the machine checks left over from the previous reset. Log them
+ * only, do not start processing them. That will happen in mcheck_late_init()
+ * when all consumers have been registered on the notifier chain.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC | m_fl, &all_banks);
+ machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
cr4_set_bits(X86_CR4_MCE);
@@ -1754,6 +1839,18 @@
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
}
+
+ if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+ }
+ }
+
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
if (cfg->bootlog != 0)
@@ -1792,6 +1889,7 @@
mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
+ mce_flags.amd_threshold = 1;
if (mce_flags.smca) {
msr_ops.ctl = smca_ctl_reg;
@@ -1817,6 +1915,35 @@
}
}
+static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /*
+ * These CPUs have MCA bank 8 which reports only one error type called
+ * SVAD (System View Address Decoder). The reporting of that error is
+ * controlled by IA32_MC8.CTL.0.
+ *
+ * If enabled, prefetching on these CPUs will cause SVAD MCE when
+ * virtual machines start and result in a system panic. Always disable
+ * bank 8 SVAD error by default.
+ */
+ if ((c->x86 == 7 && c->x86_model == 0x1b) ||
+ (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (this_cpu_read(mce_num_banks) > 8)
+ mce_banks[8].ctl = 0;
+ }
+
+ intel_init_cmci();
+ intel_init_lmce();
+ mce_adjust_timer = cmci_intel_adjust_timer;
+}
+
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
+
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
@@ -1838,6 +1965,10 @@
mce_centaur_feature_init(c);
break;
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_init(c);
+ break;
+
default:
break;
}
@@ -1849,6 +1980,11 @@
case X86_VENDOR_INTEL:
mce_intel_feature_clear(c);
break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_clear(c);
+ break;
+
default:
break;
}
@@ -1884,26 +2020,96 @@
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return amd_filter_mce(m);
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return intel_filter_mce(m);
return false;
}
/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void unexpected_machine_check(struct pt_regs *regs)
{
+ instrumentation_begin();
pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
smp_processor_id());
+ instrumentation_end();
}
/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
- unexpected_machine_check;
+void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
-dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
+static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
- machine_check_vector(regs, error_code);
+ irqentry_state_t irq_state;
+
+ WARN_ON_ONCE(user_mode(regs));
+
+ /*
+ * Only required when from kernel mode. See
+ * mce_check_crashing_cpu() for details.
+ */
+ if (machine_check_vector == do_machine_check &&
+ mce_check_crashing_cpu())
+ return;
+
+ irq_state = irqentry_nmi_enter(regs);
+ /*
+ * The call targets are marked noinstr, but objtool can't figure
+ * that out because it's an indirect call. Annotate it.
+ */
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ machine_check_vector(regs);
+ if (regs->flags & X86_EFLAGS_IF)
+ trace_hardirqs_on_prepare();
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
}
+static __always_inline void exc_machine_check_user(struct pt_regs *regs)
+{
+ irqentry_enter_from_user_mode(regs);
+ instrumentation_begin();
+ machine_check_vector(regs);
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* MCE hit kernel mode */
+DEFINE_IDTENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+
+/* The user mode variant. */
+DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_user(regs);
+ local_db_restore(dr7);
+}
+#else
+/* 32bit unified entry point */
+DEFINE_IDTENTRY_RAW(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
+
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off:
@@ -1985,6 +2191,7 @@
* mce=no_cmci Disables CMCI
* mce=no_lmce Disables LMCE
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=print_all Print all machine check logs to console
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
* monarchtimeout is how long to wait for other CPUs on machine
@@ -1993,7 +2200,7 @@
and older.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
- * mce=recovery force enable memcpy_mcsafe()
+ * mce=recovery force enable copy_mc_fragile()
*/
static int __init mcheck_enable(char *str)
{
@@ -2013,6 +2220,8 @@
cfg->lmce_disabled = 1;
else if (!strcmp(str, "dont_log_ce"))
cfg->dont_log_ce = true;
+ else if (!strcmp(str, "print_all"))
+ cfg->print_all = true;
else if (!strcmp(str, "ignore_ce"))
cfg->ignore_ce = true;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
@@ -2035,8 +2244,8 @@
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
- mce_register_decode_chain(&first_nb);
- mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&early_nb);
+ mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
@@ -2071,15 +2280,16 @@
static void vendor_disable_error_reporting(void)
{
/*
- * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
- * are socket-wide.
- * Disabling them for just a single offlined CPU is bad, since it will
- * inhibit reporting for all shared resources on the socket like the
- * last level cache (LLC), the integrated memory controller (iMC), etc.
+ * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
+ * MSRs are socket-wide. Disabling them for just a single offlined CPU
+ * is bad, since it will inhibit reporting for all shared resources on
+ * the socket like the last level cache (LLC), the integrated memory
+ * controller (iMC), etc.
*/
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
- boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
return;
mce_disable_error_reporting();
@@ -2278,6 +2488,7 @@
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
+static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
static struct dev_ext_attribute dev_attr_check_interval = {
__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
@@ -2302,6 +2513,7 @@
#endif
&dev_attr_monarch_timeout.attr,
&dev_attr_dont_log_ce.attr,
+ &dev_attr_print_all.attr,
&dev_attr_ignore_ce.attr,
&dev_attr_cmci_disabled.attr,
NULL
@@ -2474,6 +2686,13 @@
}
}
+/*
+ * When running on XEN, this initcall is ordered against the XEN mcelog
+ * initcall:
+ *
+ * device_initcall(xen_late_init_mcelog);
+ * device_initcall_sync(mcheck_init_device);
+ */
static __init int mcheck_init_device(void)
{
int err;
@@ -2505,6 +2724,10 @@
if (err)
goto err_out_mem;
+ /*
+ * Invokes mce_cpu_online() on all CPUs which are online when
+ * the state is installed.
+ */
err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
mce_cpu_online, mce_cpu_pre_down);
if (err < 0)
@@ -2585,16 +2808,12 @@
static void __init mcheck_debugfs_init(void) { }
#endif
-DEFINE_STATIC_KEY_FALSE(mcsafe_key);
-EXPORT_SYMBOL_GPL(mcsafe_key);
-
static int __init mcheck_late_init(void)
{
if (mca_cfg.recovery)
- static_branch_inc(&mcsafe_key);
+ enable_copy_mc_fragile();
mcheck_debugfs_init();
- cec_init();
/*
* Flush out everything that has been logged during early boot, now that
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index 7c8958d..100fbee 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -29,11 +29,7 @@
* separate MCEs from kernel messages to avoid bogus bug reports.
*/
-static struct mce_log_buffer mcelog = {
- .signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
-};
+static struct mce_log_buffer *mcelog;
static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
@@ -43,23 +39,27 @@
struct mce *mce = (struct mce *)data;
unsigned int entry;
+ if (mce->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
+
mutex_lock(&mce_chrdev_read_mutex);
- entry = mcelog.next;
+ entry = mcelog->next;
/*
* When the buffer fills up discard new entries. Assume that the
* earlier errors are the more interesting ones:
*/
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
+ if (entry >= mcelog->len) {
+ set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags);
goto unlock;
}
- mcelog.next = entry + 1;
+ mcelog->next = entry + 1;
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- mcelog.entry[entry].finished = 1;
+ memcpy(mcelog->entry + entry, mce, sizeof(struct mce));
+ mcelog->entry[entry].finished = 1;
+ mcelog->entry[entry].kflags = 0;
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
@@ -67,6 +67,9 @@
unlock:
mutex_unlock(&mce_chrdev_read_mutex);
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ mce->kflags |= MCE_HANDLED_MCELOG;
+
return NOTIFY_OK;
}
@@ -214,21 +217,21 @@
/* Only supports full reads right now */
err = -EINVAL;
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+ if (*off != 0 || usize < mcelog->len * sizeof(struct mce))
goto out;
- next = mcelog.next;
+ next = mcelog->next;
err = 0;
for (i = 0; i < next; i++) {
- struct mce *m = &mcelog.entry[i];
+ struct mce *m = &mcelog->entry[i];
err |= copy_to_user(buf, m, sizeof(*m));
buf += sizeof(*m);
}
- memset(mcelog.entry, 0, next * sizeof(struct mce));
- mcelog.next = 0;
+ memset(mcelog->entry, 0, next * sizeof(struct mce));
+ mcelog->next = 0;
if (err)
err = -EFAULT;
@@ -242,7 +245,7 @@
static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_chrdev_wait, wait);
- if (READ_ONCE(mcelog.next))
+ if (READ_ONCE(mcelog->next))
return EPOLLIN | EPOLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
return EPOLLIN | EPOLLRDNORM;
@@ -261,13 +264,13 @@
case MCE_GET_RECORD_LEN:
return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
+ return put_user(mcelog->len, p);
case MCE_GETCLEAR_FLAGS: {
unsigned flags;
do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+ flags = mcelog->flags;
+ } while (cmpxchg(&mcelog->flags, flags, 0) != flags);
return put_user(flags, p);
}
@@ -328,6 +331,7 @@
.write = mce_chrdev_write,
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.llseek = no_llseek,
};
@@ -339,8 +343,18 @@
static __init int dev_mcelog_init_device(void)
{
+ int mce_log_len;
int err;
+ mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus());
+ mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL);
+ if (!mcelog)
+ return -ENOMEM;
+
+ memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
+ mcelog->len = mce_log_len;
+ mcelog->recordlen = sizeof(struct mce);
+
/* register character device /dev/mcelog */
err = misc_register(&mce_chrdev_device);
if (err) {
@@ -350,6 +364,7 @@
else
pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ kfree(mcelog);
return err;
}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index eb2d41c..e780830 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -146,9 +146,9 @@
regs.cs = m->cs;
pregs = ®s;
}
- /* in mcheck exeception handler, irq will be disabled */
+ /* do_machine_check() expects interrupts disabled -- at least */
local_irq_save(flags);
- do_machine_check(pregs, 0);
+ do_machine_check(pregs);
local_irq_restore(flags);
m->finished = 0;
}
@@ -199,7 +199,7 @@
* calling irq_enter, but the necessary
* machinery isn't exported currently.
*/
- /*FALL THROUGH*/
+ fallthrough;
case MCJ_CTX_PROCESS:
raise_exception(m, NULL);
break;
@@ -347,7 +347,7 @@
char buf[MAX_FLAG_OPT_SIZE], *__buf;
int err;
- if (cnt > MAX_FLAG_OPT_SIZE)
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
@@ -494,7 +494,7 @@
i_mce.status |= MCI_STATUS_SYNDV;
if (inj_type == SW_INJ) {
- mce_inject_log(&i_mce);
+ mce_log(&i_mce);
return;
}
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index f235096..886d464 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -85,8 +85,10 @@
* initialization is vendor keyed and this
* makes sure none of the backdoors are entered otherwise.
*/
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 0;
+
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
return 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -113,15 +115,16 @@
/*
* BIOS should indicate support for LMCE by setting bit 20 in
- * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
- * generate a #GP fault.
+ * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP
+ * fault. The MSR must also be locked for LMCE_ENABLED to take effect.
+ * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally
+ * locks the MSR in the event that it wasn't already locked by BIOS.
*/
- rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
- if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
- (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
- return true;
+ rdmsrl(MSR_IA32_FEAT_CTL, tmp);
+ if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED)))
+ return false;
- return false;
+ return tmp & FEAT_CTL_LMCE_ENABLED;
}
bool mce_intel_cmci_poll(void)
@@ -190,7 +193,7 @@
if (!atomic_sub_return(1, &cmci_storm_on_cpus))
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
- /* FALLTHROUGH */
+ fallthrough;
case CMCI_STORM_SUBSIDED:
/*
@@ -423,7 +426,7 @@
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-static void intel_init_cmci(void)
+void intel_init_cmci(void)
{
int banks;
@@ -442,7 +445,7 @@
cmci_recheck();
}
-static void intel_init_lmce(void)
+void intel_init_lmce(void)
{
u64 val;
@@ -455,7 +458,7 @@
wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}
-static void intel_clear_lmce(void)
+void intel_clear_lmce(void)
{
u64 val;
@@ -482,6 +485,9 @@
case INTEL_FAM6_BROADWELL_D:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_ICELAKE_X:
+ case INTEL_FAM6_ICELAKE_D:
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
@@ -517,3 +523,21 @@
{
intel_clear_lmce();
}
+
+bool intel_filter_mce(struct mce *m)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
+ if ((c->x86 == 6) &&
+ ((c->x86_model == INTEL_FAM6_HASWELL) ||
+ (c->x86_model == INTEL_FAM6_HASWELL_L) ||
+ (c->x86_model == INTEL_FAM6_BROADWELL) ||
+ (c->x86_model == INTEL_FAM6_HASWELL_G) ||
+ (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
+ (m->bank == 0) &&
+ ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
+ return true;
+
+ return false;
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 231954f..88dcc79 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -8,6 +8,9 @@
#include <linux/device.h>
#include <asm/mce.h>
+/* Pointer to the installed machine check handler for this CPU setup. */
+extern void (*machine_check_vector)(struct pt_regs *);
+
enum severity_level {
MCE_NO_SEVERITY,
MCE_DEFERRED_SEVERITY,
@@ -35,7 +38,8 @@
int mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);
-extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, struct pt_regs *regs,
+ int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
@@ -45,11 +49,19 @@
bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
+void intel_init_cmci(void);
+void intel_init_lmce(void);
+void intel_clear_lmce(void);
+bool intel_filter_mce(struct mce *m);
#else
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
+static inline void intel_init_cmci(void) { }
+static inline void intel_init_lmce(void) { }
+static inline void intel_clear_lmce(void) { }
+static inline bool intel_filter_mce(struct mce *m) { return false; };
#endif
void mce_timer_kick(unsigned long interval);
@@ -78,8 +90,6 @@
}
#endif
-void mce_inject_log(struct mce *m);
-
/*
* We consider records to be equivalent if bank+status+addr+misc all match.
* This is only used when the system is going down because of a fatal error
@@ -110,6 +120,7 @@
bool dont_log_ce;
bool cmci_disabled;
bool ignore_ce;
+ bool print_all;
__u64 lmce_disabled : 1,
disabled : 1,
@@ -139,7 +150,7 @@
* Recovery. It indicates support for data poisoning in HW and deferred
* error interrupts.
*/
- succor : 1,
+ succor : 1,
/*
* (AMD) SMCA: This bit indicates support for Scalable MCA which expands
@@ -147,9 +158,12 @@
* banks. Also, to accommodate the new banks and registers, the MCA
* register space is moved to a new MSR range.
*/
- smca : 1,
+ smca : 1,
- __reserved_0 : 61;
+ /* AMD-style error thresholding banks present. */
+ amd_threshold : 1,
+
+ __reserved_0 : 60;
};
extern struct mce_vendor_flags mce_flags;
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 4ae6df5..19e90ca 100644
--- a/arch/x86/kernel/cpu/mce/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/smp.h>
+#include <linux/hardirq.h>
#include <asm/processor.h>
#include <asm/traps.h>
@@ -20,12 +21,11 @@
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
-static void pentium_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
- ist_enter(regs);
-
+ instrumentation_begin();
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -38,8 +38,7 @@
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- ist_exit(regs);
+ instrumentation_end();
}
/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 0d09eb1..83df991 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -13,6 +13,9 @@
#include <asm/mce.h>
#include <asm/intel-family.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include "internal.h"
@@ -97,10 +100,6 @@
EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
),
MCESEV(
- DEFERRED, "Deferred error",
- NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
- ),
- MCESEV(
KEEP, "Corrected error",
NOSER, BITCLR(MCI_STATUS_UC)
),
@@ -216,6 +215,47 @@
#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
(MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+static bool is_copy_from_user(struct pt_regs *regs)
+{
+ u8 insn_buf[MAX_INSN_SIZE];
+ struct insn insn;
+ unsigned long addr;
+
+ if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
+ return false;
+
+ kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
+ insn_get_opcode(&insn);
+ if (!insn.opcode.got)
+ return false;
+
+ switch (insn.opcode.value) {
+ /* MOV mem,reg */
+ case 0x8A: case 0x8B:
+ /* MOVZ mem,reg */
+ case 0xB60F: case 0xB70F:
+ insn_get_modrm(&insn);
+ insn_get_sib(&insn);
+ if (!insn.modrm.got || !insn.sib.got)
+ return false;
+ addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ break;
+ /* REP MOVS */
+ case 0xA4: case 0xA5:
+ addr = regs->si;
+ break;
+ default:
+ return false;
+ }
+
+ if (fault_in_kernel_space(addr))
+ return false;
+
+ current->mce_vaddr = (void __user *)addr;
+
+ return true;
+}
+
/*
* If mcgstatus indicated that ip/cs on the stack were
* no good, then "m->cs" will be zero and we will have
@@ -227,12 +267,26 @@
* distinguish an exception taken in user from from one
* taken in the kernel.
*/
-static int error_context(struct mce *m)
+static int error_context(struct mce *m, struct pt_regs *regs)
{
+ enum handler_type t;
+
if ((m->cs & 3) == 3)
return IN_USER;
- if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+ if (!mc_recoverable(m->mcgstatus))
+ return IN_KERNEL;
+
+ t = ex_get_fault_handler_type(m->ip);
+ if (t == EX_HANDLER_FAULT) {
+ m->kflags |= MCE_IN_KERNEL_RECOV;
return IN_KERNEL_RECOV;
+ }
+ if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) {
+ m->kflags |= MCE_IN_KERNEL_RECOV;
+ m->kflags |= MCE_IN_KERNEL_COPYIN;
+ return IN_KERNEL_RECOV;
+ }
+
return IN_KERNEL;
}
@@ -267,9 +321,10 @@
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
*/
-static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
+ char **msg, bool is_excp)
{
- enum context ctx = error_context(m);
+ enum context ctx = error_context(m, regs);
/* Processor Context Corrupt, no need to fumble too much, die! */
if (m->status & MCI_STATUS_PCC)
@@ -319,10 +374,11 @@
return MCE_KEEP_SEVERITY;
}
-static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
+static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
+ int tolerant, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
- enum context ctx = error_context(m);
+ enum context ctx = error_context(m, regs);
struct severity *s;
for (s = severities;; s++) {
@@ -356,7 +412,7 @@
}
/* Default to mce_severity_intel */
-int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) =
mce_severity_intel;
void __init mcheck_vendor_init_severity(void)
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index bc441d6..a7cd2d2 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -40,15 +40,58 @@
#define THERMAL_THROTTLING_EVENT 0
#define POWER_LIMIT_EVENT 1
-/*
- * Current thermal event state:
+/**
+ * struct _thermal_state - Represent the current thermal event state
+ * @next_check: Stores the next timestamp, when it is allowed
+ * to log the next warning message.
+ * @last_interrupt_time: Stores the timestamp for the last threshold
+ * high event.
+ * @therm_work: Delayed workqueue structure
+ * @count: Stores the current running count for thermal
+ * or power threshold interrupts.
+ * @last_count: Stores the previous running count for thermal
+ * or power threshold interrupts.
+ * @max_time_ms: This shows the maximum amount of time CPU was
+ * in throttled state for a single thermal
+ * threshold high to low state.
+ * @total_time_ms: This is a cumulative time during which CPU was
+ * in the throttled state.
+ * @rate_control_active: Set when a throttling message is logged.
+ * This is used for the purpose of rate-control.
+ * @new_event: Stores the last high/low status of the
+ * THERM_STATUS_PROCHOT or
+ * THERM_STATUS_POWER_LIMIT.
+ * @level: Stores whether this _thermal_state instance is
+ * for a CORE level or for PACKAGE level.
+ * @sample_index: Index for storing the next sample in the buffer
+ * temp_samples[].
+ * @sample_count: Total number of samples collected in the buffer
+ * temp_samples[].
+ * @average: The last moving average of temperature samples
+ * @baseline_temp: Temperature at which thermal threshold high
+ * interrupt was generated.
+ * @temp_samples: Storage for temperature samples to calculate
+ * moving average.
+ *
+ * This structure is used to represent data related to thermal state for a CPU.
+ * There is a separate storage for core and package level for each CPU.
*/
struct _thermal_state {
- bool new_event;
- int event;
u64 next_check;
+ u64 last_interrupt_time;
+ struct delayed_work therm_work;
unsigned long count;
unsigned long last_count;
+ unsigned long max_time_ms;
+ unsigned long total_time_ms;
+ bool rate_control_active;
+ bool new_event;
+ u8 level;
+ u8 sample_index;
+ u8 sample_count;
+ u8 average;
+ u8 baseline_temp;
+ u8 temp_samples[3];
};
struct thermal_state {
@@ -121,8 +164,22 @@
define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(core_throttle, max_time_ms);
+define_therm_throt_device_one_ro(core_throttle_max_time_ms);
+
+define_therm_throt_device_show_func(package_throttle, max_time_ms);
+define_therm_throt_device_one_ro(package_throttle_max_time_ms);
+
+define_therm_throt_device_show_func(core_throttle, total_time_ms);
+define_therm_throt_device_one_ro(core_throttle_total_time_ms);
+
+define_therm_throt_device_show_func(package_throttle, total_time_ms);
+define_therm_throt_device_one_ro(package_throttle_total_time_ms);
+
static struct attribute *thermal_throttle_attrs[] = {
&dev_attr_core_throttle_count.attr,
+ &dev_attr_core_throttle_max_time_ms.attr,
+ &dev_attr_core_throttle_total_time_ms.attr,
NULL
};
@@ -135,6 +192,112 @@
#define CORE_LEVEL 0
#define PACKAGE_LEVEL 1
+#define THERM_THROT_POLL_INTERVAL HZ
+#define THERM_STATUS_PROCHOT_LOG BIT(1)
+
+#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
+#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
+
+static void clear_therm_status_log(int level)
+{
+ int msr;
+ u64 mask, msr_val;
+
+ if (level == CORE_LEVEL) {
+ msr = MSR_IA32_THERM_STATUS;
+ mask = THERM_STATUS_CLEAR_CORE_MASK;
+ } else {
+ msr = MSR_IA32_PACKAGE_THERM_STATUS;
+ mask = THERM_STATUS_CLEAR_PKG_MASK;
+ }
+
+ rdmsrl(msr, msr_val);
+ msr_val &= mask;
+ wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
+}
+
+static void get_therm_status(int level, bool *proc_hot, u8 *temp)
+{
+ int msr;
+ u64 msr_val;
+
+ if (level == CORE_LEVEL)
+ msr = MSR_IA32_THERM_STATUS;
+ else
+ msr = MSR_IA32_PACKAGE_THERM_STATUS;
+
+ rdmsrl(msr, msr_val);
+ if (msr_val & THERM_STATUS_PROCHOT_LOG)
+ *proc_hot = true;
+ else
+ *proc_hot = false;
+
+ *temp = (msr_val >> 16) & 0x7F;
+}
+
+static void __maybe_unused throttle_active_work(struct work_struct *work)
+{
+ struct _thermal_state *state = container_of(to_delayed_work(work),
+ struct _thermal_state, therm_work);
+ unsigned int i, avg, this_cpu = smp_processor_id();
+ u64 now = get_jiffies_64();
+ bool hot;
+ u8 temp;
+
+ get_therm_status(state->level, &hot, &temp);
+ /* temperature value is offset from the max so lesser means hotter */
+ if (!hot && temp > state->baseline_temp) {
+ if (state->rate_control_active)
+ pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
+ this_cpu,
+ state->level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+
+ state->rate_control_active = false;
+ return;
+ }
+
+ if (time_before64(now, state->next_check) &&
+ state->rate_control_active)
+ goto re_arm;
+
+ state->next_check = now + CHECK_INTERVAL;
+
+ if (state->count != state->last_count) {
+ /* There was one new thermal interrupt */
+ state->last_count = state->count;
+ state->average = 0;
+ state->sample_count = 0;
+ state->sample_index = 0;
+ }
+
+ state->temp_samples[state->sample_index] = temp;
+ state->sample_count++;
+ state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
+ if (state->sample_count < ARRAY_SIZE(state->temp_samples))
+ goto re_arm;
+
+ avg = 0;
+ for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
+ avg += state->temp_samples[i];
+
+ avg /= ARRAY_SIZE(state->temp_samples);
+
+ if (state->average > avg) {
+ pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
+ this_cpu,
+ state->level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ state->rate_control_active = true;
+ }
+
+ state->average = avg;
+
+re_arm:
+ clear_therm_status_log(state->level);
+ schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
+}
+
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@@ -178,27 +341,33 @@
if (new_event)
state->count++;
- if (time_before64(now, state->next_check) &&
- state->count != state->last_count)
+ if (event != THERMAL_THROTTLING_EVENT)
return;
- state->next_check = now + CHECK_INTERVAL;
- state->last_count = state->count;
+ if (new_event && !state->last_interrupt_time) {
+ bool hot;
+ u8 temp;
- /* if we just entered the thermal event */
- if (new_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- pr_warn("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package",
- state->count);
- return;
- }
- if (old_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
- level == CORE_LEVEL ? "Core" : "Package");
- return;
+ get_therm_status(state->level, &hot, &temp);
+ /*
+ * Ignore short temperature spike as the system is not close
+ * to PROCHOT. 10C offset is large enough to ignore. It is
+ * already dropped from the high threshold temperature.
+ */
+ if (temp > 10)
+ return;
+
+ state->baseline_temp = temp;
+ state->last_interrupt_time = now;
+ schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
+ } else if (old_event && state->last_interrupt_time) {
+ unsigned long throttle_time;
+
+ throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
+ if (throttle_time > state->max_time_ms)
+ state->max_time_ms = throttle_time;
+ state->total_time_ms += throttle_time;
+ state->last_interrupt_time = 0;
}
}
@@ -244,20 +413,47 @@
if (err)
return err;
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
+ if (err)
+ goto del_group;
+ }
+
if (cpu_has(c, X86_FEATURE_PTS)) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ if (err)
+ goto del_group;
+
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_max_time_ms.attr,
+ thermal_attr_group.name);
+ if (err)
+ goto del_group;
+
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_total_time_ms.attr,
+ thermal_attr_group.name);
+ if (err)
+ goto del_group;
+
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
+ if (err)
+ goto del_group;
+ }
}
+ return 0;
+
+del_group:
+ sysfs_remove_group(&dev->kobj, &thermal_attr_group);
+
return err;
}
@@ -269,14 +465,38 @@
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int thermal_throttle_online(unsigned int cpu)
{
+ struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
+ u32 l;
+
+ state->package_throttle.level = PACKAGE_LEVEL;
+ state->core_throttle.level = CORE_LEVEL;
+
+ INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
+ INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
+
+ /* Unmask the thermal vector after the above workqueues are initialized. */
+ l = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
return thermal_throttle_add_dev(dev, cpu);
}
static int thermal_throttle_offline(unsigned int cpu)
{
+ struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
+ u32 l;
+
+ /* Mask the thermal vector before draining evtl. pending work */
+ l = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED);
+
+ cancel_delayed_work_sync(&state->package_throttle.therm_work);
+ cancel_delayed_work_sync(&state->core_throttle.therm_work);
+
+ state->package_throttle.rate_control_active = false;
+ state->core_throttle.rate_control_active = false;
thermal_throttle_remove_dev(dev);
return 0;
@@ -394,14 +614,13 @@
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
{
- entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- exiting_ack_irq();
+ ack_APIC_irq();
}
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
@@ -512,10 +731,6 @@
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
- /* Unmask the thermal vector: */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
tm2 ? "TM2" : "TM1");
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 28812cc..6a059a0 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -21,12 +21,11 @@
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
{
- entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
- exiting_ack_irq();
+ ack_APIC_irq();
}
diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index a30ea13..9c9f0ab 100644
--- a/arch/x86/kernel/cpu/mce/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -6,6 +6,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
+#include <linux/hardirq.h>
#include <asm/processor.h>
#include <asm/traps.h>
@@ -16,14 +17,12 @@
#include "internal.h"
/* Machine check handler for WinChip C6: */
-static void winchip_machine_check(struct pt_regs *regs, long error_code)
+static noinstr void winchip_machine_check(struct pt_regs *regs)
{
- ist_enter(regs);
-
+ instrumentation_begin();
pr_emerg("CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- ist_exit(regs);
+ instrumentation_end();
}
/* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index a0e52bd..3f6b137 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -567,7 +567,7 @@
void reload_ucode_amd(void)
{
struct microcode_amd *mc;
- u32 rev, dummy;
+ u32 rev, dummy __always_unused;
mc = (struct microcode_amd *)amd_ucode_patch;
@@ -673,7 +673,7 @@
struct ucode_cpu_info *uci;
struct ucode_patch *p;
enum ucode_state ret;
- u32 rev, dummy;
+ u32 rev, dummy __always_unused;
BUG_ON(raw_smp_processor_id() != cpu);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 4a4198b..bbbd248 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -63,11 +63,6 @@
*/
static DEFINE_MUTEX(microcode_mutex);
-/*
- * Serialize late loading so that CPUs get updated one-by-one.
- */
-static DEFINE_RAW_SPINLOCK(update_lock);
-
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
struct cpu_info_ctx {
@@ -150,7 +145,6 @@
bool get_builtin_firmware(struct cpio_data *cd, const char *name)
{
-#ifdef CONFIG_FW_LOADER
struct builtin_fw *b_fw;
for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
@@ -160,7 +154,6 @@
return true;
}
}
-#endif
return false;
}
@@ -550,8 +543,7 @@
/*
* Returns:
* < 0 - on error
- * 0 - no update done
- * 1 - microcode was updated
+ * 0 - success (no update done or microcode was updated)
*/
static int __reload_late(void *info)
{
@@ -566,27 +558,38 @@
if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
return -1;
- raw_spin_lock(&update_lock);
- apply_microcode_local(&err);
- raw_spin_unlock(&update_lock);
+ /*
+ * On an SMT system, it suffices to load the microcode on one sibling of
+ * the core because the microcode engine is shared between the threads.
+ * Synchronization still needs to take place so that no concurrent
+ * loading attempts happen on multiple threads of an SMT core. See
+ * below.
+ */
+ if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
+ apply_microcode_local(&err);
+ else
+ goto wait_for_siblings;
- /* siblings return UCODE_OK because their engine got updated already */
- if (err > UCODE_NFOUND) {
- pr_warn("Error reloading microcode on CPU %d\n", cpu);
+ if (err >= UCODE_NFOUND) {
+ if (err == UCODE_ERROR)
+ pr_warn("Error reloading microcode on CPU %d\n", cpu);
+
ret = -1;
- } else if (err == UCODE_UPDATED || err == UCODE_OK) {
- ret = 1;
}
- /*
- * Increase the wait timeout to a safe value here since we're
- * serializing the microcode update and that could take a while on a
- * large number of CPUs. And that is fine as the *actual* timeout will
- * be determined by the last CPU finished updating and thus cut short.
- */
- if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
+wait_for_siblings:
+ if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC))
panic("Timeout during microcode update!\n");
+ /*
+ * At least one thread has completed update on each core.
+ * For others, simply call the update to make sure the
+ * per-cpu cpuinfo can be updated with right microcode
+ * revision.
+ */
+ if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu)
+ apply_microcode_local(&err);
+
return ret;
}
@@ -602,7 +605,7 @@
atomic_set(&late_cpus_out, 0);
ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
- if (ret > 0)
+ if (ret == 0)
microcode_check();
pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode);
@@ -643,7 +646,7 @@
put:
put_online_cpus();
- if (ret >= 0)
+ if (ret == 0)
ret = size;
return ret;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index b224d4d..7e8e07b 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -748,6 +748,7 @@
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
struct microcode_intel *mc;
enum ucode_state ret;
static int prev_rev;
@@ -793,7 +794,7 @@
return UCODE_ERROR;
}
- if (rev != prev_rev) {
+ if (bsp && rev != prev_rev) {
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
rev,
mc->hdr.date & 0xffff,
@@ -809,7 +810,7 @@
c->microcode = rev;
/* Update boot_cpu_data's revision too, if we're on the BSP: */
- if (c->cpu_index == boot_cpu_data.cpu_index)
+ if (bsp)
boot_cpu_data.microcode = rev;
return ret;
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index aed45b8..1db560e 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,8 +6,7 @@
set -e
-IN=$1
-OUT=$2
+OUT=$1
dump_array()
{
@@ -15,6 +14,7 @@
SIZE=$2
PFX=$3
POSTFIX=$4
+ IN=$5
PFX_SZ=$(echo $PFX | wc -c)
TABS="$(printf '\t\t\t\t\t')"
@@ -57,11 +57,18 @@
echo "#endif"
echo ""
- dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2
echo ""
- dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2
+ echo ""
+ echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES"
+ echo "#ifndef _ASM_X86_VMXFEATURES_H"
+ echo "#include <asm/vmxfeatures.h>"
+ echo "#endif"
+ dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3
+ echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */"
) > $OUT
trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 1c2f9ba..65d1171 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -23,6 +23,7 @@
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
+#include <asm/idtentry.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
@@ -40,11 +41,10 @@
static void (*hv_kexec_handler)(void);
static void (*hv_crash_handler)(struct pt_regs *regs);
-__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- entering_irq();
inc_irq_stat(irq_hv_callback_count);
if (vmbus_handler)
vmbus_handler();
@@ -52,13 +52,17 @@
if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
ack_APIC_irq();
- exiting_irq();
set_irq_regs(old_regs);
}
-void hv_setup_vmbus_irq(void (*handler)(void))
+int hv_setup_vmbus_irq(int irq, void (*handler)(void))
{
+ /*
+ * The 'irq' argument is ignored on x86/x64 because a hard-coded
+ * interrupt vector is used for Hyper-V interrupts.
+ */
vmbus_handler = handler;
+ return 0;
}
void hv_remove_vmbus_irq(void)
@@ -73,19 +77,16 @@
* Routines to do per-architecture handling of stimer0
* interrupts when in Direct Mode
*/
-
-__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- entering_irq();
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
ack_APIC_irq();
- exiting_irq();
set_irq_regs(old_regs);
}
@@ -134,14 +135,32 @@
{
if (kexec_in_progress && hv_kexec_handler)
hv_kexec_handler();
+
+ /*
+ * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
+ * corrupts the old VP Assist Pages and can crash the kexec kernel.
+ */
+ if (kexec_in_progress && hyperv_init_cpuhp > 0)
+ cpuhp_remove_state(hyperv_init_cpuhp);
+
+ /* The function calls stop_other_cpus(). */
native_machine_shutdown();
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ if (kexec_in_progress)
+ hyperv_cleanup();
}
static void hv_machine_crash_shutdown(struct pt_regs *regs)
{
if (hv_crash_handler)
hv_crash_handler(regs);
+
+ /* The function calls crash_smp_send_stop(). */
native_machine_crash_shutdown(regs);
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ hyperv_cleanup();
}
#endif /* CONFIG_KEXEC_CORE */
#endif /* CONFIG_HYPERV */
@@ -252,7 +271,7 @@
hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
}
- if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
x86_platform.calibrate_cpu = hv_get_tsc_khz;
@@ -274,7 +293,7 @@
crash_kexec_post_notifiers = true;
#ifdef CONFIG_X86_LOCAL_APIC
- if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
/*
* Get the APIC frequency.
@@ -300,7 +319,10 @@
machine_ops.shutdown = hv_machine_shutdown;
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
- mark_tsc_unstable("running on Hyper-V");
+ if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
+ wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ }
/*
* Generation 2 instances don't support reading the NMI status from
@@ -326,17 +348,19 @@
x86_platform.apic_post_init = hyperv_init;
hyperv_setup_mmu_ops();
/* Setup the IDT for hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback);
/* Setup the IDT for reenlightenment notifications */
- if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
+ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) {
alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
- hyperv_reenlightenment_vector);
+ asm_sysvec_hyperv_reenlightenment);
+ }
/* Setup the IDT for stimer0 */
- if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
alloc_intr_gate(HYPERV_STIMER0_VECTOR,
- hv_stimer0_callback_vector);
+ asm_sysvec_hyperv_stimer0);
+ }
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
@@ -356,13 +380,13 @@
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
#endif
-}
-
-void hv_setup_sched_clock(void *sched_clock)
-{
-#ifdef CONFIG_PARAVIRT
- pv_ops.time.sched_clock = sched_clock;
-#endif
+ /*
+ * TSC should be marked as unstable only after Hyper-V
+ * clocksource has been initialized. This ensures that the
+ * stability of the sched_clock is not altered.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ mark_tsc_unstable("running on Hyper-V");
}
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 7218280..ca67091 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -98,7 +98,7 @@
case 7:
if (size < 0x40)
break;
- /* Else, fall through */
+ fallthrough;
case 6:
case 5:
case 4:
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4ea906f..a29997e 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -15,7 +15,7 @@
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
@@ -761,7 +761,7 @@
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb();
+ flush_tlb_local();
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
@@ -778,7 +778,7 @@
{
/* Flush TLBs (no need to flush caches - they are disabled) */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb();
+ flush_tlb_local();
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 4d36dcc..a5c506f 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -101,9 +101,6 @@
int length;
size_t linelen;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
memset(line, 0, LINE_SIZE);
len = min_t(size_t, len, LINE_SIZE - 1);
@@ -226,8 +223,6 @@
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 0);
@@ -236,24 +231,18 @@
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
@@ -279,8 +268,6 @@
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 1);
@@ -289,8 +276,6 @@
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
break;
@@ -298,16 +283,12 @@
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
@@ -373,28 +354,6 @@
return single_release(ino, file);
}
-static int mtrr_seq_show(struct seq_file *seq, void *offset);
-
-static int mtrr_open(struct inode *inode, struct file *file)
-{
- if (!mtrr_if)
- return -EIO;
- if (!mtrr_if->get)
- return -ENXIO;
- return single_open(file, mtrr_seq_show, NULL);
-}
-
-static const struct file_operations mtrr_fops = {
- .owner = THIS_MODULE,
- .open = mtrr_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = mtrr_write,
- .unlocked_ioctl = mtrr_ioctl,
- .compat_ioctl = mtrr_ioctl,
- .release = mtrr_close,
-};
-
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
@@ -426,6 +385,29 @@
return 0;
}
+static int mtrr_open(struct inode *inode, struct file *file)
+{
+ if (!mtrr_if)
+ return -EIO;
+ if (!mtrr_if->get)
+ return -ENXIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return single_open(file, mtrr_seq_show, NULL);
+}
+
+static const struct proc_ops mtrr_proc_ops = {
+ .proc_open = mtrr_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = mtrr_write,
+ .proc_ioctl = mtrr_ioctl,
+#ifdef CONFIG_COMPAT
+ .proc_compat_ioctl = mtrr_ioctl,
+#endif
+ .proc_release = mtrr_close,
+};
+
static int __init mtrr_if_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -436,7 +418,7 @@
(!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
return -ENODEV;
- proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
+ proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops);
return 0;
}
arch_initcall(mtrr_if_init);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 507039c..6a80f36 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -52,7 +52,7 @@
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9556930..a548d91 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,6 +63,10 @@
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ case X86_VENDOR_CENTAUR:
+ return msr - MSR_ARCH_PERFMON_PERFCTR0;
}
return 0;
}
@@ -92,6 +96,10 @@
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ case X86_VENDOR_CENTAUR:
+ return msr - MSR_ARCH_PERFMON_EVENTSEL0;
}
return 0;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index cb2e498..4eec888 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -7,6 +7,10 @@
#include "cpu.h"
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+extern const char * const x86_vmx_flags[NVMXINTS*32];
+#endif
+
/*
* Get CPU information for use by the procfs.
*/
@@ -102,6 +106,17 @@
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) {
+ seq_puts(m, "\nvmx flags\t:");
+ for (i = 0; i < 32*NVMXINTS; i++) {
+ if (test_bit(i, (unsigned long *)c->vmx_capability) &&
+ x86_vmx_flags[i] != NULL)
+ seq_printf(m, " %s", x86_vmx_flags[i]);
+ }
+ }
+#endif
+
seq_puts(m, "\nbugs\t\t:");
for (i = 0; i < 32*NBUGINTS; i++) {
unsigned int bug_bit = 32*NCAPINTS + i;
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 5c900f9..c4be620 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -29,7 +29,8 @@
#ifdef CONFIG_ARCH_RANDOM
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
- unsigned long tmp;
+ unsigned int changed = 0;
+ unsigned long tmp, prev;
int i;
if (!cpu_has(c, X86_FEATURE_RDRAND))
@@ -42,5 +43,24 @@
return;
}
}
+
+ /*
+ * Stupid sanity-check whether RDRAND does *actually* generate
+ * some at least random-looking data.
+ */
+ prev = tmp;
+ for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
+ if (rdrand_long(&tmp)) {
+ if (prev != tmp)
+ changed++;
+
+ prev = tmp;
+ }
+ }
+
+ if (WARN_ON_ONCE(!changed))
+ pr_emerg(
+"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\"");
+
}
#endif
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 87a34b6..4ccb903 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -22,7 +22,7 @@
#include <linux/cpuhotplug.h>
#include <asm/intel-family.h>
-#include <asm/resctrl_sched.h>
+#include <asm/resctrl.h>
#include "internal.h"
/* Mutex to protect rdtgroup access. */
@@ -168,6 +168,7 @@
.name = "MB",
.domains = domain_init(RDT_RESOURCE_MBA),
.cache_level = 3,
+ .parse_ctrlval = parse_bw,
.format_str = "%d=%*u",
.fflags = RFTYPE_RES_MB,
},
@@ -254,23 +255,30 @@
{
union cpuid_0x10_3_eax eax;
union cpuid_0x10_x_edx edx;
- u32 ebx, ecx;
+ u32 ebx, ecx, max_delay;
cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
r->num_closid = edx.split.cos_max + 1;
- r->membw.max_delay = eax.split.max_delay + 1;
+ max_delay = eax.split.max_delay + 1;
r->default_ctrl = MAX_MBA_BW;
- r->membw.mbm_width = MBM_CNTR_WIDTH;
+ r->membw.arch_needs_linear = true;
if (ecx & MBA_IS_LINEAR) {
r->membw.delay_linear = true;
- r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay;
- r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay;
+ r->membw.min_bw = MAX_MBA_BW - max_delay;
+ r->membw.bw_gran = MAX_MBA_BW - max_delay;
} else {
if (!rdt_get_mb_table(r))
return false;
+ r->membw.arch_needs_linear = false;
}
r->data_width = 3;
+ if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
+ r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
+ else
+ r->membw.throttle_mode = THREAD_THROTTLE_MAX;
+ thread_throttle_mode_init();
+
r->alloc_capable = true;
r->alloc_enabled = true;
@@ -289,8 +297,13 @@
/* AMD does not use delay */
r->membw.delay_linear = false;
+ r->membw.arch_needs_linear = false;
- r->membw.mbm_width = MBM_CNTR_WIDTH_AMD;
+ /*
+ * AMD does not use memory delay throttle model to control
+ * the allocation like Intel does.
+ */
+ r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
r->membw.min_bw = 0;
r->membw.bw_gran = 1;
/* Max value is 2048, Data width should be 4 in decimal */
@@ -348,19 +361,6 @@
rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE);
}
-static int get_cache_id(int cpu, int level)
-{
- struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
- int i;
-
- for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == level)
- return ci->info_list[i].id;
- }
-
- return -1;
-}
-
static void
mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
{
@@ -558,18 +558,20 @@
*/
static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
- int id = get_cache_id(cpu, r->cache_level);
+ int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
struct list_head *add_pos = NULL;
struct rdt_domain *d;
d = rdt_find_domain(r, id, &add_pos);
if (IS_ERR(d)) {
- pr_warn("Could't find cache id for cpu %d\n", cpu);
+ pr_warn("Couldn't find cache id for CPU %d\n", cpu);
return;
}
if (d) {
cpumask_set_cpu(cpu, &d->cpu_mask);
+ if (r->cache.arch_has_per_cpu_cfg)
+ rdt_domain_reconfigure_cdp(r);
return;
}
@@ -588,6 +590,8 @@
}
if (r->mon_capable && domain_setup_mon_state(r, d)) {
+ kfree(d->ctrl_val);
+ kfree(d->mbps_val);
kfree(d);
return;
}
@@ -604,12 +608,12 @@
static void domain_remove_cpu(int cpu, struct rdt_resource *r)
{
- int id = get_cache_id(cpu, r->cache_level);
+ int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
struct rdt_domain *d;
d = rdt_find_domain(r, id, NULL);
if (IS_ERR_OR_NULL(d)) {
- pr_warn("Could't find cache id for cpu %d\n", cpu);
+ pr_warn("Couldn't find cache id for CPU %d\n", cpu);
return;
}
@@ -920,12 +924,13 @@
r->rid == RDT_RESOURCE_L3CODE ||
r->rid == RDT_RESOURCE_L2 ||
r->rid == RDT_RESOURCE_L2DATA ||
- r->rid == RDT_RESOURCE_L2CODE)
- r->cbm_validate = cbm_validate_intel;
- else if (r->rid == RDT_RESOURCE_MBA) {
+ r->rid == RDT_RESOURCE_L2CODE) {
+ r->cache.arch_has_sparse_bitmaps = false;
+ r->cache.arch_has_empty_bitmaps = false;
+ r->cache.arch_has_per_cpu_cfg = false;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
r->msr_base = MSR_IA32_MBA_THRTL_BASE;
r->msr_update = mba_wrmsr_intel;
- r->parse_ctrlval = parse_bw_intel;
}
}
}
@@ -940,12 +945,13 @@
r->rid == RDT_RESOURCE_L3CODE ||
r->rid == RDT_RESOURCE_L2 ||
r->rid == RDT_RESOURCE_L2DATA ||
- r->rid == RDT_RESOURCE_L2CODE)
- r->cbm_validate = cbm_validate_amd;
- else if (r->rid == RDT_RESOURCE_MBA) {
+ r->rid == RDT_RESOURCE_L2CODE) {
+ r->cache.arch_has_sparse_bitmaps = true;
+ r->cache.arch_has_empty_bitmaps = true;
+ r->cache.arch_has_per_cpu_cfg = true;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
r->msr_base = MSR_IA32_MBA_BW_BASE;
r->msr_update = mba_wrmsr_amd;
- r->parse_ctrlval = parse_bw_amd;
}
}
}
@@ -960,6 +966,36 @@
static enum cpuhp_state rdt_online;
+/* Runs once on the BSP during boot. */
+void resctrl_cpu_detect(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ c->x86_cache_mbm_width_offset = -1;
+ return;
+ }
+
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = cpuid_ebx(0xf);
+
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ c->x86_cache_mbm_width_offset = eax & 0xff;
+
+ if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
+ c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
+ }
+}
+
static int __init resctrl_late_init(void)
{
struct rdt_resource *r;
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 055c861..c877642 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -23,53 +23,6 @@
/*
* Check whether MBA bandwidth percentage value is correct. The value is
- * checked against the minimum and maximum bandwidth values specified by
- * the hardware. The allocated bandwidth percentage is rounded to the next
- * control step available on the hardware.
- */
-static bool bw_validate_amd(char *buf, unsigned long *data,
- struct rdt_resource *r)
-{
- unsigned long bw;
- int ret;
-
- ret = kstrtoul(buf, 10, &bw);
- if (ret) {
- rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
- return false;
- }
-
- if (bw < r->membw.min_bw || bw > r->default_ctrl) {
- rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
- r->membw.min_bw, r->default_ctrl);
- return false;
- }
-
- *data = roundup(bw, (unsigned long)r->membw.bw_gran);
- return true;
-}
-
-int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r,
- struct rdt_domain *d)
-{
- unsigned long bw_val;
-
- if (d->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
- return -EINVAL;
- }
-
- if (!bw_validate_amd(data->buf, &bw_val, r))
- return -EINVAL;
-
- d->new_ctrl = bw_val;
- d->have_new_ctrl = true;
-
- return 0;
-}
-
-/*
- * Check whether MBA bandwidth percentage value is correct. The value is
* checked against the minimum and max bandwidth values specified by the
* hardware. The allocated bandwidth percentage is rounded to the next
* control step available on the hardware.
@@ -82,7 +35,7 @@
/*
* Only linear delay values is supported for current Intel SKUs.
*/
- if (!r->membw.delay_linear) {
+ if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
rdt_last_cmd_puts("No support for non-linear MB domains\n");
return false;
}
@@ -104,8 +57,8 @@
return true;
}
-int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r,
- struct rdt_domain *d)
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d)
{
unsigned long bw_val;
@@ -123,12 +76,14 @@
}
/*
- * Check whether a cache bit mask is valid. The SDM says:
+ * Check whether a cache bit mask is valid.
+ * For Intel the SDM says:
* Please note that all (and only) contiguous '1' combinations
* are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
* Additionally Haswell requires at least two bits set.
+ * AMD allows non-contiguous bitmasks.
*/
-bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r)
+static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
{
unsigned long first_bit, zero_bit, val;
unsigned int cbm_len = r->cache.cbm_len;
@@ -140,7 +95,8 @@
return false;
}
- if (val == 0 || val > r->default_ctrl) {
+ if ((!r->cache.arch_has_empty_bitmaps && val == 0) ||
+ val > r->default_ctrl) {
rdt_last_cmd_puts("Mask out of range\n");
return false;
}
@@ -148,7 +104,9 @@
first_bit = find_first_bit(&val, cbm_len);
zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
- if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) {
+ /* Are non-contiguous bitmaps allowed? */
+ if (!r->cache.arch_has_sparse_bitmaps &&
+ (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
return false;
}
@@ -164,30 +122,6 @@
}
/*
- * Check whether a cache bit mask is valid. AMD allows non-contiguous
- * bitmasks
- */
-bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r)
-{
- unsigned long val;
- int ret;
-
- ret = kstrtoul(buf, 16, &val);
- if (ret) {
- rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
- return false;
- }
-
- if (val > r->default_ctrl) {
- rdt_last_cmd_puts("Mask out of range\n");
- return false;
- }
-
- *data = val;
- return true;
-}
-
-/*
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
@@ -212,7 +146,7 @@
return -EINVAL;
}
- if (!r->cbm_validate(data->buf, &cbm_val, r))
+ if (!cbm_validate(data->buf, &cbm_val, r))
return -EINVAL;
if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
@@ -495,14 +429,16 @@
return ret;
}
-void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
- struct rdtgroup *rdtgrp, int evtid, int first)
+void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
+ struct rdt_domain *d, struct rdtgroup *rdtgrp,
+ int evtid, int first)
{
/*
* setup the parameters to send to the IPI to read the data.
*/
rr->rgrp = rdtgrp;
rr->evtid = evtid;
+ rr->r = r;
rr->d = d;
rr->val = 0;
rr->first = first;
@@ -539,7 +475,7 @@
goto out;
}
- mon_event_read(&rr, d, rdtgrp, evtid, false);
+ mon_event_read(&rr, r, d, rdtgrp, evtid, false);
if (rr.val & RMID_VAL_ERROR)
seq_puts(m, "Error\n");
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 499cb2e..f65d3c0 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -31,16 +31,22 @@
#define CQM_LIMBOCHECK_INTERVAL 1000
-#define MBM_CNTR_WIDTH 24
-#define MBM_CNTR_WIDTH_AMD 44
+#define MBM_CNTR_WIDTH_BASE 24
#define MBM_OVERFLOW_INTERVAL 1000
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
#define MBA_MAX_MBPS U32_MAX
#define MAX_MBA_BW_AMD 0x800
+#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
#define RMID_VAL_UNAVAIL BIT_ULL(62)
+/*
+ * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
+ * data to be returned. The counter width is discovered from the hardware
+ * as an offset from MBM_CNTR_WIDTH_BASE.
+ */
+#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
struct rdt_fs_context {
@@ -88,6 +94,7 @@
struct rmid_read {
struct rdtgroup *rgrp;
+ struct rdt_resource *r;
struct rdt_domain *d;
int evtid;
bool first;
@@ -351,6 +358,10 @@
* in a cache bit mask
* @shareable_bits: Bitmask of shareable resource with other
* executing entities
+ * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid.
+ * @arch_has_empty_bitmaps: True if the '0' bitmap is valid.
+ * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache
+ * level has CPU scope.
*/
struct rdt_cache {
unsigned int cbm_len;
@@ -358,27 +369,44 @@
unsigned int cbm_idx_mult;
unsigned int cbm_idx_offset;
unsigned int shareable_bits;
+ bool arch_has_sparse_bitmaps;
+ bool arch_has_empty_bitmaps;
+ bool arch_has_per_cpu_cfg;
+};
+
+/**
+ * enum membw_throttle_mode - System's memory bandwidth throttling mode
+ * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system
+ * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core
+ * always using smallest bandwidth percentage
+ * assigned to threads, aka "max throttling"
+ * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread
+ */
+enum membw_throttle_mode {
+ THREAD_THROTTLE_UNDEFINED = 0,
+ THREAD_THROTTLE_MAX,
+ THREAD_THROTTLE_PER_THREAD,
};
/**
* struct rdt_membw - Memory bandwidth allocation related data
- * @max_delay: Max throttle delay. Delay is the hardware
- * representation for memory bandwidth.
* @min_bw: Minimum memory bandwidth percentage user can request
* @bw_gran: Granularity at which the memory bandwidth is allocated
* @delay_linear: True if memory B/W delay is in linear scale
- * @mbm_width: memory B/W monitor counter width
+ * @arch_needs_linear: True if we can't configure non-linear resources
+ * @throttle_mode: Bandwidth throttling mode when threads request
+ * different memory bandwidths
* @mba_sc: True if MBA software controller(mba_sc) is enabled
* @mb_map: Mapping of memory B/W percentage to memory B/W delay
*/
struct rdt_membw {
- u32 max_delay;
- u32 min_bw;
- u32 bw_gran;
- u32 delay_linear;
- u32 mbm_width;
- bool mba_sc;
- u32 *mb_map;
+ u32 min_bw;
+ u32 bw_gran;
+ u32 delay_linear;
+ bool arch_needs_linear;
+ enum membw_throttle_mode throttle_mode;
+ bool mba_sc;
+ u32 *mb_map;
};
static inline bool is_llc_occupancy_enabled(void)
@@ -430,7 +458,6 @@
* @cache: Cache allocation related data
* @format_str: Per resource format string to show domain value
* @parse_ctrlval: Per resource function pointer to parse control values
- * @cbm_validate Cache bitmask validate function
* @evt_list: List of monitoring events
* @num_rmid: Number of RMIDs available
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
@@ -457,19 +484,17 @@
int (*parse_ctrlval)(struct rdt_parse_data *data,
struct rdt_resource *r,
struct rdt_domain *d);
- bool (*cbm_validate)(char *buf, u32 *data, struct rdt_resource *r);
struct list_head evt_list;
int num_rmid;
unsigned int mon_scale;
+ unsigned int mbm_width;
unsigned long fflags;
};
int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
struct rdt_domain *d);
-int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r,
- struct rdt_domain *d);
-int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r,
- struct rdt_domain *d);
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
extern struct mutex rdtgroup_mutex;
@@ -588,8 +613,9 @@
unsigned int dom_id);
void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
struct rdt_domain *d);
-void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
- struct rdtgroup *rdtgrp, int evtid, int first);
+void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
+ struct rdt_domain *d, struct rdtgroup *rdtgrp,
+ int evtid, int first);
void mbm_setup_overflow_handler(struct rdt_domain *dom,
unsigned long delay_ms);
void mbm_handle_overflow(struct work_struct *work);
@@ -600,8 +626,7 @@
void cqm_handle_limbo(struct work_struct *work);
bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free);
-bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r);
-bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
+void __init thread_throttle_mode_init(void);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 008bcb1..576f16a 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -214,11 +214,10 @@
list_add_tail(&entry->list, &rmid_free_lru);
}
-static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
- u64 shift, chunks;
+ u64 shift = 64 - width, chunks;
- shift = 64 - rdt_resources_all[RDT_RESOURCE_MBA].membw.mbm_width;
chunks = (cur_msr << shift) - (prev_msr << shift);
return chunks >>= shift;
}
@@ -256,7 +255,7 @@
return 0;
}
- chunks = mbm_overflow_count(m->prev_msr, tval);
+ chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width);
m->chunks += chunks;
m->prev_msr = tval;
@@ -278,7 +277,7 @@
if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
return;
- chunks = mbm_overflow_count(m->prev_bw_msr, tval);
+ chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
cur_bw = (chunks * r->mon_scale) >> 20;
if (m->delta_comp)
@@ -437,11 +436,12 @@
}
}
-static void mbm_update(struct rdt_domain *d, int rmid)
+static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
{
struct rmid_read rr;
rr.first = false;
+ rr.r = r;
rr.d = d;
/*
@@ -480,19 +480,13 @@
mutex_lock(&rdtgroup_mutex);
r = &rdt_resources_all[RDT_RESOURCE_L3];
- d = get_domain_from_cpu(cpu, r);
-
- if (!d) {
- pr_warn_once("Failure to get domain for limbo worker\n");
- goto out_unlock;
- }
+ d = container_of(work, struct rdt_domain, cqm_limbo.work);
__check_limbo(d, false);
if (has_busy_rmid(r, d))
schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
-out_unlock:
mutex_unlock(&rdtgroup_mutex);
}
@@ -513,6 +507,7 @@
struct rdtgroup *prgrp, *crgrp;
int cpu = smp_processor_id();
struct list_head *head;
+ struct rdt_resource *r;
struct rdt_domain *d;
mutex_lock(&rdtgroup_mutex);
@@ -520,16 +515,15 @@
if (!static_branch_likely(&rdt_mon_enable_key))
goto out_unlock;
- d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
- if (!d)
- goto out_unlock;
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+ d = container_of(work, struct rdt_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mbm_update(d, prgrp->mon.rmid);
+ mbm_update(r, d, prgrp->mon.rmid);
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list)
- mbm_update(d, crgrp->mon.rmid);
+ mbm_update(r, d, crgrp->mon.rmid);
if (is_mba_sc(NULL))
update_mba_bw(prgrp, d);
@@ -617,11 +611,18 @@
int rdt_get_mon_l3_config(struct rdt_resource *r)
{
+ unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
unsigned int cl_size = boot_cpu_data.x86_cache_size;
int ret;
r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+ r->mbm_width = MBM_CNTR_WIDTH_BASE;
+
+ if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
+ r->mbm_width += mbm_offset;
+ else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
+ pr_warn("Ignoring impossible MBM counter offset\n");
/*
* A reasonable upper limit on the max threshold is the number
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index d7623e1..0daf2f1 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -24,7 +24,7 @@
#include <asm/cacheflush.h>
#include <asm/intel-family.h>
-#include <asm/resctrl_sched.h>
+#include <asm/resctrl.h>
#include <asm/perf_event.h>
#include "../../events/perf_event.h" /* For X86_CONFIG() */
@@ -1326,9 +1326,9 @@
* pseudo-locked region will still be here on return.
*
* The mutex has to be released temporarily to avoid a potential
- * deadlock with the mm->mmap_sem semaphore which is obtained in
- * the device_create() and debugfs_create_dir() callpath below
- * as well as before the mmap() callback is called.
+ * deadlock with the mm->mmap_lock which is obtained in the
+ * device_create() and debugfs_create_dir() callpath below as well as
+ * before the mmap() callback is called.
*/
mutex_unlock(&rdtgroup_mutex);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 28f7862..5a59e33 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -29,7 +29,7 @@
#include <uapi/linux/magic.h>
-#include <asm/resctrl_sched.h>
+#include <asm/resctrl.h>
#include "internal.h"
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
@@ -591,6 +591,18 @@
return 0;
}
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (rdt_alloc_capable &&
+ (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
+}
+
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (rdt_mon_capable &&
+ (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
+}
+
/**
* rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
* @r: Resource group
@@ -606,8 +618,7 @@
rcu_read_lock();
for_each_process_thread(p, t) {
- if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
- (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) {
+ if (is_closid_match(t, r) || is_rmid_match(t, r)) {
ret = 1;
break;
}
@@ -705,8 +716,7 @@
rcu_read_lock();
for_each_process_thread(p, t) {
- if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
- (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
+ if (is_closid_match(t, r) || is_rmid_match(t, r))
seq_printf(s, "%d\n", t->pid);
}
rcu_read_unlock();
@@ -728,6 +738,92 @@
return ret;
}
+#ifdef CONFIG_PROC_CPU_RESCTRL
+
+/*
+ * A task can only be part of one resctrl control group and of one monitor
+ * group which is associated to that control group.
+ *
+ * 1) res:
+ * mon:
+ *
+ * resctrl is not available.
+ *
+ * 2) res:/
+ * mon:
+ *
+ * Task is part of the root resctrl control group, and it is not associated
+ * to any monitor group.
+ *
+ * 3) res:/
+ * mon:mon0
+ *
+ * Task is part of the root resctrl control group and monitor group mon0.
+ *
+ * 4) res:group0
+ * mon:
+ *
+ * Task is part of resctrl control group group0, and it is not associated
+ * to any monitor group.
+ *
+ * 5) res:group0
+ * mon:mon1
+ *
+ * Task is part of resctrl control group group0 and monitor group mon1.
+ */
+int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ struct rdtgroup *rdtg;
+ int ret = 0;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Return empty if resctrl has not been mounted. */
+ if (!static_branch_unlikely(&rdt_enable_key)) {
+ seq_puts(s, "res:\nmon:\n");
+ goto unlock;
+ }
+
+ list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
+ struct rdtgroup *crg;
+
+ /*
+ * Task information is only relevant for shareable
+ * and exclusive groups.
+ */
+ if (rdtg->mode != RDT_MODE_SHAREABLE &&
+ rdtg->mode != RDT_MODE_EXCLUSIVE)
+ continue;
+
+ if (rdtg->closid != tsk->closid)
+ continue;
+
+ seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
+ rdtg->kn->name);
+ seq_puts(s, "mon:");
+ list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
+ mon.crdtgrp_list) {
+ if (tsk->rmid != crg->mon.rmid)
+ continue;
+ seq_printf(s, "%s", crg->kn->name);
+ break;
+ }
+ seq_putc(s, '\n');
+ goto unlock;
+ }
+ /*
+ * The above search should succeed. Otherwise return
+ * with an error.
+ */
+ ret = -ENOENT;
+unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+#endif
+
static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -930,6 +1026,19 @@
return 0;
}
+static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
+ seq_puts(seq, "per-thread\n");
+ else
+ seq_puts(seq, "max\n");
+
+ return 0;
+}
+
static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
@@ -1426,6 +1535,17 @@
.seq_show = rdt_delay_linear_show,
.fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
},
+ /*
+ * Platform specific which (if any) capabilities are provided by
+ * thread_throttle_mode. Defer "fflags" initialization to platform
+ * discovery.
+ */
+ {
+ .name = "thread_throttle_mode",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_thread_throttle_mode_show,
+ },
{
.name = "max_threshold_occupancy",
.mode = 0644,
@@ -1496,7 +1616,7 @@
lockdep_assert_held(&rdtgroup_mutex);
for (rft = rfts; rft < rfts + len; rft++) {
- if ((fflags & rft->fflags) == rft->fflags) {
+ if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
ret = rdtgroup_add_file(kn, rft);
if (ret)
goto error;
@@ -1513,6 +1633,33 @@
return ret;
}
+static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
+{
+ struct rftype *rfts, *rft;
+ int len;
+
+ rfts = res_common_files;
+ len = ARRAY_SIZE(res_common_files);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ if (!strcmp(rft->name, name))
+ return rft;
+ }
+
+ return NULL;
+}
+
+void __init thread_throttle_mode_init(void)
+{
+ struct rftype *rft;
+
+ rft = rdtgroup_get_rftype_by_name("thread_throttle_mode");
+ if (!rft)
+ return;
+
+ rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
+}
+
/**
* rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
* @r: The resource group with which the file is associated.
@@ -1743,8 +1890,13 @@
r_l = &rdt_resources_all[level];
list_for_each_entry(d, &r_l->domains, list) {
- /* Pick one CPU from each domain instance to update MSR */
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ if (r_l->cache.arch_has_per_cpu_cfg)
+ /* Pick all the CPUs in the domain instance */
+ for_each_cpu(cpu, &d->cpu_mask)
+ cpumask_set_cpu(cpu, cpu_mask);
+ else
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
}
cpu = get_cpu();
/* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
@@ -2037,25 +2189,20 @@
nr__rdt_params
};
-static const struct fs_parameter_spec rdt_param_specs[] = {
+static const struct fs_parameter_spec rdt_fs_parameters[] = {
fsparam_flag("cdp", Opt_cdp),
fsparam_flag("cdpl2", Opt_cdpl2),
fsparam_flag("mba_MBps", Opt_mba_mbps),
{}
};
-static const struct fs_parameter_description rdt_fs_parameters = {
- .name = "rdt",
- .specs = rdt_param_specs,
-};
-
static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
struct fs_parse_result result;
int opt;
- opt = fs_parse(fc, &rdt_fs_parameters, param, &result);
+ opt = fs_parse(fc, rdt_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -2146,18 +2293,6 @@
return 0;
}
-static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
-{
- return (rdt_alloc_capable &&
- (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
-}
-
-static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
-{
- return (rdt_mon_capable &&
- (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
-}
-
/*
* Move tasks from one to the other group. If @from is NULL, then all tasks
* in the systems are moved unconditionally (used for teardown).
@@ -2288,7 +2423,7 @@
static struct file_system_type rdt_fs_type = {
.name = "resctrl",
.init_fs_context = rdt_init_fs_context,
- .parameters = &rdt_fs_parameters,
+ .parameters = rdt_fs_parameters,
.kill_sb = rdt_kill_sb,
};
@@ -2369,7 +2504,7 @@
goto out_destroy;
if (is_mbm_event(mevt->evtid))
- mon_event_read(&rr, d, prgrp, mevt->evtid, true);
+ mon_event_read(&rr, r, d, prgrp, mevt->evtid, true);
}
kernfs_activate(kn);
return 0;
@@ -2639,7 +2774,6 @@
}
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
const char *name, umode_t mode,
enum rdt_group_type rtype, struct rdtgroup **r)
{
@@ -2750,15 +2884,12 @@
* to monitor a subset of tasks and cpus in its parent ctrl_mon group.
*/
static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
- const char *name,
- umode_t mode)
+ const char *name, umode_t mode)
{
struct rdtgroup *rdtgrp, *prgrp;
int ret;
- ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
- &rdtgrp);
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
if (ret)
return ret;
@@ -2780,7 +2911,6 @@
* to allocate and monitor resources.
*/
static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
const char *name, umode_t mode)
{
struct rdtgroup *rdtgrp;
@@ -2788,8 +2918,7 @@
u32 closid;
int ret;
- ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
- &rdtgrp);
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
if (ret)
return ret;
@@ -2863,14 +2992,14 @@
* subdirectory
*/
if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
- return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
+ return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
/*
* If RDT monitoring is supported and the parent directory is a valid
* "mon_groups" directory, add a monitoring subdirectory.
*/
if (rdt_mon_capable && is_mon_groups(parent_kn, name))
- return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
+ return rdtgroup_mkdir_mon(parent_kn, name, mode);
return -EPERM;
}
@@ -3089,14 +3218,14 @@
* It may also be ok since that would enable debugging of RDT before
* resctrl is mounted.
* The reason why the debugfs directory is created here and not in
- * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and
+ * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
* during the debugfs directory creation also &sb->s_type->i_mutex_key
* (the lockdep class of inode->i_rwsem). Other filesystem
* interactions (eg. SyS_getdents) have the lock ordering:
- * &sb->s_type->i_mutex_key --> &mm->mmap_sem
- * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
+ * &sb->s_type->i_mutex_key --> &mm->mmap_lock
+ * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
* is taken, thus creating dependency:
- * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
+ * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
* issues considering the other two lock dependencies.
* By creating the debugfs directory here we avoid a dependency
* that may cause deadlock (even though file operations cannot
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index adf9b71..866c9a9 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -4,7 +4,7 @@
*/
#include <linux/cpu.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/apic.h>
#include <asm/processor.h>
@@ -35,12 +35,15 @@
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
{ X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
+ { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
{ X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
+ { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
+ { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 24da5ee..91288da 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -7,7 +7,7 @@
#include <linux/cpu.h>
#include <asm/apic.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/processor.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 032509a..e2ad30e 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -14,6 +14,9 @@
#include "cpu.h"
+#undef pr_fmt
+#define pr_fmt(fmt) "tsx: " fmt
+
enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
void tsx_disable(void)
@@ -99,7 +102,7 @@
tsx_ctrl_state = x86_get_tsx_auto_mode();
} else {
tsx_ctrl_state = TSX_CTRL_DISABLE;
- pr_err("tsx: invalid option, defaulting to off\n");
+ pr_err("invalid option, defaulting to off\n");
}
} else {
/* tsx= not provided */
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index 32b4dc9..ec8064c 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <asm/msr.h>
+#include <asm/mwait.h>
#define UMWAIT_C02_ENABLE 0
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 46d7326..924571f 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -25,12 +25,15 @@
#include <linux/init.h>
#include <linux/export.h>
#include <linux/clocksource.h>
+#include <linux/cpu.h>
+#include <linux/reboot.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
#include <asm/timer.h>
#include <asm/apic.h>
#include <asm/vmware.h>
+#include <asm/svm.h>
#undef pr_fmt
#define pr_fmt(fmt) "vmware: " fmt
@@ -47,6 +50,11 @@
#define VMWARE_CMD_GETVCPU_INFO 68
#define VMWARE_CMD_LEGACY_X2APIC 3
#define VMWARE_CMD_VCPU_RESERVED 31
+#define VMWARE_CMD_STEALCLOCK 91
+
+#define STEALCLOCK_NOT_AVAILABLE (-1)
+#define STEALCLOCK_DISABLED 0
+#define STEALCLOCK_ENABLED 1
#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
__asm__("inl (%%dx), %%eax" : \
@@ -86,6 +94,18 @@
} \
} while (0)
+struct vmware_steal_time {
+ union {
+ uint64_t clock; /* stolen time counter in units of vtsc */
+ struct {
+ /* only for little-endian */
+ uint32_t clock_low;
+ uint32_t clock_high;
+ };
+ };
+ uint64_t reserved[7];
+};
+
static unsigned long vmware_tsc_khz __ro_after_init;
static u8 vmware_hypercall_mode __ro_after_init;
@@ -103,15 +123,25 @@
#ifdef CONFIG_PARAVIRT
static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
-static int vmw_sched_clock __initdata = 1;
+static bool vmw_sched_clock __initdata = true;
+static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64);
+static bool has_steal_clock;
+static bool steal_acc __initdata = true; /* steal time accounting */
static __init int setup_vmw_sched_clock(char *s)
{
- vmw_sched_clock = 0;
+ vmw_sched_clock = false;
return 0;
}
early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+static __init int parse_no_stealacc(char *arg)
+{
+ steal_acc = false;
+ return 0;
+}
+early_param("no-steal-acc", parse_no_stealacc);
+
static unsigned long long notrace vmware_sched_clock(void)
{
unsigned long long ns;
@@ -122,7 +152,7 @@
return ns;
}
-static void __init vmware_sched_clock_setup(void)
+static void __init vmware_cyc2ns_setup(void)
{
struct cyc2ns_data *d = &vmware_cyc2ns;
unsigned long long tsc_now = rdtsc();
@@ -132,17 +162,201 @@
d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
d->cyc2ns_shift);
- pv_ops.time.sched_clock = vmware_sched_clock;
- pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset);
+ pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
}
+static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2)
+{
+ uint32_t result, info;
+
+ asm volatile (VMWARE_HYPERCALL :
+ "=a"(result),
+ "=c"(info) :
+ "a"(VMWARE_HYPERVISOR_MAGIC),
+ "b"(0),
+ "c"(VMWARE_CMD_STEALCLOCK),
+ "d"(0),
+ "S"(arg1),
+ "D"(arg2) :
+ "memory");
+ return result;
+}
+
+static bool stealclock_enable(phys_addr_t pa)
+{
+ return vmware_cmd_stealclock(upper_32_bits(pa),
+ lower_32_bits(pa)) == STEALCLOCK_ENABLED;
+}
+
+static int __stealclock_disable(void)
+{
+ return vmware_cmd_stealclock(0, 1);
+}
+
+static void stealclock_disable(void)
+{
+ __stealclock_disable();
+}
+
+static bool vmware_is_stealclock_available(void)
+{
+ return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE;
+}
+
+/**
+ * vmware_steal_clock() - read the per-cpu steal clock
+ * @cpu: the cpu number whose steal clock we want to read
+ *
+ * The function reads the steal clock if we are on a 64-bit system, otherwise
+ * reads it in parts, checking that the high part didn't change in the
+ * meantime.
+ *
+ * Return:
+ * The steal clock reading in ns.
+ */
+static uint64_t vmware_steal_clock(int cpu)
+{
+ struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
+ uint64_t clock;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ clock = READ_ONCE(steal->clock);
+ else {
+ uint32_t initial_high, low, high;
+
+ do {
+ initial_high = READ_ONCE(steal->clock_high);
+ /* Do not reorder initial_high and high readings */
+ virt_rmb();
+ low = READ_ONCE(steal->clock_low);
+ /* Keep low reading in between */
+ virt_rmb();
+ high = READ_ONCE(steal->clock_high);
+ } while (initial_high != high);
+
+ clock = ((uint64_t)high << 32) | low;
+ }
+
+ return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+}
+
+static void vmware_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ if (!stealclock_enable(slow_virt_to_phys(st))) {
+ has_steal_clock = false;
+ return;
+ }
+
+ pr_info("vmware-stealtime: cpu %d, pa %llx\n",
+ cpu, (unsigned long long) slow_virt_to_phys(st));
+}
+
+static void vmware_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ stealclock_disable();
+}
+
+static void vmware_guest_cpu_init(void)
+{
+ if (has_steal_clock)
+ vmware_register_steal_time();
+}
+
+static void vmware_pv_guest_cpu_reboot(void *unused)
+{
+ vmware_disable_steal_time();
+}
+
+static int vmware_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vmware_pv_reboot_nb = {
+ .notifier_call = vmware_pv_reboot_notify,
+};
+
+#ifdef CONFIG_SMP
+static void __init vmware_smp_prepare_boot_cpu(void)
+{
+ vmware_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+}
+
+static int vmware_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ vmware_guest_cpu_init();
+ local_irq_enable();
+ return 0;
+}
+
+static int vmware_cpu_down_prepare(unsigned int cpu)
+{
+ local_irq_disable();
+ vmware_disable_steal_time();
+ local_irq_enable();
+ return 0;
+}
+#endif
+
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ static_key_slow_inc(¶virt_steal_enabled);
+ if (steal_acc)
+ static_key_slow_inc(¶virt_steal_rq_enabled);
+ }
+
+ return 0;
+}
+arch_initcall(activate_jump_labels);
+
static void __init vmware_paravirt_ops_setup(void)
{
pv_info.name = "VMware hypervisor";
pv_ops.cpu.io_delay = paravirt_nop;
- if (vmware_tsc_khz && vmw_sched_clock)
- vmware_sched_clock_setup();
+ if (vmware_tsc_khz == 0)
+ return;
+
+ vmware_cyc2ns_setup();
+
+ if (vmw_sched_clock)
+ pv_ops.time.sched_clock = vmware_sched_clock;
+
+ if (vmware_is_stealclock_available()) {
+ has_steal_clock = true;
+ pv_ops.time.steal_clock = vmware_steal_clock;
+
+ /* We use reboot notifier only to disable steal clock */
+ register_reboot_notifier(&vmware_pv_reboot_nb);
+
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu =
+ vmware_smp_prepare_boot_cpu;
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "x86/vmware:online",
+ vmware_cpu_online,
+ vmware_cpu_down_prepare) < 0)
+ pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n");
+#else
+ vmware_guest_cpu_init();
+#endif
+ }
}
#else
#define vmware_paravirt_ops_setup() do {} while (0)
@@ -213,7 +427,7 @@
vmware_set_capabilities();
}
-static u8 vmware_select_hypercall(void)
+static u8 __init vmware_select_hypercall(void)
{
int eax, ebx, ecx, edx;
@@ -263,10 +477,49 @@
(eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0;
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
+ struct pt_regs *regs)
+{
+ /* Copy VMWARE specific Hypercall parameters to the GHCB */
+ ghcb_set_rip(ghcb, regs->ip);
+ ghcb_set_rbx(ghcb, regs->bx);
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_rsi(ghcb, regs->si);
+ ghcb_set_rdi(ghcb, regs->di);
+ ghcb_set_rbp(ghcb, regs->bp);
+}
+
+static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ if (!(ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb) &&
+ ghcb_rsi_is_valid(ghcb) &&
+ ghcb_rdi_is_valid(ghcb) &&
+ ghcb_rbp_is_valid(ghcb)))
+ return false;
+
+ regs->bx = ghcb->save.rbx;
+ regs->cx = ghcb->save.rcx;
+ regs->dx = ghcb->save.rdx;
+ regs->si = ghcb->save.rsi;
+ regs->di = ghcb->save.rdi;
+ regs->bp = ghcb->save.rbp;
+
+ return true;
+}
+#endif
+
const __initconst struct hypervisor_x86 x86_hyper_vmware = {
- .name = "VMware",
- .detect = vmware_platform,
- .type = X86_HYPER_VMWARE,
- .init.init_platform = vmware_platform_setup,
- .init.x2apic_available = vmware_legacy_x2apic_available,
+ .name = "VMware",
+ .detect = vmware_platform,
+ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish,
+#endif
};
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 8e6f2f4..05fa4ef 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include "cpu.h"
@@ -16,13 +17,6 @@
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -58,8 +52,6 @@
if (c->x86 >= 0x6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- cpu_detect_cache_sizes(c);
}
static void early_init_zhaoxin(struct cpuinfo_x86 *c)
@@ -89,31 +81,6 @@
}
-static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
-
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
static void init_zhaoxin(struct cpuinfo_x86 *c)
{
early_init_zhaoxin(c);
@@ -141,8 +108,7 @@
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
- if (cpu_has(c, X86_FEATURE_VMX))
- zhaoxin_detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
}
#ifdef CONFIG_X86_32