Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8367bd7..8baff50 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -51,10 +51,11 @@
#include <linux/err.h>
#include <linux/nmi.h>
#include <linux/tboot.h>
-#include <linux/stackprotector.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
#include <linux/numa.h>
+#include <linux/pgtable.h>
+#include <linux/overflow.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -63,7 +64,6 @@
#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/mwait.h>
@@ -80,6 +80,7 @@
#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/hw_irq.h>
+#include <asm/stackprotector.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -147,6 +148,8 @@
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
+static void init_freq_invariance(bool secondary);
+
/*
* Report back to the Boot Processor during boot time or to the caller processor
* during CPU online.
@@ -183,6 +186,8 @@
*/
set_cpu_sibling_map(raw_smp_processor_id());
+ init_freq_invariance(true);
+
/*
* Get our bogomips.
* Update loops_per_jiffy in cpu_data. Previous call to
@@ -222,10 +227,9 @@
load_cr3(swapper_pg_dir);
__flush_tlb_all();
#endif
- load_current_idt();
+ cpu_init_exception_handling();
cpu_init();
x86_cpuinit.early_percpu_clock_init();
- preempt_disable();
smp_callin();
enable_start_cpu0 = 0;
@@ -255,21 +259,10 @@
/* enable local interrupts */
local_irq_enable();
- /* to prevent fake stack check failure in clock setup */
- boot_init_stack_canary();
-
x86_cpuinit.setup_percpu_clockev();
wmb();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-
- /*
- * Prevent tail call to cpu_startup_entry() because the stack protector
- * guard has been changed a couple of function calls up, in
- * boot_init_stack_canary() and must not be checked before tail calling
- * another function.
- */
- prevent_tail_call_optimization();
}
/**
@@ -458,47 +451,12 @@
return false;
}
-/*
- * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
- *
- * These are Intel CPUs that enumerate an LLC that is shared by
- * multiple NUMA nodes. The LLC on these systems is shared for
- * off-package data access but private to the NUMA node (half
- * of the package) for on-package access.
- *
- * CPUID (the source of the information about the LLC) can only
- * enumerate the cache as being shared *or* unshared, but not
- * this particular configuration. The CPU in this case enumerates
- * the cache to be shared across the entire package (spanning both
- * NUMA nodes).
- */
-
-static const struct x86_cpu_id snc_cpu[] = {
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
- {}
-};
-
-static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
-
- /* Do not match if we do not have a valid APICID for cpu: */
- if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
- return false;
-
- /* Do not match if LLC id does not match: */
- if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
- return false;
-
- /*
- * Allow the SNC topology without warning. Return of false
- * means 'c' does not share the LLC of 'o'. This will be
- * reflected to userspace.
- */
- if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
- return false;
-
- return topology_sane(c, o, "llc");
+ if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_die_id == o->cpu_die_id)
+ return true;
+ return false;
}
/*
@@ -513,12 +471,50 @@
return false;
}
-static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+/*
+ * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
+ *
+ * Any Intel CPU that has multiple nodes per package and does not
+ * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
+ *
+ * When in SNC mode, these CPUs enumerate an LLC that is shared
+ * by multiple NUMA nodes. The LLC is shared for off-package data
+ * access but private to the NUMA node (half of the package) for
+ * on-package access. CPUID (the source of the information about
+ * the LLC) can only enumerate the cache as shared or unshared,
+ * but not this particular configuration.
+ */
+
+static const struct x86_cpu_id intel_cod_cpu[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */
+ X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */
+ X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */
+ {}
+};
+
+static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if ((c->phys_proc_id == o->phys_proc_id) &&
- (c->cpu_die_id == o->cpu_die_id))
- return true;
- return false;
+ const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+ bool intel_snc = id && id->driver_data;
+
+ /* Do not match if we do not have a valid APICID for cpu: */
+ if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
+ return false;
+
+ /* Do not match if LLC id does not match: */
+ if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
+ return false;
+
+ /*
+ * Allow the SNC topology without warning. Return of false
+ * means 'c' does not share the LLC of 'o'. This will be
+ * reflected to userspace.
+ */
+ if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
+ return false;
+
+ return topology_sane(c, o, "llc");
}
@@ -592,14 +588,23 @@
for_each_cpu(i, cpu_sibling_setup_mask) {
o = &cpu_data(i);
+ if (match_pkg(c, o) && !topology_same_node(c, o))
+ x86_has_numa_in_package = true;
+
if ((i == cpu) || (has_smt && match_smt(c, o)))
link_mask(topology_sibling_cpumask, cpu, i);
if ((i == cpu) || (has_mp && match_llc(c, o)))
link_mask(cpu_llc_shared_mask, cpu, i);
+ if ((i == cpu) || (has_mp && match_die(c, o)))
+ link_mask(topology_die_cpumask, cpu, i);
}
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ if (threads > __max_smt_threads)
+ __max_smt_threads = threads;
+
/*
* This needs a separate iteration over the cpus because we rely on all
* topology_sibling_cpumask links to be set-up.
@@ -613,8 +618,7 @@
/*
* Does this new cpu bringup a new core?
*/
- if (cpumask_weight(
- topology_sibling_cpumask(cpu)) == 1) {
+ if (threads == 1) {
/*
* for each core in package, increment
* the booted_cores for this new cpu
@@ -631,16 +635,7 @@
} else if (i != cpu && !c->booted_cores)
c->booted_cores = cpu_data(i).booted_cores;
}
- if (match_pkg(c, o) && !topology_same_node(c, o))
- x86_has_numa_in_package = true;
-
- if ((i == cpu) || (has_mp && match_die(c, o)))
- link_mask(topology_die_cpumask, cpu, i);
}
-
- threads = cpumask_weight(topology_sibling_cpumask(cpu));
- if (threads > __max_smt_threads)
- __max_smt_threads = threads;
}
/* maps the cpu to the sched domain representing multi-core */
@@ -1007,6 +1002,7 @@
alternatives_enable_smp();
per_cpu(current_task, cpu) = idle;
+ cpu_init_stack_canary(cpu, idle);
/* Initialize the interrupt stack(s) */
ret = irq_init_percpu_irqstack(cpu);
@@ -1345,7 +1341,7 @@
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
-
+ init_freq_invariance(false);
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1380,12 +1376,12 @@
speculative_store_bypass_ht_init();
}
-void arch_enable_nonboot_cpus_begin(void)
+void arch_thaw_secondary_cpus_begin(void)
{
set_mtrr_aps_delayed_init();
}
-void arch_enable_nonboot_cpus_end(void)
+void arch_thaw_secondary_cpus_end(void)
{
mtrr_aps_init();
}
@@ -1442,7 +1438,7 @@
/*
* cpu_possible_mask should be static, it cannot change as cpu's
* are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
+ * are allocated by some modules at init time, and don't expect to
* do this dynamically on cpu arrival/departure.
* cpu_present_mask on the other hand can change dynamically.
* In case when cpu_hotplug is not compiled, then we resort to current
@@ -1660,13 +1656,17 @@
local_irq_disable();
}
-static bool wakeup_cpu0(void)
+/**
+ * cond_wakeup_cpu0 - Wake up CPU0 if needed.
+ *
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+void cond_wakeup_cpu0(void)
{
if (smp_processor_id() == 0 && enable_start_cpu0)
- return true;
-
- return false;
+ start_cpu0();
}
+EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
/*
* We need to flush the caches before going to sleep, lest we have
@@ -1735,11 +1735,8 @@
__monitor(mwait_ptr, 0, 0);
mb();
__mwait(eax, 0);
- /*
- * If NMI wants to wake up CPU0, start CPU0.
- */
- if (wakeup_cpu0())
- start_cpu0();
+
+ cond_wakeup_cpu0();
}
}
@@ -1750,11 +1747,8 @@
while (1) {
native_halt();
- /*
- * If NMI wants to wake up CPU0, start CPU0.
- */
- if (wakeup_cpu0())
- start_cpu0();
+
+ cond_wakeup_cpu0();
}
}
@@ -1786,3 +1780,339 @@
}
#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static DEFINE_PER_CPU(u64, arch_prev_aperf);
+static DEFINE_PER_CPU(u64, arch_prev_mperf);
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
+{
+ arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+ arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
+
+static bool turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ int err;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
+
+ return true;
+}
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define X86_MATCH(model) \
+ X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
+ INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
+ X86_MATCH(XEON_PHI_KNL),
+ X86_MATCH(XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
+ X86_MATCH(SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
+ X86_MATCH(ATOM_GOLDMONT),
+ X86_MATCH(ATOM_GOLDMONT_D),
+ X86_MATCH(ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+ int num_delta_fratio)
+{
+ int fratio, delta_fratio, found;
+ int err, i;
+ u64 msr;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ fratio = (msr >> 8) & 0xFF;
+ i = 16;
+ found = 0;
+ do {
+ if (found >= num_delta_fratio) {
+ *turbo_freq = fratio;
+ return true;
+ }
+
+ delta_fratio = (msr >> (i + 5)) & 0x7;
+
+ if (delta_fratio) {
+ found += 1;
+ fratio -= delta_fratio;
+ }
+
+ i += 8;
+ } while (i < 64);
+
+ return true;
+}
+
+static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+ u64 ratios, counts;
+ u32 group_size;
+ int err, i;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ if (err)
+ return false;
+
+ for (i = 0; i < 64; i += 8) {
+ group_size = (counts >> i) & 0xFF;
+ if (group_size >= size) {
+ *turbo_freq = (ratios >> i) & 0xFF;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ u64 msr;
+ int err;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
+
+ return true;
+}
+
+static bool intel_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
+
+ if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+ goto out;
+
+ if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ return false;
+
+out:
+ /*
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSR's with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
+ */
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
+
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
+ arch_set_max_freq_ratio(turbo_disabled());
+
+ return true;
+}
+
+static void init_counter_refs(void)
+{
+ u64 aperf, mperf;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+
+ this_cpu_write(arch_prev_aperf, aperf);
+ this_cpu_write(arch_prev_mperf, mperf);
+}
+
+static void init_freq_invariance(bool secondary)
+{
+ bool ret = false;
+
+ if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
+ return;
+
+ if (secondary) {
+ if (static_branch_likely(&arch_scale_freq_key)) {
+ init_counter_refs();
+ }
+ return;
+ }
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ ret = intel_set_max_freq_ratio();
+
+ if (ret) {
+ init_counter_refs();
+ static_branch_enable(&arch_scale_freq_key);
+ } else {
+ pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
+ }
+}
+
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ static_branch_disable(&arch_scale_freq_key);
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+void arch_scale_freq_tick(void)
+{
+ u64 freq_scale = SCHED_CAPACITY_SCALE;
+ u64 aperf, mperf;
+ u64 acnt, mcnt;
+
+ if (!arch_scale_freq_invariant())
+ return;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+
+ acnt = aperf - this_cpu_read(arch_prev_aperf);
+ mcnt = mperf - this_cpu_read(arch_prev_mperf);
+
+ this_cpu_write(arch_prev_aperf, aperf);
+ this_cpu_write(arch_prev_mperf, mperf);
+
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
+
+ if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ goto error;
+
+ freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void init_freq_invariance(bool secondary)
+{
+}
+#endif /* CONFIG_X86_64 */